xinference 0.14.2__py3-none-any.whl → 0.14.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (137)
  1. xinference/_version.py +3 -3
  2. xinference/core/chat_interface.py +1 -1
  3. xinference/core/image_interface.py +9 -0
  4. xinference/core/model.py +4 -1
  5. xinference/core/worker.py +48 -41
  6. xinference/model/audio/chattts.py +24 -9
  7. xinference/model/audio/core.py +8 -2
  8. xinference/model/audio/fish_speech.py +228 -0
  9. xinference/model/audio/model_spec.json +8 -0
  10. xinference/model/embedding/core.py +23 -1
  11. xinference/model/image/model_spec.json +2 -1
  12. xinference/model/image/model_spec_modelscope.json +2 -1
  13. xinference/model/image/stable_diffusion/core.py +49 -1
  14. xinference/model/llm/__init__.py +6 -0
  15. xinference/model/llm/llm_family.json +54 -9
  16. xinference/model/llm/llm_family.py +2 -0
  17. xinference/model/llm/llm_family_modelscope.json +56 -10
  18. xinference/model/llm/lmdeploy/__init__.py +0 -0
  19. xinference/model/llm/lmdeploy/core.py +557 -0
  20. xinference/model/llm/transformers/cogvlm2.py +4 -45
  21. xinference/model/llm/transformers/cogvlm2_video.py +524 -0
  22. xinference/model/llm/transformers/core.py +1 -0
  23. xinference/model/llm/transformers/glm4v.py +2 -23
  24. xinference/model/llm/transformers/intern_vl.py +94 -11
  25. xinference/model/llm/transformers/minicpmv25.py +2 -23
  26. xinference/model/llm/transformers/minicpmv26.py +2 -22
  27. xinference/model/llm/transformers/yi_vl.py +2 -24
  28. xinference/model/llm/utils.py +10 -1
  29. xinference/model/llm/vllm/core.py +1 -1
  30. xinference/thirdparty/fish_speech/__init__.py +0 -0
  31. xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
  32. xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
  33. xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
  34. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  35. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  36. xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
  37. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  38. xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
  39. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  40. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
  41. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
  42. xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
  43. xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
  44. xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
  45. xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
  46. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  47. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
  48. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
  49. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
  50. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
  51. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
  52. xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
  53. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  54. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
  55. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
  56. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
  57. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
  58. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
  59. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
  60. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  61. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
  62. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
  63. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
  64. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
  65. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
  66. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
  67. xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
  68. xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
  69. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
  70. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
  71. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
  72. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
  73. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
  74. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
  75. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
  76. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
  77. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
  78. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
  79. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
  80. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
  81. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
  82. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
  83. xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
  84. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
  85. xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
  86. xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
  87. xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
  88. xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
  89. xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
  90. xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
  91. xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
  92. xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
  93. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
  94. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  95. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
  96. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
  97. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  98. xinference/thirdparty/fish_speech/tools/api.py +495 -0
  99. xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
  100. xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
  101. xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
  102. xinference/thirdparty/fish_speech/tools/file.py +108 -0
  103. xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
  104. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  105. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
  106. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
  107. xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
  108. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
  109. xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
  110. xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
  111. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
  112. xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
  113. xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
  114. xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
  115. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
  116. xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
  117. xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
  118. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  119. xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
  120. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
  121. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
  122. xinference/thirdparty/fish_speech/tools/webui.py +619 -0
  123. xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
  124. xinference/web/ui/build/asset-manifest.json +3 -3
  125. xinference/web/ui/build/index.html +1 -1
  126. xinference/web/ui/build/static/js/{main.ffc26121.js → main.661c7b0a.js} +3 -3
  127. xinference/web/ui/build/static/js/main.661c7b0a.js.map +1 -0
  128. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
  129. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/METADATA +18 -6
  130. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/RECORD +135 -37
  131. xinference/web/ui/build/static/js/main.ffc26121.js.map +0 -1
  132. xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
  133. /xinference/web/ui/build/static/js/{main.ffc26121.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
  134. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/LICENSE +0 -0
  135. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/WHEEL +0 -0
  136. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/entry_points.txt +0 -0
  137. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-08-16T18:10:38+0800",
+ "date": "2024-08-23T18:14:53+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "e4d225774dc7a9a9260396bf833e03a1df8e8a92",
- "version": "0.14.2"
+ "full-revisionid": "b5002242e04634bca7e75cac9df0cdc6c0bf407a",
+ "version": "0.14.3"
 }
 ''' # END VERSION_JSON
 
xinference/core/chat_interface.py CHANGED
@@ -340,7 +340,7 @@ class GradioInterface:
             state = gr.State([])
             with gr.Row():
                 chatbot = gr.Chatbot(
-                    elem_id="chatbot", label=self.model_name, height=550, scale=7
+                    elem_id="chatbot", label=self.model_name, height=700, scale=7
                 )
                 with gr.Column(scale=3):
                     imagebox = gr.Image(type="filepath")
xinference/core/image_interface.py CHANGED
@@ -163,6 +163,7 @@ class ImageInterface:
         size_width: int,
         size_height: int,
         num_inference_steps: int,
+        padding_image_to_multiple: int,
     ) -> PIL.Image.Image:
         from ..client import RESTfulClient
 
@@ -178,6 +179,7 @@
         num_inference_steps = (
             None if num_inference_steps == -1 else num_inference_steps  # type: ignore
         )
+        padding_image_to_multiple = None if padding_image_to_multiple == -1 else padding_image_to_multiple  # type: ignore
 
         bio = io.BytesIO()
         image.save(bio, format="png")
@@ -190,6 +192,7 @@
             size=size,
             response_format="b64_json",
             num_inference_steps=num_inference_steps,
+            padding_image_to_multiple=padding_image_to_multiple,
         )
 
         images = []
@@ -222,9 +225,14 @@
             n = gr.Number(label="Number of image", value=1)
             size_width = gr.Number(label="Width", value=-1)
             size_height = gr.Number(label="Height", value=-1)
+
+        with gr.Row():
             num_inference_steps = gr.Number(
                 label="Inference Step Number", value=-1
             )
+            padding_image_to_multiple = gr.Number(
+                label="Padding image to multiple", value=-1
+            )
 
         with gr.Row():
             with gr.Column(scale=1):
@@ -242,6 +250,7 @@
                 size_width,
                 size_height,
                 num_inference_steps,
+                padding_image_to_multiple,
             ],
             outputs=output_gallery,
         )
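
The new padding_image_to_multiple value is forwarded unchanged from the Gradio form to the image-to-image request, so it can also be supplied directly through the Python client. A minimal sketch, assuming a running endpoint, an already-launched Stable Diffusion model uid, and an image_to_image handle method (none of which are shown in this diff):

    # Hedged sketch: pass padding_image_to_multiple through the REST client.
    # Endpoint, model uid and the image_to_image method name are assumptions.
    import io

    from PIL import Image
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed endpoint
    model = client.get_model("my-sd-model")  # hypothetical model uid

    bio = io.BytesIO()
    Image.open("input.png").save(bio, format="png")

    result = model.image_to_image(  # assumed handle method
        image=bio.getvalue(),
        prompt="a watercolor landscape",
        response_format="b64_json",
        padding_image_to_multiple=8,  # pad width/height up to a multiple of 8
    )

Padding to a multiple such as 8 keeps the input dimensions compatible with the VAE downsampling factor of Stable Diffusion style models, which is presumably why the option was surfaced in the interface.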
xinference/core/model.py CHANGED
@@ -177,6 +177,7 @@ class ModelActor(xo.StatelessActor):
         request_limits: Optional[int] = None,
     ):
         super().__init__()
+        from ..model.llm.lmdeploy.core import LMDeployModel
         from ..model.llm.sglang.core import SGLANGModel
         from ..model.llm.transformers.core import PytorchModel
         from ..model.llm.vllm.core import VLLMModel
@@ -192,7 +193,9 @@
         self._current_generator = lambda: None
         self._lock = (
             None
-            if isinstance(self._model, (PytorchModel, VLLMModel, SGLANGModel))
+            if isinstance(
+                self._model, (PytorchModel, VLLMModel, SGLANGModel, LMDeployModel)
+            )
             else asyncio.locks.Lock()
         )
         self._worker_ref = None
xinference/core/worker.py CHANGED
@@ -39,9 +39,11 @@ from ..core.status_guard import LaunchStatus
 from ..device_utils import get_available_device_env_name, gpu_count
 from ..model.core import ModelDescription, create_model_instance
 from ..types import PeftModelConfig
+from .cache_tracker import CacheTrackerActor
 from .event import Event, EventCollectorActor, EventType
 from .metrics import launch_metrics_export_server, record_metrics
 from .resource import gather_node_info
+from .status_guard import StatusGuardActor
 from .utils import log_async, log_sync, parse_replica_model_uid, purge_dir
 
 logger = getLogger(__name__)
@@ -71,6 +73,15 @@ class WorkerActor(xo.StatelessActor):
         self._supervisor_ref: Optional[xo.ActorRefType] = None
         self._main_pool = main_pool
         self._main_pool.recover_sub_pool = self.recover_sub_pool
+        self._status_guard_ref: xo.ActorRefType[  # type: ignore
+            "StatusGuardActor"
+        ] = None
+        self._event_collector_ref: xo.ActorRefType[  # type: ignore
+            EventCollectorActor
+        ] = None
+        self._cache_tracker_ref: xo.ActorRefType[  # type: ignore
+            CacheTrackerActor
+        ] = None
 
         # internal states.
         # temporary placeholder during model launch process:
@@ -308,56 +319,50 @@ class WorkerActor(xo.StatelessActor):
         Params:
             add_worker: By default will call supervisor.add_worker after first connect
         """
-        from .status_guard import StatusGuardActor
         from .supervisor import SupervisorActor
 
         if self._supervisor_ref is not None:
             return self._supervisor_ref
-        self._supervisor_ref: xo.ActorRefType["SupervisorActor"] = await xo.actor_ref(  # type: ignore
+        supervisor_ref = await xo.actor_ref(  # type: ignore
             address=self._supervisor_address, uid=SupervisorActor.uid()
         )
+        # Prevent concurrent operations leads to double initialization, check again.
+        if self._supervisor_ref is not None:
+            return self._supervisor_ref
+        self._supervisor_ref = supervisor_ref
         if add_worker and len(self._model_uid_to_model) == 0:
             # Newly started (or restarted), has no model, notify supervisor
             await self._supervisor_ref.add_worker(self.address)
             logger.info("Connected to supervisor as a fresh worker")
 
-        self._status_guard_ref: xo.ActorRefType[  # type: ignore
-            "StatusGuardActor"
-        ] = await xo.actor_ref(
-            address=self._supervisor_address, uid=StatusGuardActor.uid()
-        )
-
-        self._event_collector_ref: xo.ActorRefType[  # type: ignore
-            EventCollectorActor
-        ] = await xo.actor_ref(
-            address=self._supervisor_address, uid=EventCollectorActor.uid()
-        )
-        from .cache_tracker import CacheTrackerActor
-
-        self._cache_tracker_ref: xo.ActorRefType[  # type: ignore
-            "CacheTrackerActor"
-        ] = await xo.actor_ref(
-            address=self._supervisor_address, uid=CacheTrackerActor.uid()
-        )
-        # cache_tracker is on supervisor
-        from ..model.audio import get_audio_model_descriptions
-        from ..model.embedding import get_embedding_model_descriptions
-        from ..model.flexible import get_flexible_model_descriptions
-        from ..model.image import get_image_model_descriptions
-        from ..model.llm import get_llm_model_descriptions
-        from ..model.rerank import get_rerank_model_descriptions
-
-        # record model version
-        model_version_infos: Dict[str, List[Dict]] = {}  # type: ignore
-        model_version_infos.update(get_llm_model_descriptions())
-        model_version_infos.update(get_embedding_model_descriptions())
-        model_version_infos.update(get_rerank_model_descriptions())
-        model_version_infos.update(get_image_model_descriptions())
-        model_version_infos.update(get_audio_model_descriptions())
-        model_version_infos.update(get_flexible_model_descriptions())
-        await self._cache_tracker_ref.record_model_version(
-            model_version_infos, self.address
-        )
+        self._status_guard_ref = await xo.actor_ref(
+            address=self._supervisor_address, uid=StatusGuardActor.uid()
+        )
+        self._event_collector_ref = await xo.actor_ref(
+            address=self._supervisor_address, uid=EventCollectorActor.uid()
+        )
+        self._cache_tracker_ref = await xo.actor_ref(
+            address=self._supervisor_address, uid=CacheTrackerActor.uid()
+        )
+        # cache_tracker is on supervisor
+        from ..model.audio import get_audio_model_descriptions
+        from ..model.embedding import get_embedding_model_descriptions
+        from ..model.flexible import get_flexible_model_descriptions
+        from ..model.image import get_image_model_descriptions
+        from ..model.llm import get_llm_model_descriptions
+        from ..model.rerank import get_rerank_model_descriptions
+
+        # record model version
+        model_version_infos: Dict[str, List[Dict]] = {}  # type: ignore
+        model_version_infos.update(get_llm_model_descriptions())
+        model_version_infos.update(get_embedding_model_descriptions())
+        model_version_infos.update(get_rerank_model_descriptions())
+        model_version_infos.update(get_image_model_descriptions())
+        model_version_infos.update(get_audio_model_descriptions())
+        model_version_infos.update(get_flexible_model_descriptions())
+        await self._cache_tracker_ref.record_model_version(
+            model_version_infos, self.address
+        )
         return self._supervisor_ref
 
     @staticmethod
@@ -734,7 +739,7 @@
         elif model_type == "image":
             return ["text_to_image"]
         elif model_type == "audio":
-            return ["audio_to_text"]
+            return [model._model_spec.ability]
         elif model_type == "video":
             return ["text_to_video"]
         elif model_type == "flexible":
@@ -793,6 +798,7 @@
             logger.exception(e)
             raise
         try:
+            _ = await self.get_supervisor_ref()
             if self._event_collector_ref is not None:
                 await self._event_collector_ref.report_event(
                     origin_uid,
@@ -914,6 +920,7 @@
             raise ValueError(f"{model_uid} is launching")
         origin_uid, _, __ = parse_replica_model_uid(model_uid)
         try:
+            _ = await self.get_supervisor_ref()
             if self._event_collector_ref is not None:
                 await self._event_collector_ref.report_event(
                     origin_uid,
@@ -1081,7 +1088,7 @@
         paths.update([os.path.realpath(path) for path in paths])
 
         # get tensorizer path
-        from ..model.llm.pytorch.tensorizer_utils import get_tensorizer_dir
+        from ..model.llm.transformers.tensorizer_utils import get_tensorizer_dir
 
         tensorizer_path = get_tensorizer_dir(path)
         if os.path.isdir(tensorizer_path):
xinference/model/audio/chattts.py CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import base64
 import logging
 from io import BytesIO
 from typing import TYPE_CHECKING, Optional
@@ -61,16 +62,31 @@ class ChatTTSModel:
         import torchaudio
         import xxhash
 
-        seed = xxhash.xxh32_intdigest(voice)
+        rnd_spk_emb = None
 
-        torch.manual_seed(seed)
-        np.random.seed(seed)
-        torch.cuda.manual_seed(seed)
-        torch.backends.cudnn.deterministic = True
-        torch.backends.cudnn.benchmark = False
+        if len(voice) > 400:
+            try:
+                assert self._model is not None
+                b = base64.b64decode(voice)
+                bio = BytesIO(b)
+                tensor = torch.load(bio, map_location="cpu")
+                rnd_spk_emb = self._model._encode_spk_emb(tensor)
+                logger.info("Speech by input speaker")
+            except Exception as e:
+                logger.info("Fallback to random speaker due to %s", e)
 
-        assert self._model is not None
-        rnd_spk_emb = self._model.sample_random_speaker()
+        if rnd_spk_emb is None:
+            seed = xxhash.xxh32_intdigest(voice)
+
+            torch.manual_seed(seed)
+            np.random.seed(seed)
+            torch.cuda.manual_seed(seed)
+            torch.backends.cudnn.deterministic = True
+            torch.backends.cudnn.benchmark = False
+
+            assert self._model is not None
+            rnd_spk_emb = self._model.sample_random_speaker()
+            logger.info("Speech by voice %s", voice)
 
         default = 5
         infer_speed = int(default * speed)
@@ -100,7 +116,6 @@ class ChatTTSModel:
             if new_last_pos != last_pos:
                 out.seek(last_pos)
                 encoded_bytes = out.read()
-                print(len(encoded_bytes))
                 yield encoded_bytes
                 last_pos = new_last_pos
 
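With this change the voice argument does double duty: a value longer than 400 characters is treated as a base64-encoded, torch-saved speaker-embedding tensor and passed through _encode_spk_emb, while anything shorter still just seeds a random speaker as before. A minimal client-side sketch of producing such a value, assuming spk_emb_tensor is a speaker-embedding torch.Tensor you already have (obtaining it is outside this diff):

    # Hedged sketch: serialize a ChatTTS speaker-embedding tensor for the voice field.
    # spk_emb_tensor is an assumed, pre-existing torch.Tensor speaker embedding.
    import base64
    from io import BytesIO

    import torch

    buf = BytesIO()
    torch.save(spk_emb_tensor, buf)  # matches the torch.load(...) call on the server side
    voice = base64.b64encode(buf.getvalue()).decode()  # well over 400 chars, so decoded server-side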
xinference/model/audio/core.py CHANGED
@@ -21,6 +21,7 @@ from ..core import CacheableModelSpec, ModelDescription
 from ..utils import valid_model_revision
 from .chattts import ChatTTSModel
 from .cosyvoice import CosyVoiceModel
+from .fish_speech import FishSpeechModel
 from .funasr import FunASRModel
 from .whisper import WhisperModel
 
@@ -46,6 +47,7 @@ class AudioModelFamilyV1(CacheableModelSpec):
     model_id: str
     model_revision: str
     multilingual: bool
+    ability: str
     default_model_config: Optional[Dict[str, Any]]
     default_transcription_config: Optional[Dict[str, Any]]
 
@@ -156,13 +158,15 @@ def create_audio_model_instance(
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[
-    Union[WhisperModel, FunASRModel, ChatTTSModel, CosyVoiceModel],
+    Union[WhisperModel, FunASRModel, ChatTTSModel, CosyVoiceModel, FishSpeechModel],
     AudioModelDescription,
 ]:
     model_spec = match_audio(model_name, download_hub)
     if model_path is None:
         model_path = cache(model_spec)
-    model: Union[WhisperModel, FunASRModel, ChatTTSModel, CosyVoiceModel]
+    model: Union[
+        WhisperModel, FunASRModel, ChatTTSModel, CosyVoiceModel, FishSpeechModel
+    ]
     if model_spec.model_family == "whisper":
         model = WhisperModel(model_uid, model_path, model_spec, **kwargs)
     elif model_spec.model_family == "funasr":
@@ -171,6 +175,8 @@ def create_audio_model_instance(
         model = ChatTTSModel(model_uid, model_path, model_spec, **kwargs)
     elif model_spec.model_family == "CosyVoice":
         model = CosyVoiceModel(model_uid, model_path, model_spec, **kwargs)
+    elif model_spec.model_family == "FishAudio":
+        model = FishSpeechModel(model_uid, model_path, model_spec, **kwargs)
     else:
         raise Exception(f"Unsupported audio model family: {model_spec.model_family}")
     model_description = AudioModelDescription(
xinference/model/audio/fish_speech.py ADDED
@@ -0,0 +1,228 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import gc
+import logging
+import os.path
+import queue
+import sys
+from io import BytesIO
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+import torch
+
+from ...device_utils import get_available_device, is_device_available
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
+    import wave
+
+    buffer = BytesIO()
+
+    with wave.open(buffer, "wb") as wav_file:
+        wav_file.setnchannels(channels)
+        wav_file.setsampwidth(bit_depth // 8)
+        wav_file.setframerate(sample_rate)
+
+    wav_header_bytes = buffer.getvalue()
+    buffer.close()
+    return wav_header_bytes
+
+
+class FishSpeechModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV1",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._llama_queue = None
+        self._model = None
+        self._kwargs = kwargs
+
+    def load(self):
+        # There are too many imports from fish_speech.
+        sys.path.insert(
+            0, os.path.join(os.path.dirname(__file__), "../../thirdparty/fish_speech")
+        )
+
+        from tools.llama.generate import launch_thread_safe_queue
+        from tools.vqgan.inference import load_model as load_decoder_model
+
+        if self._device is None:
+            self._device = get_available_device()
+        else:
+            if not is_device_available(self._device):
+                raise ValueError(f"Device {self._device} is not available!")
+
+        logger.info("Loading Llama model...")
+        self._llama_queue = launch_thread_safe_queue(
+            checkpoint_path=self._model_path,
+            device=self._device,
+            precision=torch.bfloat16,
+            compile=False,
+        )
+        logger.info("Llama model loaded, loading VQ-GAN model...")
+
+        checkpoint_path = os.path.join(
+            self._model_path,
+            "firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+        )
+        self._model = load_decoder_model(
+            config_name="firefly_gan_vq",
+            checkpoint_path=checkpoint_path,
+            device=self._device,
+        )
+
+    @torch.inference_mode()
+    def _inference(
+        self,
+        text,
+        enable_reference_audio,
+        reference_audio,
+        reference_text,
+        max_new_tokens,
+        chunk_length,
+        top_p,
+        repetition_penalty,
+        temperature,
+        streaming=False,
+    ):
+        from fish_speech.utils import autocast_exclude_mps
+        from tools.api import decode_vq_tokens, encode_reference
+        from tools.llama.generate import (
+            GenerateRequest,
+            GenerateResponse,
+            WrappedGenerateResponse,
+        )
+
+        # Parse reference audio aka prompt
+        prompt_tokens = encode_reference(
+            decoder_model=self._model,
+            reference_audio=reference_audio,
+            enable_reference_audio=enable_reference_audio,
+        )
+
+        # LLAMA Inference
+        request = dict(
+            device=self._model.device,
+            max_new_tokens=max_new_tokens,
+            text=text,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            compile=False,
+            iterative_prompt=chunk_length > 0,
+            chunk_length=chunk_length,
+            max_length=2048,
+            prompt_tokens=prompt_tokens if enable_reference_audio else None,
+            prompt_text=reference_text if enable_reference_audio else None,
+        )
+
+        response_queue = queue.Queue()
+        self._llama_queue.put(
+            GenerateRequest(
+                request=request,
+                response_queue=response_queue,
+            )
+        )
+
+        if streaming:
+            yield wav_chunk_header(), None, None
+
+        segments = []
+
+        while True:
+            result: WrappedGenerateResponse = response_queue.get()
+            if result.status == "error":
+                raise Exception(str(result.response))
+
+            result: GenerateResponse = result.response
+            if result.action == "next":
+                break
+
+            with autocast_exclude_mps(
+                device_type=self._model.device.type, dtype=torch.bfloat16
+            ):
+                fake_audios = decode_vq_tokens(
+                    decoder_model=self._model,
+                    codes=result.codes,
+                )
+
+            fake_audios = fake_audios.float().cpu().numpy()
+            segments.append(fake_audios)
+
+            if streaming:
+                yield (fake_audios * 32768).astype(np.int16).tobytes(), None, None
+
+        if len(segments) == 0:
+            raise Exception("No audio generated, please check the input text.")
+
+        # No matter streaming or not, we need to return the final audio
+        audio = np.concatenate(segments, axis=0)
+        yield None, (self._model.spec_transform.sample_rate, audio), None
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        logger.warning("Fish speech does not support setting voice: %s.", voice)
+        if speed != 1.0:
+            logger.warning("Fish speech does not support setting speed: %s.", speed)
+        if stream is True:
+            logger.warning("stream mode is not implemented.")
+        import torchaudio
+
+        result = list(
+            self._inference(
+                text=input,
+                enable_reference_audio=False,
+                reference_audio=None,
+                reference_text="",
+                max_new_tokens=0,
+                chunk_length=100,
+                top_p=0.7,
+                repetition_penalty=1.2,
+                temperature=0.7,
+            )
+        )
+        sample_rate, audio = result[0][1]
+        audio = np.array([audio])
+
+        # Save the generated audio
+        with BytesIO() as out:
+            torchaudio.save(
+                out, torch.from_numpy(audio), sample_rate, format=response_format
+            )
+            return out.getvalue()
xinference/model/audio/model_spec.json CHANGED
@@ -146,5 +146,13 @@
     "model_revision": "fb5f676733139f35670bed9b59a77d476b1aa898",
     "ability": "text-to-audio",
     "multilingual": true
+  },
+  {
+    "model_name": "FishSpeech-1.2-SFT",
+    "model_family": "FishAudio",
+    "model_id": "fishaudio/fish-speech-1.2-sft",
+    "model_revision": "180288e21ec5c50cfc564023a22f789e4b88a0e0",
+    "ability": "text-to-audio",
+    "multilingual": true
   }
 ]
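
Together with the FishSpeechModel added above, this spec entry makes FishSpeech-1.2-SFT launchable as an audio model. A hedged sketch of driving it from the Python client; the endpoint address is an assumption, the model name comes from the spec entry, and launch_model/get_model/speech follow the existing audio-model client surface:

    # Hedged sketch: launch the new FishSpeech audio model and synthesize speech.
    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")  # assumed endpoint
    uid = client.launch_model(model_name="FishSpeech-1.2-SFT", model_type="audio")
    model = client.get_model(uid)

    audio_bytes = model.speech("Hello from Fish Speech.")  # encoded audio, mp3 by default
    with open("out.mp3", "wb") as f:
        f.write(audio_bytes)

Note that FishSpeechModel.speech currently only logs warnings for voice, speed and stream, so in this release only the input text and response_format have any effect.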
xinference/model/embedding/core.py CHANGED
@@ -154,10 +154,32 @@ class EmbeddingModel:
             "gte" in self._model_spec.model_name.lower()
             and "qwen2" in self._model_spec.model_name.lower()
         ):
+            import torch
+
+            torch_dtype_str = self._kwargs.get("torch_dtype")
+            if torch_dtype_str is not None:
+                try:
+                    torch_dtype = getattr(torch, torch_dtype_str)
+                    if torch_dtype not in [
+                        torch.float16,
+                        torch.float32,
+                        torch.bfloat16,
+                    ]:
+                        logger.warning(
+                            f"Load embedding model with unsupported torch dtype : {torch_dtype_str}. Using default torch dtype: fp32."
+                        )
+                        torch_dtype = torch.float32
+                except AttributeError:
+                    logger.warning(
+                        f"Load embedding model with unknown torch dtype '{torch_dtype_str}'. Using default torch dtype: fp32."
+                    )
+                    torch_dtype = torch.float32
+            else:
+                torch_dtype = "auto"
             self._model = XSentenceTransformer(
                 self._model_path,
                 device=self._device,
-                model_kwargs={"device_map": "auto"},
+                model_kwargs={"device_map": "auto", "torch_dtype": torch_dtype},
             )
         else:
             self._model = SentenceTransformer(self._model_path, device=self._device)
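
Since the dtype string is read from the model kwargs, it can be supplied when the embedding model is launched. A minimal sketch, assuming extra launch kwargs are forwarded into the model's kwargs and using an assumed endpoint and model name; torch_dtype must name one of float16, float32 or bfloat16, otherwise the code above falls back to fp32:

    # Hedged sketch: pass torch_dtype when launching a gte/qwen2 embedding model.
    # Endpoint and model name are assumptions; the kwarg name comes from the diff above.
    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")  # assumed endpoint
    uid = client.launch_model(
        model_name="gte-Qwen2-7B-instruct",  # assumed spec name containing "gte" and "qwen2"
        model_type="embedding",
        torch_dtype="bfloat16",  # read via self._kwargs.get("torch_dtype")
    )
    embeddings = client.get_model(uid).create_embedding(["hello world"])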
xinference/model/image/model_spec.json CHANGED
@@ -24,7 +24,8 @@
     "model_revision": "ea42f8cef0f178587cf766dc8129abd379c90671",
     "model_ability": [
       "text2image",
-      "image2image"
+      "image2image",
+      "inpainting"
     ]
   },
   {
xinference/model/image/model_spec_modelscope.json CHANGED
@@ -27,7 +27,8 @@
     "model_revision": "master",
     "model_ability": [
       "text2image",
-      "image2image"
+      "image2image",
+      "inpainting"
     ]
   },
   {