xinference 0.13.2__py3-none-any.whl → 0.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/__init__.py +0 -1
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +26 -4
- xinference/client/restful/restful_client.py +16 -1
- xinference/core/chat_interface.py +2 -2
- xinference/core/model.py +8 -3
- xinference/core/scheduler.py +4 -4
- xinference/model/audio/core.py +5 -2
- xinference/model/audio/cosyvoice.py +136 -0
- xinference/model/audio/model_spec.json +24 -0
- xinference/model/audio/model_spec_modelscope.json +27 -0
- xinference/model/flexible/launchers/__init__.py +1 -0
- xinference/model/flexible/launchers/image_process_launcher.py +70 -0
- xinference/model/image/model_spec.json +7 -0
- xinference/model/image/stable_diffusion/core.py +6 -1
- xinference/model/llm/llm_family.json +802 -82
- xinference/model/llm/llm_family_csghub.json +39 -0
- xinference/model/llm/llm_family_modelscope.json +295 -47
- xinference/model/llm/pytorch/chatglm.py +243 -5
- xinference/model/llm/pytorch/cogvlm2.py +1 -1
- xinference/model/llm/utils.py +78 -1
- xinference/model/llm/vllm/core.py +8 -0
- xinference/thirdparty/cosyvoice/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +114 -0
- xinference/thirdparty/cosyvoice/bin/train.py +136 -0
- xinference/thirdparty/cosyvoice/cli/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +83 -0
- xinference/thirdparty/cosyvoice/cli/frontend.py +168 -0
- xinference/thirdparty/cosyvoice/cli/model.py +60 -0
- xinference/thirdparty/cosyvoice/dataset/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/dataset/dataset.py +160 -0
- xinference/thirdparty/cosyvoice/dataset/processor.py +369 -0
- xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/flow/decoder.py +222 -0
- xinference/thirdparty/cosyvoice/flow/flow.py +135 -0
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +138 -0
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +49 -0
- xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/cosyvoice/hifigan/generator.py +391 -0
- xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/llm/llm.py +206 -0
- xinference/thirdparty/cosyvoice/transformer/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/transformer/activation.py +84 -0
- xinference/thirdparty/cosyvoice/transformer/attention.py +326 -0
- xinference/thirdparty/cosyvoice/transformer/convolution.py +145 -0
- xinference/thirdparty/cosyvoice/transformer/decoder.py +396 -0
- xinference/thirdparty/cosyvoice/transformer/decoder_layer.py +132 -0
- xinference/thirdparty/cosyvoice/transformer/embedding.py +293 -0
- xinference/thirdparty/cosyvoice/transformer/encoder.py +472 -0
- xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +236 -0
- xinference/thirdparty/cosyvoice/transformer/label_smoothing_loss.py +96 -0
- xinference/thirdparty/cosyvoice/transformer/positionwise_feed_forward.py +115 -0
- xinference/thirdparty/cosyvoice/transformer/subsampling.py +383 -0
- xinference/thirdparty/cosyvoice/utils/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/utils/class_utils.py +70 -0
- xinference/thirdparty/cosyvoice/utils/common.py +103 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +110 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +41 -0
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +125 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +227 -0
- xinference/thirdparty/cosyvoice/utils/scheduler.py +739 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +289 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.95c1d652.js → main.2ef0cfaf.js} +3 -3
- xinference/web/ui/build/static/js/main.2ef0cfaf.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6807ecc0c231fea699533518a0eb2a2bf68a081ce00d452be40600dbffa17a7.json +1 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/METADATA +16 -8
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/RECORD +76 -32
- xinference/web/ui/build/static/js/main.95c1d652.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +0 -1
- /xinference/web/ui/build/static/js/{main.95c1d652.js.LICENSE.txt → main.2ef0cfaf.js.LICENSE.txt} +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/LICENSE +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/WHEEL +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/top_level.txt +0 -0
xinference/__init__.py
CHANGED
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2024-07-
+ "date": "2024-07-26T18:42:50+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.13.2"
+ "full-revisionid": "aa51ff22dbfb5644554436270deaf57a7ebaf066",
+ "version": "0.13.3"
 }
 '''  # END VERSION_JSON

xinference/api/restful_api.py
CHANGED
@@ -130,6 +130,7 @@ class SpeechRequest(BaseModel):
     response_format: Optional[str] = "mp3"
     speed: Optional[float] = 1.0
     stream: Optional[bool] = False
+    kwargs: Optional[str] = None


 class RegisterModelRequest(BaseModel):
@@ -1309,8 +1310,18 @@ class RESTfulAPI:
            await self._report_error_event(model_uid, str(e))
            raise HTTPException(status_code=500, detail=str(e))

-    async def create_speech(
-
+    async def create_speech(
+        self,
+        request: Request,
+        prompt_speech: Optional[UploadFile] = File(
+            None, media_type="application/octet-stream"
+        ),
+    ) -> Response:
+        if prompt_speech:
+            f = await request.form()
+        else:
+            f = await request.json()
+        body = SpeechRequest.parse_obj(f)
         model_uid = body.model
         try:
             model = await (await self._get_supervisor_ref()).get_model(model_uid)
@@ -1324,12 +1335,19 @@ class RESTfulAPI:
             raise HTTPException(status_code=500, detail=str(e))

         try:
+            if body.kwargs is not None:
+                parsed_kwargs = json.loads(body.kwargs)
+            else:
+                parsed_kwargs = {}
+            if prompt_speech is not None:
+                parsed_kwargs["prompt_speech"] = await prompt_speech.read()
             out = await model.speech(
                 input=body.input,
                 voice=body.voice,
                 response_format=body.response_format,
                 speed=body.speed,
                 stream=body.stream,
+                **parsed_kwargs,
             )
             if body.stream:
                 return EventSourceResponse(
@@ -1626,10 +1644,14 @@ class RESTfulAPI:
         if body.tools and body.stream:
             is_vllm = await model.is_vllm_backend()

-            if not
+            if not (
+                (is_vllm and model_family in QWEN_TOOL_CALL_FAMILY)
+                or (not is_vllm and model_family in GLM4_TOOL_CALL_FAMILY)
+            ):
                 raise HTTPException(
                     status_code=400,
-                    detail="Streaming support for tool calls is available only when using
+                    detail="Streaming support for tool calls is available only when using "
+                    "Qwen models with vLLM backend or GLM4-chat models without vLLM backend.",
                 )

         if body.stream:
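
With these changes, /v1/audio/speech accepts either a plain JSON body (as before) or, when a reference clip is attached, a multipart form whose extra options travel in the new string-encoded kwargs field. A minimal sketch of both request shapes, assuming a local endpoint and a made-up model UID:

import json
import requests

base_url = "http://127.0.0.1:9997"  # hypothetical local Xinference endpoint

# Plain JSON body, handled by the `request.json()` branch
resp = requests.post(
    f"{base_url}/v1/audio/speech",
    json={"model": "my-tts", "input": "Hello!", "voice": ""},
)

# Multipart form, handled by the `request.form()` branch; the reference
# clip rides as an upload and extra options are JSON-encoded into "kwargs"
with open("reference.wav", "rb") as f:
    resp = requests.post(
        f"{base_url}/v1/audio/speech",
        data={
            "model": "my-tts",
            "input": "Hello!",
            "voice": "",
            "kwargs": json.dumps({"prompt_text": "transcript of the clip"}),
        },
        files={"prompt_speech": ("prompt_speech", f, "application/octet-stream")},
    )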

xinference/client/restful/restful_client.py
CHANGED

@@ -768,6 +768,8 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         response_format: str = "mp3",
         speed: float = 1.0,
         stream: bool = False,
+        prompt_speech: Optional[bytes] = None,
+        **kwargs,
     ):
         """
         Generates audio from the input text.
@@ -799,8 +801,21 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             "response_format": response_format,
             "speed": speed,
             "stream": stream,
+            "kwargs": json.dumps(kwargs),
         }
-        response = requests.post(url, json=params, headers=self.auth_headers)
+        if prompt_speech:
+            files: List[Any] = []
+            files.append(
+                (
+                    "prompt_speech",
+                    ("prompt_speech", prompt_speech, "application/octet-stream"),
+                )
+            )
+            response = requests.post(
+                url, data=params, files=files, headers=self.auth_headers
+            )
+        else:
+            response = requests.post(url, json=params, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to speech the text, detail: {_get_error_string(response)}"
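
The client handle mirrors the server change: extra keyword arguments are JSON-encoded into the kwargs form field, and prompt_speech bytes switch the request to multipart. A hedged usage sketch (endpoint and model UID are made up):

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint
model = client.get_model("my-audio-model-uid")   # an audio model handle

# Voice cloning from a reference clip: the handle sends multipart and
# forwards prompt_text through the JSON-encoded "kwargs" field
with open("reference.wav", "rb") as f:
    audio_bytes = model.speech(
        "Text to synthesize",
        voice="",
        prompt_speech=f.read(),
        prompt_text="Transcript of the reference clip",
    )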
xinference/core/chat_interface.py
CHANGED

@@ -428,7 +428,7 @@ class GradioInterface:
             }

             hist.append(response_content)
-            return {
+            return {  # type: ignore
                 textbox: response_content,
                 history: hist,
             }
@@ -467,7 +467,7 @@ class GradioInterface:
             }

             hist.append(response_content)
-            return {
+            return {  # type: ignore
                 textbox: response_content,
                 history: hist,
             }
xinference/core/model.py
CHANGED
@@ -646,7 +646,10 @@ class ModelActor(xo.StatelessActor):
                 f"Model {self._model.model_spec} is not for creating translations."
             )

-    @log_async(
+    @log_async(
+        logger=logger,
+        args_formatter=lambda _, kwargs: kwargs.pop("prompt_speech", None),
+    )
     @request_limit
     @xo.generator
     async def speech(
@@ -656,6 +659,7 @@ class ModelActor(xo.StatelessActor):
         response_format: str = "mp3",
         speed: float = 1.0,
         stream: bool = False,
+        **kwargs,
     ):
         if hasattr(self._model, "speech"):
             return await self._call_wrapper_binary(
@@ -665,6 +669,7 @@ class ModelActor(xo.StatelessActor):
             response_format,
             speed,
             stream,
+            **kwargs,
         )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating speech."
@@ -735,7 +740,7 @@ class ModelActor(xo.StatelessActor):
         **kwargs,
     ):
         if hasattr(self._model, "inpainting"):
-            return await self.
+            return await self._call_wrapper_json(
                 self._model.inpainting,
                 image,
                 mask_image,
@@ -758,7 +763,7 @@ class ModelActor(xo.StatelessActor):
         **kwargs,
     ):
         if hasattr(self._model, "infer"):
-            return await self.
+            return await self._call_wrapper_json(
                 self._model.infer,
                 **kwargs,
             )
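
The args_formatter lambda on @log_async pops the binary prompt_speech payload out of kwargs so raw audio bytes never end up in a log line. A small illustration of the effect (log_async's internals are not shown in this diff, so this is just the lambda applied by hand):

kwargs = {"input": "hi", "prompt_speech": b"\x00" * 1_000_000}
formatter = lambda _, kw: kw.pop("prompt_speech", None)
formatter(None, kwargs)  # strips the megabyte of audio bytes in place
print(kwargs)            # {'input': 'hi'} -- now cheap and safe to log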
xinference/core/scheduler.py
CHANGED
@@ -81,7 +81,7 @@ class InferenceRequest:
         self.future_or_queue = future_or_queue
         # Record error message when this request has error.
         # Must set stopped=True when this field is set.
-        self.error_msg: Optional[str] = None
+        self.error_msg: Optional[str] = None  # type: ignore
         # For compatibility. Record some extra parameters for some special cases.
         self.extra_kwargs = {}

@@ -295,11 +295,11 @@ class SchedulerActor(xo.StatelessActor):

     def __init__(self):
         super().__init__()
-        self._waiting_queue: deque[InferenceRequest] = deque()
-        self._running_queue: deque[InferenceRequest] = deque()
+        self._waiting_queue: deque[InferenceRequest] = deque()  # type: ignore
+        self._running_queue: deque[InferenceRequest] = deque()  # type: ignore
         self._model = None
         self._id_to_req = {}
-        self._abort_req_ids: Set[str] = set()
+        self._abort_req_ids: Set[str] = set()  # type: ignore
         self._isolation = None

     async def __post_create__(self):
xinference/model/audio/core.py
CHANGED
@@ -20,6 +20,7 @@ from ...constants import XINFERENCE_CACHE_DIR
 from ..core import CacheableModelSpec, ModelDescription
 from ..utils import valid_model_revision
 from .chattts import ChatTTSModel
+from .cosyvoice import CosyVoiceModel
 from .whisper import WhisperModel

 MAX_ATTEMPTS = 3
@@ -150,14 +151,16 @@ def create_audio_model_instance(
     model_name: str,
     download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
     **kwargs,
-) -> Tuple[Union[WhisperModel, ChatTTSModel], AudioModelDescription]:
+) -> Tuple[Union[WhisperModel, ChatTTSModel, CosyVoiceModel], AudioModelDescription]:
     model_spec = match_audio(model_name, download_hub)
     model_path = cache(model_spec)
-    model: Union[WhisperModel, ChatTTSModel]
+    model: Union[WhisperModel, ChatTTSModel, CosyVoiceModel]
     if model_spec.model_family == "whisper":
         model = WhisperModel(model_uid, model_path, model_spec, **kwargs)
     elif model_spec.model_family == "ChatTTS":
         model = ChatTTSModel(model_uid, model_path, model_spec, **kwargs)
+    elif model_spec.model_family == "CosyVoice":
+        model = CosyVoiceModel(model_uid, model_path, model_spec, **kwargs)
     else:
         raise Exception(f"Unsupported audio model family: {model_spec.model_family}")
     model_description = AudioModelDescription(
xinference/model/audio/cosyvoice.py
ADDED

@@ -0,0 +1,136 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import io
+import logging
+from io import BytesIO
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class CosyVoiceModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV1",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._kwargs = kwargs
+
+    def load(self):
+        import os
+        import sys
+
+        # The yaml config loaded from the model has hard-coded the import paths. Please refer to: load_hyperpyyaml
+        sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../thirdparty"))
+
+        from cosyvoice.cli.cosyvoice import CosyVoice
+
+        self._model = CosyVoice(self._model_path)
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        if stream:
+            raise Exception("CosyVoiceModel does not support stream.")
+
+        import torchaudio
+        from cosyvoice.utils.file_utils import load_wav
+
+        prompt_speech: Optional[bytes] = kwargs.pop("prompt_speech", None)
+        prompt_text: Optional[str] = kwargs.pop("prompt_text", None)
+        instruct_text: Optional[str] = kwargs.pop("instruct_text", None)
+
+        if "SFT" in self._model_spec.model_name:
+            # inference_sft
+            assert (
+                prompt_speech is None
+            ), "CosyVoice SFT model does not support prompt_speech"
+            assert (
+                prompt_text is None
+            ), "CosyVoice SFT model does not support prompt_text"
+            assert (
+                instruct_text is None
+            ), "CosyVoice SFT model does not support instruct_text"
+        elif "Instruct" in self._model_spec.model_name:
+            # inference_instruct
+            assert (
+                prompt_speech is None
+            ), "CosyVoice Instruct model does not support prompt_speech"
+            assert (
+                prompt_text is None
+            ), "CosyVoice Instruct model does not support prompt_text"
+            assert (
+                instruct_text is not None
+            ), "CosyVoice Instruct model expects an instruct_text"
+        else:
+            # inference_zero_shot
+            # inference_cross_lingual
+            assert prompt_speech is not None, "CosyVoice model expects a prompt_speech"
+            assert (
+                instruct_text is None
+            ), "CosyVoice model does not support instruct_text"
+
+        assert self._model is not None
+        if prompt_speech:
+            assert not voice, "voice can't be set with prompt speech."
+            with io.BytesIO(prompt_speech) as prompt_speech_io:
+                prompt_speech_16k = load_wav(prompt_speech_io, 16000)
+                if prompt_text:
+                    logger.info("CosyVoice inference_zero_shot")
+                    output = self._model.inference_zero_shot(
+                        input, prompt_text, prompt_speech_16k
+                    )
+                else:
+                    logger.info("CosyVoice inference_cross_lingual")
+                    output = self._model.inference_cross_lingual(
+                        input, prompt_speech_16k
+                    )
+        else:
+            available_speakers = self._model.list_avaliable_spks()
+            if not voice:
+                voice = available_speakers[0]
+            else:
+                assert (
+                    voice in available_speakers
+                ), f"Invalid voice {voice}, CosyVoice available speakers: {available_speakers}"
+            if instruct_text:
+                logger.info("CosyVoice inference_instruct")
+                output = self._model.inference_instruct(
+                    input, voice, instruct_text=instruct_text
+                )
+            else:
+                logger.info("CosyVoice inference_sft")
+                output = self._model.inference_sft(input, voice)
+
+        # Save the generated audio
+        with BytesIO() as out:
+            torchaudio.save(out, output["tts_speech"], 22050, format=response_format)
+            return out.getvalue()
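
Which CosyVoice inference path runs is decided by the model variant and the supplied kwargs. A hedged summary with hypothetical client calls (the speaker names and files below are illustrative, not from this diff):

# `model` is assumed to be a launched CosyVoice model's client handle.

# SFT variant: pick (or default to) a built-in speaker -> inference_sft
audio = model.speech("你好，世界", voice="中文女")

# Instruct variant: requires instruct_text -> inference_instruct
audio = model.speech("Hello", voice="中文男", instruct_text="Speak cheerfully.")

# Base 300M variant: requires a reference clip
with open("reference.wav", "rb") as f:
    ref = f.read()
audio = model.speech("Hello", voice="", prompt_speech=ref)  # inference_cross_lingual
audio = model.speech(
    "Hello", voice="", prompt_speech=ref, prompt_text="Hi"
)  # inference_zero_shot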
xinference/model/audio/model_spec.json
CHANGED

@@ -102,5 +102,29 @@
     "model_revision": "ce5913842aebd78e4a01a02d47244b8d62ac4ee3",
     "ability": "text-to-audio",
     "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M",
+    "model_family": "CosyVoice",
+    "model_id": "model-scope/CosyVoice-300M",
+    "model_revision": "ca4e036d2db2aa4731cc1747859a68044b6a4694",
+    "ability": "audio-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M-SFT",
+    "model_family": "CosyVoice",
+    "model_id": "model-scope/CosyVoice-300M-SFT",
+    "model_revision": "ab918940c6c134b1fc1f069246e67bad6b66abcb",
+    "ability": "text-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M-Instruct",
+    "model_family": "CosyVoice",
+    "model_id": "model-scope/CosyVoice-300M-Instruct",
+    "model_revision": "fb5f676733139f35670bed9b59a77d476b1aa898",
+    "ability": "text-to-audio",
+    "multilingual": true
   }
 ]
xinference/model/audio/model_spec_modelscope.json
CHANGED

@@ -16,5 +16,32 @@
     "model_revision": "master",
     "ability": "text-to-audio",
     "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M",
+    "model_family": "CosyVoice",
+    "model_hub": "modelscope",
+    "model_id": "iic/CosyVoice-300M",
+    "model_revision": "master",
+    "ability": "audio-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M-SFT",
+    "model_family": "CosyVoice",
+    "model_hub": "modelscope",
+    "model_id": "iic/CosyVoice-300M-SFT",
+    "model_revision": "master",
+    "ability": "text-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M-Instruct",
+    "model_family": "CosyVoice",
+    "model_hub": "modelscope",
+    "model_id": "iic/CosyVoice-300M-Instruct",
+    "model_revision": "master",
+    "ability": "text-to-audio",
+    "multilingual": true
   }
 ]
xinference/model/flexible/launchers/image_process_launcher.py
ADDED

@@ -0,0 +1,70 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+from io import BytesIO
+
+import PIL.Image
+import PIL.ImageOps
+
+from ....types import Image
+from ..core import FlexibleModel, FlexibleModelSpec
+
+
+class ImageRemoveBackgroundModel(FlexibleModel):
+    def infer(self, **kwargs):
+        invert = kwargs.get("invert", False)
+        b64_image: str = kwargs.get("image")  # type: ignore
+        only_mask = kwargs.pop("only_mask", True)
+        image_format = kwargs.pop("image_format", "PNG")
+        if not b64_image:
+            raise ValueError("No image found to remove background")
+        image = base64.b64decode(b64_image)
+
+        try:
+            from rembg import remove
+        except ImportError:
+            error_message = "Failed to import module 'rembg'"
+            installation_guide = [
+                "Please make sure 'rembg' is installed. ",
+                "You can install it by visiting the installation section of the git repo:\n",
+                "https://github.com/danielgatis/rembg?tab=readme-ov-file#installation",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        im = PIL.Image.open(BytesIO(image))
+        om = remove(im, only_mask=only_mask, **kwargs)
+        if invert:
+            om = PIL.ImageOps.invert(om)
+
+        buffered = BytesIO()
+        om.save(buffered, format=image_format)
+        img_str = base64.b64encode(buffered.getvalue()).decode()
+        return Image(url=None, b64_json=img_str)
+
+
+def launcher(model_uid: str, model_spec: FlexibleModelSpec, **kwargs) -> FlexibleModel:
+    task = kwargs.get("task")
+    device = kwargs.get("device")
+
+    if task == "remove_background":
+        return ImageRemoveBackgroundModel(
+            model_uid=model_uid,
+            model_path=model_spec.model_uri,  # type: ignore
+            device=device,
+            config=kwargs,
+        )
+    else:
+        raise ValueError(f"Unknown Task for image processing: {task}")
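
A hedged sketch of driving this launcher through the flexible-model path; the registration/launch step is assumed to have happened already (with task="remove_background"), and `model` is the resulting client handle:

import base64

with open("photo.png", "rb") as f:
    b64_image = base64.b64encode(f.read()).decode()

result = model.infer(
    image=b64_image,
    only_mask=False,     # False returns the cut-out image instead of the mask
    image_format="PNG",
)
# `result` carries the processed image back as base64 (b64_json)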
xinference/model/image/model_spec.json
CHANGED

@@ -106,5 +106,12 @@
     "model_id": "stabilityai/stable-diffusion-2-inpainting",
     "model_revision": "81a84f49b15956b60b4272a405ad3daef3da4590",
     "ability": "inpainting"
+  },
+  {
+    "model_name": "stable-diffusion-xl-inpainting",
+    "model_family": "stable_diffusion",
+    "model_id": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
+    "model_revision": "115134f363124c53c7d878647567d04daf26e41e",
+    "ability": "inpainting"
   }
 ]
xinference/model/image/stable_diffusion/core.py
CHANGED

@@ -94,7 +94,12 @@ class DiffusionModel:
             self._model_path,
             **self._kwargs,
         )
-        self.
+        if self._kwargs.get("cpu_offload", False):
+            logger.debug("CPU offloading model")
+            self._model.enable_model_cpu_offload()
+        else:
+            logger.debug("Loading model to available device")
+            self._model = move_model_to_available_device(self._model)
         # Recommended if your computer has < 64 GB of RAM
         self._model.enable_attention_slicing()
         self._apply_lora()
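
The new cpu_offload flag is read out of the kwargs captured at model load time, so it can be passed straight through launch_model. A hedged example (the endpoint is made up; the flag name comes from the diff above):

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint
model_uid = client.launch_model(
    model_name="stable-diffusion-xl-inpainting",
    model_type="image",
    cpu_offload=True,  # lands in DiffusionModel._kwargs and triggers
                       # enable_model_cpu_offload() instead of moving the
                       # whole pipeline onto the accelerator up front
)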