xinference 0.13.2__py3-none-any.whl → 0.13.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (103)
  1. xinference/__init__.py +0 -1
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +30 -5
  4. xinference/client/restful/restful_client.py +18 -3
  5. xinference/constants.py +0 -4
  6. xinference/core/chat_interface.py +2 -2
  7. xinference/core/image_interface.py +6 -3
  8. xinference/core/model.py +9 -4
  9. xinference/core/scheduler.py +4 -4
  10. xinference/core/supervisor.py +2 -0
  11. xinference/core/worker.py +7 -0
  12. xinference/deploy/utils.py +6 -0
  13. xinference/model/audio/core.py +9 -4
  14. xinference/model/audio/cosyvoice.py +136 -0
  15. xinference/model/audio/model_spec.json +24 -0
  16. xinference/model/audio/model_spec_modelscope.json +27 -0
  17. xinference/model/core.py +25 -4
  18. xinference/model/embedding/core.py +88 -13
  19. xinference/model/embedding/model_spec.json +8 -0
  20. xinference/model/embedding/model_spec_modelscope.json +8 -0
  21. xinference/model/flexible/core.py +8 -2
  22. xinference/model/flexible/launchers/__init__.py +1 -0
  23. xinference/model/flexible/launchers/image_process_launcher.py +70 -0
  24. xinference/model/image/core.py +8 -5
  25. xinference/model/image/model_spec.json +36 -5
  26. xinference/model/image/model_spec_modelscope.json +21 -3
  27. xinference/model/image/stable_diffusion/core.py +36 -28
  28. xinference/model/llm/core.py +6 -4
  29. xinference/model/llm/ggml/llamacpp.py +7 -5
  30. xinference/model/llm/llm_family.json +802 -82
  31. xinference/model/llm/llm_family.py +6 -6
  32. xinference/model/llm/llm_family_csghub.json +39 -0
  33. xinference/model/llm/llm_family_modelscope.json +295 -47
  34. xinference/model/llm/mlx/core.py +7 -0
  35. xinference/model/llm/pytorch/chatglm.py +246 -5
  36. xinference/model/llm/pytorch/cogvlm2.py +1 -1
  37. xinference/model/llm/pytorch/deepseek_vl.py +2 -1
  38. xinference/model/llm/pytorch/falcon.py +2 -1
  39. xinference/model/llm/pytorch/llama_2.py +4 -2
  40. xinference/model/llm/pytorch/omnilmm.py +2 -1
  41. xinference/model/llm/pytorch/qwen_vl.py +2 -1
  42. xinference/model/llm/pytorch/vicuna.py +2 -1
  43. xinference/model/llm/pytorch/yi_vl.py +2 -1
  44. xinference/model/llm/sglang/core.py +12 -6
  45. xinference/model/llm/utils.py +78 -1
  46. xinference/model/llm/vllm/core.py +9 -5
  47. xinference/model/rerank/core.py +4 -3
  48. xinference/thirdparty/cosyvoice/__init__.py +0 -0
  49. xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
  50. xinference/thirdparty/cosyvoice/bin/inference.py +114 -0
  51. xinference/thirdparty/cosyvoice/bin/train.py +136 -0
  52. xinference/thirdparty/cosyvoice/cli/__init__.py +0 -0
  53. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +83 -0
  54. xinference/thirdparty/cosyvoice/cli/frontend.py +168 -0
  55. xinference/thirdparty/cosyvoice/cli/model.py +60 -0
  56. xinference/thirdparty/cosyvoice/dataset/__init__.py +0 -0
  57. xinference/thirdparty/cosyvoice/dataset/dataset.py +160 -0
  58. xinference/thirdparty/cosyvoice/dataset/processor.py +369 -0
  59. xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
  60. xinference/thirdparty/cosyvoice/flow/decoder.py +222 -0
  61. xinference/thirdparty/cosyvoice/flow/flow.py +135 -0
  62. xinference/thirdparty/cosyvoice/flow/flow_matching.py +138 -0
  63. xinference/thirdparty/cosyvoice/flow/length_regulator.py +49 -0
  64. xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
  65. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +55 -0
  66. xinference/thirdparty/cosyvoice/hifigan/generator.py +391 -0
  67. xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
  68. xinference/thirdparty/cosyvoice/llm/llm.py +206 -0
  69. xinference/thirdparty/cosyvoice/transformer/__init__.py +0 -0
  70. xinference/thirdparty/cosyvoice/transformer/activation.py +84 -0
  71. xinference/thirdparty/cosyvoice/transformer/attention.py +326 -0
  72. xinference/thirdparty/cosyvoice/transformer/convolution.py +145 -0
  73. xinference/thirdparty/cosyvoice/transformer/decoder.py +396 -0
  74. xinference/thirdparty/cosyvoice/transformer/decoder_layer.py +132 -0
  75. xinference/thirdparty/cosyvoice/transformer/embedding.py +293 -0
  76. xinference/thirdparty/cosyvoice/transformer/encoder.py +472 -0
  77. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +236 -0
  78. xinference/thirdparty/cosyvoice/transformer/label_smoothing_loss.py +96 -0
  79. xinference/thirdparty/cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  80. xinference/thirdparty/cosyvoice/transformer/subsampling.py +383 -0
  81. xinference/thirdparty/cosyvoice/utils/__init__.py +0 -0
  82. xinference/thirdparty/cosyvoice/utils/class_utils.py +70 -0
  83. xinference/thirdparty/cosyvoice/utils/common.py +103 -0
  84. xinference/thirdparty/cosyvoice/utils/executor.py +110 -0
  85. xinference/thirdparty/cosyvoice/utils/file_utils.py +41 -0
  86. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +125 -0
  87. xinference/thirdparty/cosyvoice/utils/mask.py +227 -0
  88. xinference/thirdparty/cosyvoice/utils/scheduler.py +739 -0
  89. xinference/thirdparty/cosyvoice/utils/train_utils.py +289 -0
  90. xinference/web/ui/build/asset-manifest.json +3 -3
  91. xinference/web/ui/build/index.html +1 -1
  92. xinference/web/ui/build/static/js/{main.95c1d652.js → main.af906659.js} +3 -3
  93. xinference/web/ui/build/static/js/main.af906659.js.map +1 -0
  94. xinference/web/ui/node_modules/.cache/babel-loader/2cd5e4279ad7e13a1f41d486e9fca7756295bfad5bd77d90992f4ac3e10b496d.json +1 -0
  95. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/METADATA +39 -11
  96. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/RECORD +101 -57
  97. xinference/web/ui/build/static/js/main.95c1d652.js.map +0 -1
  98. xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +0 -1
  99. /xinference/web/ui/build/static/js/{main.95c1d652.js.LICENSE.txt → main.af906659.js.LICENSE.txt} +0 -0
  100. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/LICENSE +0 -0
  101. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/WHEEL +0 -0
  102. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/entry_points.txt +0 -0
  103. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/top_level.txt +0 -0
xinference/__init__.py CHANGED
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 from . import _version
 
 __version__ = _version.get_versions()["version"]
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-07-19T19:15:54+0800",
+ "date": "2024-08-02T16:08:07+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "880929cbbc73e5206ca069591b03d9d16dd858bf",
- "version": "0.13.2"
+ "full-revisionid": "dd85cfe015c9cd2d8110c79213640aa0e21f3a6a",
+ "version": "0.13.4"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py CHANGED
@@ -130,6 +130,7 @@ class SpeechRequest(BaseModel):
     response_format: Optional[str] = "mp3"
     speed: Optional[float] = 1.0
     stream: Optional[bool] = False
+    kwargs: Optional[str] = None
 
 
 class RegisterModelRequest(BaseModel):
@@ -796,6 +797,7 @@ class RESTfulAPI:
         worker_ip = payload.get("worker_ip", None)
         gpu_idx = payload.get("gpu_idx", None)
         download_hub = payload.get("download_hub", None)
+        model_path = payload.get("model_path", None)
 
         exclude_keys = {
             "model_uid",
@@ -812,6 +814,7 @@
             "worker_ip",
             "gpu_idx",
             "download_hub",
+            "model_path",
         }
 
         kwargs = {
@@ -860,6 +863,7 @@
                 worker_ip=worker_ip,
                 gpu_idx=gpu_idx,
                 download_hub=download_hub,
+                model_path=model_path,
                 **kwargs,
             )
         except ValueError as ve:
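
The three hunks above thread a new `model_path` launch option through the request payload. A minimal sketch of launching against the raw REST endpoint (the `/v1/models` route and port 9997 are xinference defaults; the model name and path are illustrative):

    import requests

    resp = requests.post(
        "http://127.0.0.1:9997/v1/models",
        json={
            "model_uid": "my-llm",
            "model_name": "qwen2-instruct",      # illustrative
            "model_type": "LLM",
            "model_path": "/data/models/qwen2",  # new in 0.13.4; must exist on the worker
        },
    )
    resp.raise_for_status()

The worker-side validation for this path appears in xinference/core/worker.py further down.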
@@ -1309,8 +1313,18 @@
             await self._report_error_event(model_uid, str(e))
             raise HTTPException(status_code=500, detail=str(e))
 
-    async def create_speech(self, request: Request) -> Response:
-        body = SpeechRequest.parse_obj(await request.json())
+    async def create_speech(
+        self,
+        request: Request,
+        prompt_speech: Optional[UploadFile] = File(
+            None, media_type="application/octet-stream"
+        ),
+    ) -> Response:
+        if prompt_speech:
+            f = await request.form()
+        else:
+            f = await request.json()
+        body = SpeechRequest.parse_obj(f)
         model_uid = body.model
         try:
             model = await (await self._get_supervisor_ref()).get_model(model_uid)
@@ -1324,12 +1338,19 @@
             raise HTTPException(status_code=500, detail=str(e))
 
         try:
+            if body.kwargs is not None:
+                parsed_kwargs = json.loads(body.kwargs)
+            else:
+                parsed_kwargs = {}
+            if prompt_speech is not None:
+                parsed_kwargs["prompt_speech"] = await prompt_speech.read()
             out = await model.speech(
                 input=body.input,
                 voice=body.voice,
                 response_format=body.response_format,
                 speed=body.speed,
                 stream=body.stream,
+                **parsed_kwargs,
             )
             if body.stream:
                 return EventSourceResponse(
@@ -1389,7 +1410,7 @@
         negative_prompt: Optional[Union[str, List[str]]] = Form(None),
         n: Optional[int] = Form(1),
         response_format: Optional[str] = Form("url"),
-        size: Optional[str] = Form("1024*1024"),
+        size: Optional[str] = Form(None),
         kwargs: Optional[str] = Form(None),
     ) -> Response:
         model_uid = model
@@ -1626,10 +1647,14 @@
         if body.tools and body.stream:
             is_vllm = await model.is_vllm_backend()
 
-            if not is_vllm or model_family not in QWEN_TOOL_CALL_FAMILY:
+            if not (
+                (is_vllm and model_family in QWEN_TOOL_CALL_FAMILY)
+                or (not is_vllm and model_family in GLM4_TOOL_CALL_FAMILY)
+            ):
                 raise HTTPException(
                     status_code=400,
-                    detail="Streaming support for tool calls is available only when using vLLM backend and Qwen models.",
+                    detail="Streaming support for tool calls is available only when using "
+                    "Qwen models with vLLM backend or GLM4-chat models without vLLM backend.",
                 )
 
         if body.stream:
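
The reworked `create_speech` accepts an optional `prompt_speech` upload: when present, the request is parsed as multipart form data rather than JSON, and extra options travel as a JSON-encoded `kwargs` form field that is unpacked into `model.speech(...)`. A sketch of calling the multipart branch directly (route and port are the defaults; uid and file names are illustrative):

    import json
    import requests

    with open("reference.wav", "rb") as f:
        prompt_speech = f.read()

    resp = requests.post(
        "http://127.0.0.1:9997/v1/audio/speech",
        data={
            "model": "CosyVoice-300M",  # illustrative model uid
            "input": "Hello from a cloned voice.",
            # extra options ride in the JSON-encoded kwargs field
            "kwargs": json.dumps({"prompt_text": "Transcript of the reference clip."}),
        },
        files={
            "prompt_speech": ("prompt_speech", prompt_speech, "application/octet-stream")
        },
    )
    audio_bytes = resp.content  # raw audio in the requested response_format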
xinference/client/restful/restful_client.py CHANGED
@@ -234,9 +234,9 @@ class RESTfulImageModelHandle(RESTfulModelHandle):
         self,
         image: Union[str, bytes],
         prompt: str,
-        negative_prompt: str,
+        negative_prompt: Optional[str] = None,
         n: int = 1,
-        size: str = "1024*1024",
+        size: Optional[str] = None,
         response_format: str = "url",
         **kwargs,
     ) -> "ImageList":
@@ -768,6 +768,8 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         response_format: str = "mp3",
         speed: float = 1.0,
         stream: bool = False,
+        prompt_speech: Optional[bytes] = None,
+        **kwargs,
     ):
         """
         Generates audio from the input text.
@@ -799,8 +801,21 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             "response_format": response_format,
             "speed": speed,
             "stream": stream,
+            "kwargs": json.dumps(kwargs),
         }
-        response = requests.post(url, json=params, headers=self.auth_headers)
+        if prompt_speech:
+            files: List[Any] = []
+            files.append(
+                (
+                    "prompt_speech",
+                    ("prompt_speech", prompt_speech, "application/octet-stream"),
+                )
+            )
+            response = requests.post(
+                url, data=params, files=files, headers=self.auth_headers
+            )
+        else:
+            response = requests.post(url, json=params, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to speech the text, detail: {_get_error_string(response)}"
xinference/constants.py CHANGED
@@ -26,8 +26,6 @@ XINFERENCE_ENV_HEALTH_CHECK_FAILURE_THRESHOLD = (
 XINFERENCE_ENV_HEALTH_CHECK_INTERVAL = "XINFERENCE_HEALTH_CHECK_INTERVAL"
 XINFERENCE_ENV_HEALTH_CHECK_TIMEOUT = "XINFERENCE_HEALTH_CHECK_TIMEOUT"
 XINFERENCE_ENV_DISABLE_HEALTH_CHECK = "XINFERENCE_DISABLE_HEALTH_CHECK"
-XINFERENCE_ENV_DISABLE_VLLM = "XINFERENCE_DISABLE_VLLM"
-XINFERENCE_ENV_ENABLE_SGLANG = "XINFERENCE_ENABLE_SGLANG"
 XINFERENCE_ENV_DISABLE_METRICS = "XINFERENCE_DISABLE_METRICS"
 XINFERENCE_ENV_TRANSFORMERS_ENABLE_BATCHING = "XINFERENCE_TRANSFORMERS_ENABLE_BATCHING"
 
@@ -72,8 +70,6 @@ XINFERENCE_HEALTH_CHECK_TIMEOUT = int(
 XINFERENCE_DISABLE_HEALTH_CHECK = bool(
     int(os.environ.get(XINFERENCE_ENV_DISABLE_HEALTH_CHECK, 0))
 )
-XINFERENCE_DISABLE_VLLM = bool(int(os.environ.get(XINFERENCE_ENV_DISABLE_VLLM, 0)))
-XINFERENCE_ENABLE_SGLANG = bool(int(os.environ.get(XINFERENCE_ENV_ENABLE_SGLANG, 0)))
 XINFERENCE_DISABLE_METRICS = bool(
     int(os.environ.get(XINFERENCE_ENV_DISABLE_METRICS, 0))
 )
xinference/core/chat_interface.py CHANGED
@@ -428,7 +428,7 @@ class GradioInterface:
             }
 
             hist.append(response_content)
-            return {
+            return {  # type: ignore
                 textbox: response_content,
                 history: hist,
             }
@@ -467,7 +467,7 @@
             }
 
             hist.append(response_content)
-            return {
+            return {  # type: ignore
                 textbox: response_content,
                 history: hist,
             }
xinference/core/image_interface.py CHANGED
@@ -153,7 +153,10 @@ class ImageInterface:
         model = client.get_model(self.model_uid)
         assert isinstance(model, RESTfulImageModelHandle)
 
-        size = f"{int(size_width)}*{int(size_height)}"
+        if size_width > 0 and size_height > 0:
+            size = f"{int(size_width)}*{int(size_height)}"
+        else:
+            size = None
 
         bio = io.BytesIO()
         image.save(bio, format="png")
@@ -195,8 +198,8 @@
 
         with gr.Row():
             n = gr.Number(label="Number of image", value=1)
-            size_width = gr.Number(label="Width", value=512)
-            size_height = gr.Number(label="Height", value=512)
+            size_width = gr.Number(label="Width", value=-1)
+            size_height = gr.Number(label="Height", value=-1)
 
         with gr.Row():
             with gr.Column(scale=1):
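
With `size` optional end to end, leaving Width/Height at the new -1 defaults (or omitting `size` in an API call) lets the image model fall back to its own native resolution instead of a forced 1024*1024. A sketch using the client handle changed earlier in this diff (uid and file names illustrative):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    model = client.get_model("my-sd-model")  # illustrative image model uid

    with open("input.png", "rb") as f:
        result = model.image_to_image(
            image=f.read(),
            prompt="a watercolor landscape",
            # size omitted -> None -> the model picks its default resolution
        )
    print(result["data"][0]["url"])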
xinference/core/model.py CHANGED
@@ -646,7 +646,10 @@ class ModelActor(xo.StatelessActor):
                 f"Model {self._model.model_spec} is not for creating translations."
             )
 
-    @log_async(logger=logger)
+    @log_async(
+        logger=logger,
+        args_formatter=lambda _, kwargs: kwargs.pop("prompt_speech", None),
+    )
     @request_limit
     @xo.generator
     async def speech(
@@ -656,6 +659,7 @@
         response_format: str = "mp3",
         speed: float = 1.0,
         stream: bool = False,
+        **kwargs,
     ):
         if hasattr(self._model, "speech"):
             return await self._call_wrapper_binary(
@@ -665,6 +669,7 @@
                 response_format,
                 speed,
                 stream,
+                **kwargs,
             )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating speech."
@@ -701,7 +706,7 @@
         prompt: str,
         negative_prompt: str,
         n: int = 1,
-        size: str = "1024*1024",
+        size: Optional[str] = None,
         response_format: str = "url",
         *args,
         **kwargs,
@@ -735,7 +740,7 @@
         **kwargs,
     ):
         if hasattr(self._model, "inpainting"):
-            return await self._call_wrapper(
+            return await self._call_wrapper_json(
                 self._model.inpainting,
                 image,
                 mask_image,
@@ -758,7 +763,7 @@
         **kwargs,
     ):
         if hasattr(self._model, "infer"):
-            return await self._call_wrapper(
+            return await self._call_wrapper_json(
                 self._model.infer,
                 **kwargs,
             )
xinference/core/scheduler.py CHANGED
@@ -81,7 +81,7 @@ class InferenceRequest:
         self.future_or_queue = future_or_queue
         # Record error message when this request has error.
         # Must set stopped=True when this field is set.
-        self.error_msg: Optional[str] = None
+        self.error_msg: Optional[str] = None  # type: ignore
         # For compatibility. Record some extra parameters for some special cases.
         self.extra_kwargs = {}
 
@@ -295,11 +295,11 @@ class SchedulerActor(xo.StatelessActor):
 
     def __init__(self):
         super().__init__()
-        self._waiting_queue: deque[InferenceRequest] = deque()
-        self._running_queue: deque[InferenceRequest] = deque()
+        self._waiting_queue: deque[InferenceRequest] = deque()  # type: ignore
+        self._running_queue: deque[InferenceRequest] = deque()  # type: ignore
         self._model = None
         self._id_to_req = {}
-        self._abort_req_ids: Set[str] = set()
+        self._abort_req_ids: Set[str] = set()  # type: ignore
         self._isolation = None
 
     async def __post_create__(self):
xinference/core/supervisor.py CHANGED
@@ -859,6 +859,7 @@ class SupervisorActor(xo.StatelessActor):
         worker_ip: Optional[str] = None,
         gpu_idx: Optional[Union[int, List[int]]] = None,
         download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+        model_path: Optional[str] = None,
         **kwargs,
     ) -> str:
         # search in worker first
@@ -942,6 +943,7 @@
             peft_model_config=peft_model_config,
             gpu_idx=replica_gpu_idx,
             download_hub=download_hub,
+            model_path=model_path,
             **kwargs,
         )
         self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
xinference/core/worker.py CHANGED
@@ -743,6 +743,7 @@ class WorkerActor(xo.StatelessActor):
         request_limits: Optional[int] = None,
         gpu_idx: Optional[Union[int, List[int]]] = None,
         download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+        model_path: Optional[str] = None,
         **kwargs,
     ):
         # !!! Note that The following code must be placed at the very beginning of this function,
@@ -799,6 +800,11 @@
             raise ValueError(
                 f"PEFT adaptors can only be applied to pytorch-like models"
             )
+        if model_path is not None:
+            if not os.path.exists(model_path):
+                raise ValueError(
+                    f"Invalid input. `model_path`: {model_path} File or directory does not exist."
+                )
 
         assert model_uid not in self._model_uid_to_model
         self._check_model_is_valid(model_name, model_format)
@@ -826,6 +832,7 @@
             quantization,
             peft_model_config,
             download_hub,
+            model_path,
             **kwargs,
         )
         await self.update_cache_status(model_name, model_description)
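
This is the enforcement point for the REST-level `model_path` shown earlier: a nonexistent path is rejected before any load starts. From the Python client the same option can be supplied at launch time (a sketch, assuming the client forwards extra keyword arguments into the launch payload as it does for other launch options; name and path illustrative):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    uid = client.launch_model(
        model_name="ChatTTS",
        model_type="audio",
        model_path="/data/models/ChatTTS",  # must exist on the worker, else ValueError
    )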
xinference/deploy/utils.py CHANGED
@@ -27,6 +27,9 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+# mainly for k8s
+XINFERENCE_POD_NAME_ENV_KEY = "XINFERENCE_POD_NAME"
+
 
 class LoggerNameFilter(logging.Filter):
     def filter(self, record):
@@ -40,6 +43,9 @@ def get_log_file(sub_dir: str):
     """
     sub_dir should contain a timestamp.
     """
+    pod_name = os.environ.get(XINFERENCE_POD_NAME_ENV_KEY, None)
+    if pod_name is not None:
+        sub_dir = sub_dir + "_" + pod_name
     log_dir = os.path.join(XINFERENCE_LOG_DIR, sub_dir)
     # Here should be creating a new directory each time, so `exist_ok=False`
     os.makedirs(log_dir, exist_ok=False)
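
The new hook is mainly for Kubernetes, where several replicas sharing a volume would otherwise race to create the same timestamped log directory (note the deliberate `exist_ok=False`). A sketch of the resulting behavior, with the variable normally injected via the pod spec rather than set in code:

    import os

    os.environ["XINFERENCE_POD_NAME"] = "xinference-worker-0"  # e.g. from the k8s downward API

    sub_dir = "local_1722580800"  # illustrative timestamped name
    pod_name = os.environ.get("XINFERENCE_POD_NAME")
    if pod_name is not None:
        sub_dir = sub_dir + "_" + pod_name
    print(sub_dir)  # local_1722580800_xinference-worker-0, unique per pod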
xinference/model/audio/core.py CHANGED
@@ -20,6 +20,7 @@ from ...constants import XINFERENCE_CACHE_DIR
 from ..core import CacheableModelSpec, ModelDescription
 from ..utils import valid_model_revision
 from .chattts import ChatTTSModel
+from .cosyvoice import CosyVoiceModel
 from .whisper import WhisperModel
 
 MAX_ATTEMPTS = 3
@@ -149,18 +150,22 @@ def create_audio_model_instance(
     model_uid: str,
     model_name: str,
     download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    model_path: Optional[str] = None,
     **kwargs,
-) -> Tuple[Union[WhisperModel, ChatTTSModel], AudioModelDescription]:
+) -> Tuple[Union[WhisperModel, ChatTTSModel, CosyVoiceModel], AudioModelDescription]:
     model_spec = match_audio(model_name, download_hub)
-    model_path = cache(model_spec)
-    model: Union[WhisperModel, ChatTTSModel]
+    if model_path is None:
+        model_path = cache(model_spec)
+    model: Union[WhisperModel, ChatTTSModel, CosyVoiceModel]
     if model_spec.model_family == "whisper":
         model = WhisperModel(model_uid, model_path, model_spec, **kwargs)
     elif model_spec.model_family == "ChatTTS":
         model = ChatTTSModel(model_uid, model_path, model_spec, **kwargs)
+    elif model_spec.model_family == "CosyVoice":
+        model = CosyVoiceModel(model_uid, model_path, model_spec, **kwargs)
     else:
         raise Exception(f"Unsupported audio model family: {model_spec.model_family}")
     model_description = AudioModelDescription(
-        subpool_addr, devices, model_spec, model_path=model_path
+        subpool_addr, devices, model_spec, model_path
     )
     return model, model_description
xinference/model/audio/cosyvoice.py ADDED
@@ -0,0 +1,136 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import io
+import logging
+from io import BytesIO
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class CosyVoiceModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV1",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._kwargs = kwargs
+
+    def load(self):
+        import os
+        import sys
+
+        # The yaml config loaded from model has hard-coded the import paths. please refer to: load_hyperpyyaml
+        sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../thirdparty"))
+
+        from cosyvoice.cli.cosyvoice import CosyVoice
+
+        self._model = CosyVoice(self._model_path)
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        if stream:
+            raise Exception("CosyVoiceModel does not support stream.")
+
+        import torchaudio
+        from cosyvoice.utils.file_utils import load_wav
+
+        prompt_speech: Optional[bytes] = kwargs.pop("prompt_speech", None)
+        prompt_text: Optional[str] = kwargs.pop("prompt_text", None)
+        instruct_text: Optional[str] = kwargs.pop("instruct_text", None)
+
+        if "SFT" in self._model_spec.model_name:
+            # inference_sft
+            assert (
+                prompt_speech is None
+            ), "CosyVoice SFT model does not support prompt_speech"
+            assert (
+                prompt_text is None
+            ), "CosyVoice SFT model does not support prompt_text"
+            assert (
+                instruct_text is None
+            ), "CosyVoice SFT model does not support instruct_text"
+        elif "Instruct" in self._model_spec.model_name:
+            # inference_instruct
+            assert (
+                prompt_speech is None
+            ), "CosyVoice Instruct model does not support prompt_speech"
+            assert (
+                prompt_text is None
+            ), "CosyVoice Instruct model does not support prompt_text"
+            assert (
+                instruct_text is not None
+            ), "CosyVoice Instruct model expect a instruct_text"
+        else:
+            # inference_zero_shot
+            # inference_cross_lingual
+            assert prompt_speech is not None, "CosyVoice model expect a prompt_speech"
+            assert (
+                instruct_text is None
+            ), "CosyVoice model does not support instruct_text"
+
+        assert self._model is not None
+        if prompt_speech:
+            assert not voice, "voice can't be set with prompt speech."
+            with io.BytesIO(prompt_speech) as prompt_speech_io:
+                prompt_speech_16k = load_wav(prompt_speech_io, 16000)
+                if prompt_text:
+                    logger.info("CosyVoice inference_zero_shot")
+                    output = self._model.inference_zero_shot(
+                        input, prompt_text, prompt_speech_16k
+                    )
+                else:
+                    logger.info("CosyVoice inference_cross_lingual")
+                    output = self._model.inference_cross_lingual(
+                        input, prompt_speech_16k
+                    )
+        else:
+            available_speakers = self._model.list_avaliable_spks()
+            if not voice:
+                voice = available_speakers[0]
+            else:
+                assert (
+                    voice in available_speakers
+                ), f"Invalid voice {voice}, CosyVoice available speakers: {available_speakers}"
+            if instruct_text:
+                logger.info("CosyVoice inference_instruct")
+                output = self._model.inference_instruct(
+                    input, voice, instruct_text=instruct_text
+                )
+            else:
+                logger.info("CosyVoice inference_sft")
+                output = self._model.inference_sft(input, voice)
+
+        # Save the generated audio
+        with BytesIO() as out:
+            torchaudio.save(out, output["tts_speech"], 22050, format=response_format)
+            return out.getvalue()
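
To recap the dispatch above: SFT variants accept only a `voice`; Instruct variants require `instruct_text` alongside a `voice`; the base model requires `prompt_speech` and runs zero-shot cloning when `prompt_text` is also supplied, cross-lingual synthesis otherwise. A sketch of the instruct path through the client (uid and speaker name illustrative; `instruct_text` rides in the JSON-encoded kwargs field):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    model = client.get_model("CosyVoice-300M-Instruct")  # illustrative uid

    audio = model.speech(
        "Welcome to the show.",
        voice="中文女",  # must be one of the model's list_avaliable_spks()
        instruct_text="Speak slowly, in a warm and friendly tone.",
    )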
xinference/model/audio/model_spec.json CHANGED
@@ -102,5 +102,29 @@
     "model_revision": "ce5913842aebd78e4a01a02d47244b8d62ac4ee3",
     "ability": "text-to-audio",
     "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M",
+    "model_family": "CosyVoice",
+    "model_id": "model-scope/CosyVoice-300M",
+    "model_revision": "ca4e036d2db2aa4731cc1747859a68044b6a4694",
+    "ability": "audio-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M-SFT",
+    "model_family": "CosyVoice",
+    "model_id": "model-scope/CosyVoice-300M-SFT",
+    "model_revision": "ab918940c6c134b1fc1f069246e67bad6b66abcb",
+    "ability": "text-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M-Instruct",
+    "model_family": "CosyVoice",
+    "model_id": "model-scope/CosyVoice-300M-Instruct",
+    "model_revision": "fb5f676733139f35670bed9b59a77d476b1aa898",
+    "ability": "text-to-audio",
+    "multilingual": true
   }
 ]
xinference/model/audio/model_spec_modelscope.json CHANGED
@@ -16,5 +16,32 @@
     "model_revision": "master",
     "ability": "text-to-audio",
     "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M",
+    "model_family": "CosyVoice",
+    "model_hub": "modelscope",
+    "model_id": "iic/CosyVoice-300M",
+    "model_revision": "master",
+    "ability": "audio-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M-SFT",
+    "model_family": "CosyVoice",
+    "model_hub": "modelscope",
+    "model_id": "iic/CosyVoice-300M-SFT",
+    "model_revision": "master",
+    "ability": "text-to-audio",
+    "multilingual": true
+  },
+  {
+    "model_name": "CosyVoice-300M-Instruct",
+    "model_family": "CosyVoice",
+    "model_hub": "modelscope",
+    "model_id": "iic/CosyVoice-300M-Instruct",
+    "model_revision": "master",
+    "ability": "text-to-audio",
+    "multilingual": true
   }
 ]
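
These entries make the three CosyVoice variants launchable from ModelScope without Hugging Face access. A sketch (assuming `download_hub` is accepted by the client's launch call the same way the REST payload accepts it above):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    uid = client.launch_model(
        model_name="CosyVoice-300M-SFT",
        model_type="audio",
        download_hub="modelscope",  # pulls iic/CosyVoice-300M-SFT at revision "master"
    )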
xinference/model/core.py CHANGED
@@ -56,6 +56,7 @@ def create_model_instance(
     quantization: Optional[str] = None,
     peft_model_config: Optional[PeftModelConfig] = None,
     download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[Any, ModelDescription]:
     from .audio.core import create_audio_model_instance
@@ -77,13 +78,20 @@
             quantization,
             peft_model_config,
             download_hub,
+            model_path,
             **kwargs,
         )
     elif model_type == "embedding":
         # embedding model doesn't accept trust_remote_code
         kwargs.pop("trust_remote_code", None)
         return create_embedding_model_instance(
-            subpool_addr, devices, model_uid, model_name, download_hub, **kwargs
+            subpool_addr,
+            devices,
+            model_uid,
+            model_name,
+            download_hub,
+            model_path,
+            **kwargs,
         )
     elif model_type == "image":
         kwargs.pop("trust_remote_code", None)
@@ -94,22 +102,35 @@
             model_name,
             peft_model_config,
             download_hub,
+            model_path,
             **kwargs,
         )
     elif model_type == "rerank":
         kwargs.pop("trust_remote_code", None)
         return create_rerank_model_instance(
-            subpool_addr, devices, model_uid, model_name, download_hub, **kwargs
+            subpool_addr,
+            devices,
+            model_uid,
+            model_name,
+            download_hub,
+            model_path,
+            **kwargs,
         )
     elif model_type == "audio":
         kwargs.pop("trust_remote_code", None)
         return create_audio_model_instance(
-            subpool_addr, devices, model_uid, model_name, download_hub, **kwargs
+            subpool_addr,
+            devices,
+            model_uid,
+            model_name,
+            download_hub,
+            model_path,
+            **kwargs,
        )
     elif model_type == "flexible":
         kwargs.pop("trust_remote_code", None)
         return create_flexible_model_instance(
-            subpool_addr, devices, model_uid, model_name, **kwargs
+            subpool_addr, devices, model_uid, model_name, model_path, **kwargs
         )
     else:
         raise ValueError(f"Unsupported model type: {model_type}.")