xinference 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_compat.py +22 -2
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +91 -6
- xinference/client/restful/restful_client.py +39 -0
- xinference/core/model.py +41 -13
- xinference/deploy/cmdline.py +3 -1
- xinference/deploy/test/test_cmdline.py +56 -0
- xinference/isolation.py +24 -0
- xinference/model/audio/__init__.py +12 -0
- xinference/model/audio/core.py +26 -4
- xinference/model/audio/f5tts.py +195 -0
- xinference/model/audio/fish_speech.py +71 -35
- xinference/model/audio/model_spec.json +88 -0
- xinference/model/audio/model_spec_modelscope.json +9 -0
- xinference/model/audio/whisper_mlx.py +208 -0
- xinference/model/embedding/core.py +322 -6
- xinference/model/embedding/model_spec.json +8 -1
- xinference/model/embedding/model_spec_modelscope.json +9 -1
- xinference/model/llm/__init__.py +4 -2
- xinference/model/llm/llm_family.json +479 -53
- xinference/model/llm/llm_family_modelscope.json +423 -17
- xinference/model/llm/mlx/core.py +230 -50
- xinference/model/llm/sglang/core.py +2 -0
- xinference/model/llm/transformers/chatglm.py +9 -5
- xinference/model/llm/transformers/core.py +1 -0
- xinference/model/llm/transformers/glm_edge_v.py +230 -0
- xinference/model/llm/transformers/utils.py +16 -8
- xinference/model/llm/utils.py +23 -1
- xinference/model/llm/vllm/core.py +89 -2
- xinference/thirdparty/f5_tts/__init__.py +0 -0
- xinference/thirdparty/f5_tts/api.py +166 -0
- xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
- xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
- xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
- xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
- xinference/thirdparty/f5_tts/eval/README.md +49 -0
- xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
- xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
- xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
- xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
- xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
- xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
- xinference/thirdparty/f5_tts/infer/README.md +191 -0
- xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
- xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
- xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
- xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
- xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
- xinference/thirdparty/f5_tts/model/__init__.py +10 -0
- xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
- xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
- xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
- xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
- xinference/thirdparty/f5_tts/model/cfm.py +285 -0
- xinference/thirdparty/f5_tts/model/dataset.py +319 -0
- xinference/thirdparty/f5_tts/model/modules.py +658 -0
- xinference/thirdparty/f5_tts/model/trainer.py +366 -0
- xinference/thirdparty/f5_tts/model/utils.py +185 -0
- xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
- xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
- xinference/thirdparty/f5_tts/socket_server.py +159 -0
- xinference/thirdparty/f5_tts/train/README.md +77 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
- xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
- xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
- xinference/thirdparty/f5_tts/train/train.py +75 -0
- xinference/types.py +2 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.2f269bb3.js → main.4eb4ee80.js} +3 -3
- xinference/web/ui/build/static/js/main.4eb4ee80.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8c5eeb02f772d02cbe8b89c05428d0dd41a97866f75f7dc1c2164a67f5a1cf98.json +1 -0
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/METADATA +39 -18
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/RECORD +92 -39
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/WHEEL +1 -1
- xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
- /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.4eb4ee80.js.LICENSE.txt} +0 -0
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/LICENSE +0 -0
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/top_level.txt +0 -0
xinference/_compat.py
CHANGED

@@ -60,6 +60,10 @@ from openai.types.chat.chat_completion_stream_options_param import (
     ChatCompletionStreamOptionsParam,
 )
 from openai.types.chat.chat_completion_tool_param import ChatCompletionToolParam
+from openai.types.shared_params.response_format_json_object import (
+    ResponseFormatJSONObject,
+)
+from openai.types.shared_params.response_format_text import ResponseFormatText
 
 OpenAIChatCompletionStreamOptionsParam = create_model_from_typeddict(
     ChatCompletionStreamOptionsParam
@@ -70,6 +74,23 @@ OpenAIChatCompletionNamedToolChoiceParam = create_model_from_typeddict(
 )
 
 
+class JSONSchema(BaseModel):
+    name: str
+    description: Optional[str] = None
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    strict: Optional[bool] = None
+
+
+class ResponseFormatJSONSchema(BaseModel):
+    json_schema: JSONSchema
+    type: Literal["json_schema"]
+
+
+ResponseFormat = Union[
+    ResponseFormatText, ResponseFormatJSONObject, ResponseFormatJSONSchema
+]
+
+
 class CreateChatCompletionOpenAI(BaseModel):
     """
     Comes from source code: https://github.com/openai/openai-python/blob/main/src/openai/types/chat/completion_create_params.py
@@ -84,8 +105,7 @@ class CreateChatCompletionOpenAI(BaseModel):
     n: Optional[int]
     parallel_tool_calls: Optional[bool]
     presence_penalty: Optional[float]
-
-    # response_format: ResponseFormat
+    response_format: Optional[ResponseFormat]
     seed: Optional[int]
     service_tier: Optional[Literal["auto", "default"]]
     stop: Union[Optional[str], List[str]]
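The hunk above wires `response_format` into CreateChatCompletionOpenAI, so `json_object` and `json_schema` response formats are validated instead of being dropped from chat requests. A minimal sketch of a request that exercises it, assuming a local Xinference server on the default port and a launched chat model named "my-chat-model" (both placeholders; whether the format is actually enforced depends on the backend engine):

    # Sketch only; the server URL and model UID are illustrative.
    import openai

    client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-used")
    completion = client.chat.completions.create(
        model="my-chat-model",
        messages=[{"role": "user", "content": "Return a JSON object with a 'city' key."}],
        response_format={"type": "json_object"},
    )
    print(completion.choices[0].message.content)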
xinference/_version.py
CHANGED

@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-
+ "date": "2024-12-13T18:21:03+0800",
 "dirty": false,
 "error": null,
- "full-revisionid": "
- "version": "1.
+ "full-revisionid": "b132fca91f3e1b11b111f9b89f68a55e4b7605c6",
+ "version": "1.1.0"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py
CHANGED

@@ -94,9 +94,9 @@ class CreateCompletionRequest(CreateCompletion):
 
 class CreateEmbeddingRequest(BaseModel):
     model: str
-    input: Union[
-
-    )
+    input: Union[
+        str, List[str], List[int], List[List[int]], Dict[str, str], List[Dict[str, str]]
+    ] = Field(description="The input to embed.")
     user: Optional[str] = None
 
     class Config:
@@ -489,6 +489,16 @@ class RESTfulAPI(CancelMixin):
                 else None
             ),
         )
+        self._router.add_api_route(
+            "/v1/convert_ids_to_tokens",
+            self.convert_ids_to_tokens,
+            methods=["POST"],
+            dependencies=(
+                [Security(self._auth_service, scopes=["models:read"])]
+                if self.is_authenticated()
+                else None
+            ),
+        )
         self._router.add_api_route(
             "/v1/rerank",
             self.rerank,
@@ -1219,6 +1229,9 @@ class RESTfulAPI(CancelMixin):
         raw_kwargs = {k: v for k, v in raw_body.items() if k not in exclude}
         kwargs = body.dict(exclude_unset=True, exclude=exclude)
 
+        # guided_decoding params
+        kwargs.update(self.extract_guided_params(raw_body=raw_body))
+
         # TODO: Decide if this default value override is necessary #1061
         if body.max_tokens is None:
             kwargs["max_tokens"] = max_tokens_field.default
@@ -1264,6 +1277,8 @@
                     # https://github.com/openai/openai-python/blob/e0aafc6c1a45334ac889fe3e54957d309c3af93f/src/openai/_streaming.py#L107
                     yield dict(data=json.dumps({"error": str(ex)}))
                     return
+                finally:
+                    await model.decrease_serve_count()
 
             return EventSourceResponse(stream_results())
         else:
@@ -1312,6 +1327,41 @@
             await self._report_error_event(model_uid, str(e))
             raise HTTPException(status_code=500, detail=str(e))
 
+    async def convert_ids_to_tokens(self, request: Request) -> Response:
+        payload = await request.json()
+        body = CreateEmbeddingRequest.parse_obj(payload)
+        model_uid = body.model
+        exclude = {
+            "model",
+            "input",
+            "user",
+        }
+        kwargs = {key: value for key, value in payload.items() if key not in exclude}
+
+        try:
+            model = await (await self._get_supervisor_ref()).get_model(model_uid)
+        except ValueError as ve:
+            logger.error(str(ve), exc_info=True)
+            await self._report_error_event(model_uid, str(ve))
+            raise HTTPException(status_code=400, detail=str(ve))
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            await self._report_error_event(model_uid, str(e))
+            raise HTTPException(status_code=500, detail=str(e))
+
+        try:
+            decoded_texts = await model.convert_ids_to_tokens(body.input, **kwargs)
+            return Response(decoded_texts, media_type="application/json")
+        except RuntimeError as re:
+            logger.error(re, exc_info=True)
+            await self._report_error_event(model_uid, str(re))
+            self.handle_request_limit_error(re)
+            raise HTTPException(status_code=400, detail=str(re))
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            await self._report_error_event(model_uid, str(e))
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def rerank(self, request: Request) -> Response:
         payload = await request.json()
         body = RerankRequest.parse_obj(payload)
@@ -1495,8 +1545,16 @@
             **parsed_kwargs,
         )
         if body.stream:
+
+            async def stream_results():
+                try:
+                    async for item in out:
+                        yield item
+                finally:
+                    await model.decrease_serve_count()
+
             return EventSourceResponse(
-                media_type="application/octet-stream", content=
+                media_type="application/octet-stream", content=stream_results()
             )
         else:
             return Response(media_type="application/octet-stream", content=out)
@@ -1916,9 +1974,13 @@
             "logit_bias_type",
             "user",
         }
+
         raw_kwargs = {k: v for k, v in raw_body.items() if k not in exclude}
         kwargs = body.dict(exclude_unset=True, exclude=exclude)
 
+        # guided_decoding params
+        kwargs.update(self.extract_guided_params(raw_body=raw_body))
+
         # TODO: Decide if this default value override is necessary #1061
         if body.max_tokens is None:
             kwargs["max_tokens"] = max_tokens_field.default
@@ -1982,7 +2044,6 @@
         )
         if body.tools and body.stream:
             is_vllm = await model.is_vllm_backend()
-
            if not (
                 (is_vllm and model_family in QWEN_TOOL_CALL_FAMILY)
                 or (not is_vllm and model_family in GLM4_TOOL_CALL_FAMILY)
@@ -1992,7 +2053,8 @@
                 detail="Streaming support for tool calls is available only when using "
                 "Qwen models with vLLM backend or GLM4-chat models without vLLM backend.",
             )
-
+        if "skip_special_tokens" in raw_kwargs and await model.is_vllm_backend():
+            kwargs["skip_special_tokens"] = raw_kwargs["skip_special_tokens"]
         if body.stream:
 
             async def stream_results():
@@ -2027,6 +2089,8 @@
                     # https://github.com/openai/openai-python/blob/e0aafc6c1a45334ac889fe3e54957d309c3af93f/src/openai/_streaming.py#L107
                     yield dict(data=json.dumps({"error": str(ex)}))
                     return
+                finally:
+                    await model.decrease_serve_count()
 
             return EventSourceResponse(stream_results())
         else:
@@ -2279,6 +2343,27 @@
             logger.error(e, exc_info=True)
             raise HTTPException(status_code=500, detail=str(e))
 
+    @staticmethod
+    def extract_guided_params(raw_body: dict) -> dict:
+        kwargs = {}
+        if raw_body.get("guided_json") is not None:
+            kwargs["guided_json"] = raw_body.get("guided_json")
+        if raw_body.get("guided_regex") is not None:
+            kwargs["guided_regex"] = raw_body.get("guided_regex")
+        if raw_body.get("guided_choice") is not None:
+            kwargs["guided_choice"] = raw_body.get("guided_choice")
+        if raw_body.get("guided_grammar") is not None:
+            kwargs["guided_grammar"] = raw_body.get("guided_grammar")
+        if raw_body.get("guided_json_object") is not None:
+            kwargs["guided_json_object"] = raw_body.get("guided_json_object")
+        if raw_body.get("guided_decoding_backend") is not None:
+            kwargs["guided_decoding_backend"] = raw_body.get("guided_decoding_backend")
+        if raw_body.get("guided_whitespace_pattern") is not None:
+            kwargs["guided_whitespace_pattern"] = raw_body.get(
+                "guided_whitespace_pattern"
+            )
+        return kwargs
+
 
 def run(
     supervisor_address: str,
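Two request-handling changes in this file are worth calling out: a POST /v1/convert_ids_to_tokens route is registered, and chat/completion requests now pass vLLM-style guided decoding keys straight through (extract_guided_params reads them from the raw JSON body). A rough sketch of sending a guided_choice constraint as an extra top-level field, assuming a local server and a vLLM-backed chat model named "my-chat-model" (placeholders; guided decoding only takes effect on engines that support it):

    # Sketch only; host, port and model UID are illustrative.
    import requests

    resp = requests.post(
        "http://127.0.0.1:9997/v1/chat/completions",
        json={
            "model": "my-chat-model",
            "messages": [{"role": "user", "content": "Is water wet? Answer yes or no."}],
            "guided_choice": ["yes", "no"],
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])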
xinference/client/restful/restful_client.py
CHANGED

@@ -126,6 +126,43 @@ class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
         response_data = response.json()
         return response_data
 
+    def convert_ids_to_tokens(
+        self, input: Union[List, List[List]], **kwargs
+    ) -> List[str]:
+        """
+        Convert token IDs to human readable tokens via RESTful APIs.
+
+        Parameters
+        ----------
+        input: Union[List, List[List]]
+            Input token IDs to convert, can be a single list of token IDs or a list of token ID lists.
+            To convert multiple sequences in a single request, pass a list of token ID lists.
+
+        Returns
+        -------
+        list
+            A list of decoded tokens in human readable format.
+
+        Raises
+        ------
+        RuntimeError
+            Report the failure of token conversion and provide the error message.
+
+        """
+        url = f"{self._base_url}/v1/convert_ids_to_tokens"
+        request_body = {
+            "model": self._model_uid,
+            "input": input,
+        }
+        request_body.update(kwargs)
+        response = requests.post(url, json=request_body, headers=self.auth_headers)
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to decode token ids, detail: {_get_error_string(response)}"
+            )
+        response_data = response.json()
+        return response_data
+
 
 class RESTfulRerankModelHandle(RESTfulModelHandle):
     def rerank(
@@ -704,6 +741,8 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             The speed of the generated audio.
         stream: bool
             Use stream or not.
+        prompt_speech: bytes
+            The audio bytes to be provided to the model.
 
         Returns
         -------
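The client-side method above is the counterpart of the new /v1/convert_ids_to_tokens route. A short usage sketch, assuming a running server and an already-launched embedding model whose UID is "my-embedding-model" (placeholders; the token IDs are arbitrary examples):

    # Sketch only; server address and model UID are illustrative.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    model = client.get_model("my-embedding-model")
    tokens = model.convert_ids_to_tokens([101, 7592, 102])
    print(tokens)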
xinference/core/model.py
CHANGED

@@ -78,6 +78,7 @@ XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = [
 ]
 
 XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS = ["FLUX.1-dev", "FLUX.1-schnell"]
+XINFERENCE_BATCHING_BLACK_LIST = ["glm4-chat"]
 
 
 def request_limit(fn):
@@ -91,21 +92,26 @@ def request_limit(fn):
         logger.debug(
             f"Request {fn.__name__}, current serve request count: {self._serve_count}, request limit: {self._request_limits} for the model {self.model_uid()}"
         )
-        if self.
-
-
-
-
-
-
+        if 1 + self._serve_count <= self._request_limits:
+            self._serve_count += 1
+        else:
+            raise RuntimeError(
+                f"Rate limit reached for the model. Request limit {self._request_limits} for the model: {self.model_uid()}"
+            )
+        ret = None
         try:
             ret = await fn(self, *args, **kwargs)
         finally:
-            if
+            if ret is not None and (
+                inspect.isasyncgen(ret) or inspect.isgenerator(ret)
+            ):
+                # stream case, let client call model_ref to decrease self._serve_count
+                pass
+            else:
                 self._serve_count -= 1
-
-
-
+            logger.debug(
+                f"After request {fn.__name__}, current serve request count: {self._serve_count} for the model {self.model_uid()}"
+            )
         return ret
 
     return wrapped_func
@@ -215,7 +221,9 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         self._model_description = (
             model_description.to_dict() if model_description else {}
         )
-        self._request_limits =
+        self._request_limits = (
+            float("inf") if request_limits is None else request_limits
+        )
         self._pending_requests: asyncio.Queue = asyncio.Queue()
         self._handle_pending_requests_task = None
         self._lock = (
@@ -268,6 +276,9 @@
     def __repr__(self) -> str:
         return f"ModelActor({self._replica_model_uid})"
 
+    def decrease_serve_count(self):
+        self._serve_count -= 1
+
     async def _record_completion_metrics(
         self, duration, completion_tokens, prompt_tokens
     ):
@@ -362,7 +373,11 @@
                 f"Your model {self._model.model_family.model_name} with model family {self._model.model_family.model_family} is disqualified."
             )
             return False
-        return
+        return (
+            condition
+            and self._model.model_family.model_name
+            not in XINFERENCE_BATCHING_BLACK_LIST
+        )
 
     def allow_batching_for_text_to_image(self) -> bool:
         from ..model.image.stable_diffusion.core import DiffusionModel
@@ -794,6 +809,19 @@
                 f"Model {self._model.model_spec} is not for creating embedding."
             )
 
+    @request_limit
+    @log_async(logger=logger)
+    async def convert_ids_to_tokens(
+        self, input: Union[List, List[List]], *args, **kwargs
+    ):
+        kwargs.pop("request_id", None)
+        if hasattr(self._model, "convert_ids_to_tokens"):
+            return await self._call_wrapper_json(
+                self._model.convert_ids_to_tokens, input, *args, **kwargs
+            )
+
+        raise AttributeError(f"Model {self._model.model_spec} can convert token id.")
+
     @request_limit
     @log_async(logger=logger)
     async def rerank(
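The reworked request_limit decorator and the new decrease_serve_count method split the accounting for streaming responses: generator results keep their slot until the REST layer's finally block releases it. A condensed, self-contained sketch of that hand-off (not the actual ModelActor code; all names here are illustrative):

    # Sketch of the serve-count pattern: non-streaming calls release the slot
    # immediately, streaming calls leave it to the consumer.
    import inspect


    class ServeCounter:
        def __init__(self, limit=float("inf")):
            self._limit = limit
            self._serve_count = 0

        def decrease_serve_count(self):
            self._serve_count -= 1

        async def guarded(self, fn, *args, **kwargs):
            if 1 + self._serve_count > self._limit:
                raise RuntimeError("Rate limit reached")
            self._serve_count += 1
            ret = None
            try:
                ret = await fn(*args, **kwargs)
            finally:
                if ret is not None and (inspect.isasyncgen(ret) or inspect.isgenerator(ret)):
                    pass  # streaming: caller calls decrease_serve_count() when drained
                else:
                    self._serve_count -= 1
            return ret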
xinference/deploy/cmdline.py
CHANGED

@@ -846,7 +846,9 @@ def model_launch(
     kwargs = {}
     for i in range(0, len(ctx.args), 2):
         if not ctx.args[i].startswith("--"):
-            raise ValueError(
+            raise ValueError(
+                f"You must specify extra kwargs with `--` prefix. There is an error in parameter passing that is {ctx.args[i]}."
+            )
         kwargs[ctx.args[i][2:]] = handle_click_args_type(ctx.args[i + 1])
     print(f"Launch model name: {model_name} with kwargs: {kwargs}", file=sys.stderr)
 
xinference/deploy/test/test_cmdline.py
CHANGED

@@ -23,6 +23,7 @@ from ..cmdline import (
     list_model_registrations,
     model_chat,
     model_generate,
+    model_launch,
     model_list,
     model_terminate,
     register_model,
@@ -311,3 +312,58 @@ def test_remove_cache(setup)
 
     assert result.exit_code == 0
     assert "Cache directory qwen1.5-chat has been deleted."
+
+
+def test_launch_error_in_passing_parameters():
+    runner = CliRunner()
+
+    # Known parameter but not provided with value.
+    result = runner.invoke(
+        model_launch,
+        [
+            "--model-engine",
+            "transformers",
+            "--model-name",
+            "qwen2.5-instruct",
+            "--model-uid",
+            "-s",
+            "0.5",
+            "-f",
+            "gptq",
+            "-q",
+            "INT4",
+            "111",
+            "-l",
+        ],
+    )
+    assert result.exit_code == 1
+    assert (
+        "You must specify extra kwargs with `--` prefix. There is an error in parameter passing that is 0.5."
+        in str(result)
+    )
+
+    # Unknown parameter
+    result = runner.invoke(
+        model_launch,
+        [
+            "--model-engine",
+            "transformers",
+            "--model-name",
+            "qwen2.5-instruct",
+            "--model-uid",
+            "123",
+            "-s",
+            "0.5",
+            "-f",
+            "gptq",
+            "-q",
+            "INT4",
+            "-l",
+            "111",
+        ],
+    )
+    assert result.exit_code == 1
+    assert (
+        "You must specify extra kwargs with `--` prefix. There is an error in parameter passing that is -l."
+        in str(result)
+    )
xinference/isolation.py
CHANGED

@@ -37,6 +37,30 @@ class Isolation:
         asyncio.set_event_loop(self._loop)
         self._stopped = asyncio.Event()
         self._loop.run_until_complete(self._stopped.wait())
+        self._cancel_all_tasks(self._loop)
+
+    @staticmethod
+    def _cancel_all_tasks(loop):
+        to_cancel = asyncio.all_tasks(loop)
+        if not to_cancel:
+            return
+
+        for task in to_cancel:
+            task.cancel()
+
+        loop.run_until_complete(asyncio.gather(*to_cancel, return_exceptions=True))
+
+        for task in to_cancel:
+            if task.cancelled():
+                continue
+            if task.exception() is not None:
+                loop.call_exception_handler(
+                    {
+                        "message": "unhandled exception during asyncio.run() shutdown",
+                        "exception": task.exception(),
+                        "task": task,
+                    }
+                )
 
     def start(self):
         if self._threaded:
xinference/model/audio/__init__.py
CHANGED

@@ -15,6 +15,8 @@
 import codecs
 import json
 import os
+import platform
+import sys
 import warnings
 from typing import Any, Dict
 
@@ -55,6 +57,14 @@ def register_custom_model():
             warnings.warn(f"{user_defined_audio_dir}/{f} has error, {e}")
 
 
+def _need_filter(spec: dict):
+    if (sys.platform != "darwin" or platform.processor() != "arm") and spec.get(
+        "engine", ""
+    ).upper() == "MLX":
+        return True
+    return False
+
+
 def _install():
     _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
     _model_spec_modelscope_json = os.path.join(
@@ -64,6 +74,7 @@ def _install():
         dict(
             (spec["model_name"], AudioModelFamilyV1(**spec))
             for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
+            if not _need_filter(spec)
         )
     )
     for model_name, model_spec in BUILTIN_AUDIO_MODELS.items():
@@ -75,6 +86,7 @@ def _install():
             for spec in json.load(
                 codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
             )
+            if not _need_filter(spec)
         )
     )
     for model_name, model_spec in MODELSCOPE_AUDIO_MODELS.items():
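The _need_filter helper above hides MLX-engine audio specs unless the process runs on Apple Silicon macOS. A tiny standalone check of the same condition (illustrative only; the function name is not part of the package):

    # Mirrors the gate used by _need_filter: darwin platform plus arm processor.
    import platform
    import sys

    def is_apple_silicon() -> bool:
        return sys.platform == "darwin" and platform.processor() == "arm"

    print(is_apple_silicon())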
xinference/model/audio/core.py
CHANGED

@@ -21,9 +21,11 @@ from ..core import CacheableModelSpec, ModelDescription
 from ..utils import valid_model_revision
 from .chattts import ChatTTSModel
 from .cosyvoice import CosyVoiceModel
+from .f5tts import F5TTSModel
 from .fish_speech import FishSpeechModel
 from .funasr import FunASRModel
 from .whisper import WhisperModel
+from .whisper_mlx import WhisperMLXModel
 
 logger = logging.getLogger(__name__)
 
@@ -43,11 +45,12 @@ class AudioModelFamilyV1(CacheableModelSpec):
     model_family: str
     model_name: str
     model_id: str
-    model_revision: str
+    model_revision: Optional[str]
     multilingual: bool
     model_ability: Optional[str]
     default_model_config: Optional[Dict[str, Any]]
     default_transcription_config: Optional[Dict[str, Any]]
+    engine: Optional[str]
 
 
 class AudioModelDescription(ModelDescription):
@@ -160,17 +163,34 @@ def create_audio_model_instance(
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[
-    Union[
+    Union[
+        WhisperModel,
+        WhisperMLXModel,
+        FunASRModel,
+        ChatTTSModel,
+        CosyVoiceModel,
+        FishSpeechModel,
+        F5TTSModel,
+    ],
     AudioModelDescription,
 ]:
     model_spec = match_audio(model_name, download_hub)
     if model_path is None:
         model_path = cache(model_spec)
     model: Union[
-        WhisperModel,
+        WhisperModel,
+        WhisperMLXModel,
+        FunASRModel,
+        ChatTTSModel,
+        CosyVoiceModel,
+        FishSpeechModel,
+        F5TTSModel,
     ]
     if model_spec.model_family == "whisper":
-
+        if not model_spec.engine:
+            model = WhisperModel(model_uid, model_path, model_spec, **kwargs)
+        else:
+            model = WhisperMLXModel(model_uid, model_path, model_spec, **kwargs)
     elif model_spec.model_family == "funasr":
         model = FunASRModel(model_uid, model_path, model_spec, **kwargs)
     elif model_spec.model_family == "ChatTTS":
@@ -179,6 +199,8 @@ def create_audio_model_instance(
         model = CosyVoiceModel(model_uid, model_path, model_spec, **kwargs)
     elif model_spec.model_family == "FishAudio":
         model = FishSpeechModel(model_uid, model_path, model_spec, **kwargs)
+    elif model_spec.model_family == "F5-TTS":
+        model = F5TTSModel(model_uid, model_path, model_spec, **kwargs)
     else:
         raise Exception(f"Unsupported audio model family: {model_spec.model_family}")
     model_description = AudioModelDescription(
|