xinference 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (80)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +4 -7
  3. xinference/client/handlers.py +3 -0
  4. xinference/client/restful/restful_client.py +9 -1
  5. xinference/core/model.py +19 -0
  6. xinference/core/resource.py +7 -1
  7. xinference/core/scheduler.py +4 -7
  8. xinference/core/status_guard.py +1 -0
  9. xinference/core/supervisor.py +228 -19
  10. xinference/core/utils.py +1 -29
  11. xinference/core/worker.py +28 -2
  12. xinference/deploy/cmdline.py +33 -3
  13. xinference/deploy/local.py +2 -1
  14. xinference/deploy/test/test_cmdline.py +32 -0
  15. xinference/device_utils.py +43 -1
  16. xinference/model/audio/core.py +5 -0
  17. xinference/model/audio/kokoro.py +122 -0
  18. xinference/model/audio/model_spec.json +8 -0
  19. xinference/model/audio/model_spec_modelscope.json +9 -0
  20. xinference/model/image/stable_diffusion/core.py +15 -6
  21. xinference/model/llm/llama_cpp/core.py +21 -14
  22. xinference/model/llm/llm_family.json +866 -46
  23. xinference/model/llm/llm_family.py +7 -2
  24. xinference/model/llm/llm_family_modelscope.json +873 -16
  25. xinference/model/llm/mlx/core.py +11 -3
  26. xinference/model/llm/reasoning_parsers/__init__.py +13 -0
  27. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +98 -0
  28. xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +140 -0
  29. xinference/model/llm/sglang/core.py +99 -11
  30. xinference/model/llm/transformers/core.py +9 -1
  31. xinference/model/llm/transformers/intern_vl.py +23 -14
  32. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  33. xinference/model/llm/transformers/qwen2_vl.py +20 -3
  34. xinference/model/llm/transformers/utils.py +22 -11
  35. xinference/model/llm/utils.py +164 -20
  36. xinference/model/llm/vllm/core.py +36 -4
  37. xinference/model/llm/vllm/xavier/executor.py +2 -2
  38. xinference/model/llm/vllm/xavier/scheduler.py +3 -3
  39. xinference/thirdparty/internvl/conversation.py +26 -17
  40. xinference/types.py +2 -0
  41. xinference/web/ui/build/asset-manifest.json +6 -6
  42. xinference/web/ui/build/index.html +1 -1
  43. xinference/web/ui/build/static/css/main.f8177338.css +2 -0
  44. xinference/web/ui/build/static/css/main.f8177338.css.map +1 -0
  45. xinference/web/ui/build/static/js/main.ad42919c.js +3 -0
  46. xinference/web/ui/build/static/js/main.ad42919c.js.map +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/0acb065326560592b10888234242f94f67efe28458b90f273d4d4fba9daa0cd2.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/6cb9f6c62ab4042f0b11c5d75e51187188e9d6f5f08b1d63e796e051bafdb457.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +1 -0
  59. xinference/web/ui/src/locales/en.json +14 -1
  60. xinference/web/ui/src/locales/zh.json +14 -1
  61. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/METADATA +18 -17
  62. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/RECORD +67 -60
  63. xinference/web/ui/build/static/css/main.51a587ff.css +0 -2
  64. xinference/web/ui/build/static/css/main.51a587ff.css.map +0 -1
  65. xinference/web/ui/build/static/js/main.b0936c54.js +0 -3
  66. xinference/web/ui/build/static/js/main.b0936c54.js.map +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +0 -1
  72. xinference/web/ui/node_modules/.cache/babel-loader/a7f1a71f6580dfe810c685a9c1d68e318f71e1fa258fbe50b87a6ac37cc0a598.json +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +0 -1
  74. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +0 -1
  76. /xinference/web/ui/build/static/js/{main.b0936c54.js.LICENSE.txt → main.ad42919c.js.LICENSE.txt} +0 -0
  77. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/LICENSE +0 -0
  78. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/WHEEL +0 -0
  79. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/entry_points.txt +0 -0
  80. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/top_level.txt +0 -0
File: xinference/deploy/cmdline.py

@@ -770,11 +770,17 @@ def remove_cache(
     type=int,
     help="The replica count of the model, default is 1.",
 )
+@click.option(
+    "--n-worker",
+    default=1,
+    type=int,
+    help="The number of workers used by the model, default is 1.",
+)
 @click.option(
     "--n-gpu",
     default="auto",
     type=str,
-    help='The number of GPUs used by the model, default is "auto".',
+    help='The number of GPUs used by the model, if n_worker>1, means number of GPUs per worker, default is "auto".',
 )
 @click.option(
     "--lora-modules",
@@ -815,6 +821,12 @@ def remove_cache(
     type=bool,
     help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
 )
+@click.option(
+    "--reasoning-content",
+    default=False,
+    type=bool,
+    help="Whether or not to enable reasoning content in model responses.",
+)
 @click.option(
     "--api-key",
     "-ak",
@@ -822,6 +834,7 @@ def remove_cache(
     type=str,
     help="Api-Key for access xinference api with authorization.",
 )
+@click.option("--model-path", "-mp", default=None, type=str, help="Model path to run.")
 @click.pass_context
 def model_launch(
     ctx,
@@ -834,6 +847,7 @@ def model_launch(
     model_format: str,
     quantization: str,
     replica: int,
+    n_worker: int,
     n_gpu: str,
     lora_modules: Optional[Tuple],
     image_lora_load_kwargs: Optional[Tuple],
@@ -841,15 +855,28 @@ def model_launch(
     worker_ip: Optional[str],
     gpu_idx: Optional[str],
     trust_remote_code: bool,
+    reasoning_content: bool,
     api_key: Optional[str],
+    model_path: Optional[str],
 ):
     kwargs = {}
     for i in range(0, len(ctx.args), 2):
         if not ctx.args[i].startswith("--"):
             raise ValueError(
-                f"You must specify extra kwargs with `--` prefix. There is an error in parameter passing that is {ctx.args[i]}."
+                f"You must specify extra kwargs with `--` prefix. "
+                f"There is an error in parameter passing that is {ctx.args[i]}."
             )
-        kwargs[ctx.args[i][2:]] = handle_click_args_type(ctx.args[i + 1])
+        param_name = ctx.args[i][2:]
+        param_value = handle_click_args_type(ctx.args[i + 1])
+        if param_name == "model_path":
+            # fix for --model_path which is the old fashion to set model_path,
+            # now model_path is a builtin option, try to make it compatible
+            if model_path is None:
+                model_path = param_value
+                continue
+            else:
+                raise ValueError("Cannot set both for --model-path and --model_path")
+        kwargs[param_name] = param_value
     print(f"Launch model name: {model_name} with kwargs: {kwargs}", file=sys.stderr)

     if model_type == "LLM" and model_engine is None:
@@ -914,11 +941,14 @@ def model_launch(
         model_format=model_format,
         quantization=quantization,
         replica=replica,
+        n_worker=n_worker,
         n_gpu=_n_gpu,
         peft_model_config=peft_model_config,
         worker_ip=worker_ip,
         gpu_idx=_gpu_idx,
         trust_remote_code=trust_remote_code,
+        model_path=model_path,
+        reasoning_content=reasoning_content,
         **kwargs,
     )

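The new --n-worker, --model-path and --reasoning-content options added above are forwarded to client.launch_model(). The snippet below is a minimal usage sketch of the same call from Python; it assumes the RESTful client accepts these keyword arguments exactly as the CLI passes them, and the model name, engine and format are illustrative only, not taken from this diff.

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")

# Model name/engine/format below are illustrative assumptions; the arguments
# marked "new in 1.3.0" mirror the kwargs passed in model_launch() above.
model_uid = client.launch_model(
    model_name="qwen2.5-instruct",
    model_engine="transformers",
    model_type="LLM",
    model_format="pytorch",
    quantization="none",
    replica=1,
    n_worker=1,                # new in 1.3.0: number of workers for the model
    n_gpu="auto",              # when n_worker > 1, this is GPUs per worker
    model_path=None,           # new in 1.3.0: launch from a local path instead of the cache
    reasoning_content=False,   # new in 1.3.0: include reasoning content in responses
)
print(model_uid)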
File: xinference/deploy/local.py

@@ -41,7 +41,8 @@ async def _start_local_cluster(
 ):
     from .utils import create_worker_actor_pool

-    logging.config.dictConfig(logging_conf)  # type: ignore
+    if logging_conf:
+        logging.config.dictConfig(logging_conf)  # type: ignore

     pool = None
     try:
File: xinference/deploy/test/test_cmdline.py

@@ -147,6 +147,38 @@ def test_cmdline(setup, stream, model_uid):
     assert model_uid not in result.stdout


+def test_cmdline_model_path_error(setup):
+    endpoint, _ = setup
+    runner = CliRunner(mix_stderr=False)
+
+    # launch model
+    result = runner.invoke(
+        model_launch,
+        [
+            "--endpoint",
+            endpoint,
+            "--model-name",
+            "tiny-llama",
+            "--size-in-billions",
+            1,
+            "--model-format",
+            "ggufv2",
+            "--quantization",
+            "Q2_K",
+            "--model-path",
+            "/path/to/model",
+            "--model_path",
+            "/path/to/model",
+        ],
+    )
+    assert result.exit_code > 0
+    with pytest.raises(
+        ValueError, match="Cannot set both for --model-path and --model_path"
+    ):
+        t, e, tb = result.exc_info
+        raise e.with_traceback(tb)
+
+
 def test_cmdline_of_custom_model(setup):
     endpoint, _ = setup
     runner = CliRunner()
File: xinference/device_utils.py

@@ -13,9 +13,9 @@
 # limitations under the License.

 import os
+from typing import Dict, Literal, Union

 import torch
-from typing_extensions import Literal, Union

 DeviceType = Literal["cuda", "mps", "xpu", "npu", "cpu"]
 DEVICE_TO_ENV_NAME = {
@@ -122,3 +122,45 @@ def gpu_count():
         return torch.npu.device_count()
     else:
         return 0
+
+
+def _get_nvidia_gpu_mem_info(gpu_id: int) -> Dict[str, float]:
+    from pynvml import (
+        nvmlDeviceGetHandleByIndex,
+        nvmlDeviceGetMemoryInfo,
+        nvmlDeviceGetName,
+        nvmlDeviceGetUtilizationRates,
+    )
+
+    handler = nvmlDeviceGetHandleByIndex(gpu_id)
+    gpu_name = nvmlDeviceGetName(handler)
+    mem_info = nvmlDeviceGetMemoryInfo(handler)
+    utilization = nvmlDeviceGetUtilizationRates(handler)
+    return {
+        "name": gpu_name,
+        "total": mem_info.total,
+        "used": mem_info.used,
+        "free": mem_info.free,
+        "util": utilization.gpu,
+    }
+
+
+def get_nvidia_gpu_info() -> Dict:
+    from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlShutdown
+
+    try:
+        nvmlInit()
+        device_count = nvmlDeviceGetCount()
+        res = {}
+        for i in range(device_count):
+            res[f"gpu-{i}"] = _get_nvidia_gpu_mem_info(i)
+        return res
+    except:
+        # TODO: add log here
+        # logger.debug(f"Cannot init nvml. Maybe due to lack of NVIDIA GPUs or incorrect installation of CUDA.")
+        return {}
+    finally:
+        try:
+            nvmlShutdown()
+        except:
+            pass
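The two helpers added above gather per-GPU name, memory and utilization through pynvml and degrade to an empty dict when NVML cannot be initialized. A minimal usage sketch, assuming pynvml (nvidia-ml-py) is installed and at least one NVIDIA GPU is visible:

from xinference.device_utils import get_nvidia_gpu_info

info = get_nvidia_gpu_info()  # {} when NVML is unavailable (no NVIDIA GPU, broken CUDA setup)
for gpu, stats in info.items():
    # "total", "used" and "free" are byte counts from nvmlDeviceGetMemoryInfo;
    # "util" is the GPU utilization percentage from nvmlDeviceGetUtilizationRates.
    print(gpu, stats["name"], f'{stats["used"]}/{stats["total"]} bytes used', f'{stats["util"]}% util')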
File: xinference/model/audio/core.py

@@ -25,6 +25,7 @@ from .f5tts import F5TTSModel
 from .f5tts_mlx import F5TTSMLXModel
 from .fish_speech import FishSpeechModel
 from .funasr import FunASRModel
+from .kokoro import KokoroModel
 from .melotts import MeloTTSModel
 from .whisper import WhisperModel
 from .whisper_mlx import WhisperMLXModel
@@ -176,6 +177,7 @@ def create_audio_model_instance(
         F5TTSModel,
         F5TTSMLXModel,
         MeloTTSModel,
+        KokoroModel,
     ],
     AudioModelDescription,
 ]:
@@ -192,6 +194,7 @@ def create_audio_model_instance(
         F5TTSModel,
         F5TTSMLXModel,
         MeloTTSModel,
+        KokoroModel,
     ]
     if model_spec.model_family == "whisper":
         if not model_spec.engine:
@@ -212,6 +215,8 @@ def create_audio_model_instance(
         model = F5TTSMLXModel(model_uid, model_path, model_spec, **kwargs)
     elif model_spec.model_family == "MeloTTS":
         model = MeloTTSModel(model_uid, model_path, model_spec, **kwargs)
+    elif model_spec.model_family == "Kokoro":
+        model = KokoroModel(model_uid, model_path, model_spec, **kwargs)
     else:
         raise Exception(f"Unsupported audio model family: {model_spec.model_family}")
     model_description = AudioModelDescription(
File: xinference/model/audio/kokoro.py (new file)

@@ -0,0 +1,122 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from io import BytesIO
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+
+from ...device_utils import get_available_device, is_device_available
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class KokoroModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV1",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._kwargs = kwargs
+
+    @property
+    def model_ability(self):
+        return self._model_spec.model_ability
+
+    def load(self):
+        if self._device is None:
+            self._device = get_available_device()
+        else:
+            if not is_device_available(self._device):
+                raise ValueError(f"Device {self._device} is not available!")
+
+        import os
+
+        from kokoro import KModel, KPipeline
+
+        config_path = os.path.join(self._model_path, "config.json")
+        model_path = os.path.join(self._model_path, "kokoro-v1_0.pth")
+        # LANG_CODES = dict(
+        #     # pip install misaki[en]
+        #     a='American English',
+        #     b='British English',
+        #
+        #     # espeak-ng
+        #     e='es',
+        #     f='fr-fr',
+        #     h='hi',
+        #     i='it',
+        #     p='pt-br',
+        #
+        #     # pip install misaki[ja]
+        #     j='Japanese',
+        #
+        #     # pip install misaki[zh]
+        #     z='Mandarin Chinese',
+        # )
+        lang_code = self._kwargs.get("lang_code", "a")
+        logger.info("Launching Kokoro model with language code: %s", lang_code)
+        self._model = KPipeline(
+            lang_code=lang_code,
+            model=KModel(config=config_path, model=model_path),
+            device=self._device,
+        )
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        import soundfile
+
+        if stream:
+            raise Exception("Kokoro does not support stream mode.")
+        assert self._model is not None
+        if not voice:
+            voice = "af_alloy"
+            logger.info("Auto select speaker: %s", voice)
+        elif voice.endswith(".pt"):
+            logger.info("Using custom voice pt: %s", voice)
+        else:
+            logger.info("Using voice: %s", voice)
+        logger.info("Speech kwargs: %s", kwargs)
+        generator = self._model(text=input, voice=voice, speed=speed, **kwargs)
+        results = list(generator)
+        audio = np.concatenate([r[2] for r in results])
+        # Save the generated audio
+        with BytesIO() as out:
+            with soundfile.SoundFile(
+                out,
+                "w",
+                24000,
+                1,
+                format=response_format.upper(),
+            ) as f:
+                f.write(audio)
+            return out.getvalue()
File: xinference/model/audio/model_spec.json

@@ -338,5 +338,13 @@
         "model_ability": "text-to-audio",
         "multilingual": false,
         "language": "KR"
+    },
+    {
+        "model_name": "Kokoro-82M",
+        "model_family": "Kokoro",
+        "model_id": "hexgrad/Kokoro-82M",
+        "model_revision": "7884269d6fd3f9beabc271b6f1308e5699281fa9",
+        "model_ability": "text-to-audio",
+        "multilingual": true
     }
 ]
File: xinference/model/audio/model_spec_modelscope.json

@@ -100,5 +100,14 @@
         "model_revision": "master",
         "model_ability": "text-to-audio",
         "multilingual": true
+    },
+    {
+        "model_name": "Kokoro-82M",
+        "model_family": "Kokoro",
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/Kokoro-82M",
+        "model_revision": "master",
+        "model_ability": "text-to-audio",
+        "multilingual": true
     }
 ]
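With the KokoroModel class, the dispatch added to audio/core.py and the two spec entries above, Kokoro-82M can be launched like any other audio model. The sketch below is hypothetical client usage: it assumes the audio handle exposes speech() with the same parameters as KokoroModel.speech() in this diff, and that the kokoro package and its misaki language extras are installed on the worker.

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")

# lang_code is forwarded to KPipeline via **kwargs; "a" = American English
model_uid = client.launch_model(
    model_name="Kokoro-82M",
    model_type="audio",
    lang_code="a",
)
model = client.get_model(model_uid)

audio_bytes = model.speech(
    input="Hello from Kokoro!",
    voice="af_alloy",        # an empty voice would auto-select this default
    response_format="wav",   # written by soundfile as 24 kHz mono
    speed=1.0,
)
with open("kokoro.wav", "wb") as f:
    f.write(audio_bytes)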
File: xinference/model/image/stable_diffusion/core.py

@@ -22,7 +22,6 @@ import logging
 import os
 import re
 import sys
-import warnings
 from glob import glob
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

@@ -412,12 +411,22 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         else:
             raise ValueError(f"Unknown sampler: {sampler_name}")

-    @staticmethod
+    def _need_set_scheduler(self, scheduler: Any) -> bool:
+        """Determine whether it is necessary to set up a scheduler"""
+        if self._model_spec is None:
+            return False
+        if scheduler is None:
+            return False
+        if "FLUX" in self._model_spec.model_name:
+            logger.warning("FLUX model, skipping scheduler setup")
+            return False
+        return True
+
     @contextlib.contextmanager
-    def _reset_when_done(model: Any, sampler_name: str):
-        assert model is not None
+    def _reset_when_done(self, model: Any, sampler_name: str):
         scheduler = DiffusionModel._get_scheduler(model, sampler_name)
-        if scheduler:
+        if self._need_set_scheduler(scheduler):
+            logger.debug("Use scheduler %s", scheduler)
             default_scheduler = model.scheduler
             model.scheduler = scheduler
             try:
@@ -517,7 +526,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         for key in list(kwargs):
             allow_key = model_accept_param(key, model)
             if not allow_key:
-                warnings.warn(f"{type(model)} cannot accept `{key}`, will ignore it")
+                logger.warning(f"{type(model)} cannot accept `{key}`, will ignore it")
                 kwargs.pop(key)

     def text_to_image(
File: xinference/model/llm/llama_cpp/core.py

@@ -28,7 +28,7 @@ from ....types import (
 )
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin
+from ..utils import DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelMixin

 logger = logging.getLogger(__name__)

@@ -123,18 +123,22 @@ class LlamaCppModel(LLM):

            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

-        # handle legacy cache.
-        model_path = os.path.realpath(
-            os.path.join(
-                self.model_path,
-                self.model_spec.model_file_name_template.format(
-                    quantization=self.quantization
-                ),
+        if os.path.isfile(self.model_path):
+            # mostly passed from --model_path
+            model_path = os.path.realpath(self.model_path)
+        else:
+            # handle legacy cache.
+            model_path = os.path.realpath(
+                os.path.join(
+                    self.model_path,
+                    self.model_spec.model_file_name_template.format(
+                        quantization=self.quantization
+                    ),
+                )
             )
-        )
-        legacy_model_file_path = os.path.join(self.model_path, "model.bin")
-        if os.path.exists(legacy_model_file_path):
-            model_path = legacy_model_file_path
+            legacy_model_file_path = os.path.join(self.model_path, "model.bin")
+            if os.path.exists(legacy_model_file_path):
+                model_path = legacy_model_file_path

         try:
             self._llm = Llama(
@@ -272,8 +276,11 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
-        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            full_context_kwargs["tools"] = tools
+        if tools:
+            if model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
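For models in DEEPSEEK_TOOL_CALL_FAMILY, tool definitions are now folded into the messages via _tools_to_messages_for_deepseek() rather than handed to the chat template, while Qwen-family models keep the old path. From the caller's side nothing changes: tools are still supplied through the OpenAI-compatible endpoint. A hypothetical sketch is below; the model uid and tool schema are illustrative, not taken from this diff.

import openai

client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

# "my-deepseek-gguf" stands in for a running llama.cpp chat model whose family
# is in DEEPSEEK_TOOL_CALL_FAMILY; the server-side branch above decides how the
# tool definitions reach the prompt.
response = client.chat.completions.create(
    model="my-deepseek-gguf",
    messages=[{"role": "user", "content": "What is the weather in Paris?"}],
    tools=tools,
)
print(response.choices[0].message)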