xinference 1.2.2__py3-none-any.whl → 1.3.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (68)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/restful_client.py +9 -1
  3. xinference/core/model.py +19 -0
  4. xinference/core/resource.py +7 -1
  5. xinference/core/status_guard.py +1 -0
  6. xinference/core/supervisor.py +228 -19
  7. xinference/core/utils.py +1 -29
  8. xinference/core/worker.py +28 -2
  9. xinference/deploy/cmdline.py +33 -3
  10. xinference/deploy/test/test_cmdline.py +32 -0
  11. xinference/device_utils.py +43 -1
  12. xinference/model/audio/kokoro.py +19 -36
  13. xinference/model/audio/model_spec.json +1 -1
  14. xinference/model/image/stable_diffusion/core.py +15 -6
  15. xinference/model/llm/llm_family.json +521 -6
  16. xinference/model/llm/llm_family.py +3 -1
  17. xinference/model/llm/llm_family_modelscope.json +559 -6
  18. xinference/model/llm/reasoning_parsers/__init__.py +13 -0
  19. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +98 -0
  20. xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +140 -0
  21. xinference/model/llm/sglang/core.py +99 -11
  22. xinference/model/llm/transformers/intern_vl.py +23 -14
  23. xinference/model/llm/utils.py +55 -18
  24. xinference/model/llm/vllm/core.py +23 -2
  25. xinference/model/llm/vllm/xavier/executor.py +2 -2
  26. xinference/model/llm/vllm/xavier/scheduler.py +3 -3
  27. xinference/thirdparty/internvl/conversation.py +26 -17
  28. xinference/types.py +2 -0
  29. xinference/web/ui/build/asset-manifest.json +6 -6
  30. xinference/web/ui/build/index.html +1 -1
  31. xinference/web/ui/build/static/css/main.f8177338.css +2 -0
  32. xinference/web/ui/build/static/css/main.f8177338.css.map +1 -0
  33. xinference/web/ui/build/static/js/main.ad42919c.js +3 -0
  34. xinference/web/ui/build/static/js/main.ad42919c.js.map +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/0acb065326560592b10888234242f94f67efe28458b90f273d4d4fba9daa0cd2.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/6cb9f6c62ab4042f0b11c5d75e51187188e9d6f5f08b1d63e796e051bafdb457.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +1 -0
  47. xinference/web/ui/src/locales/en.json +14 -1
  48. xinference/web/ui/src/locales/zh.json +14 -1
  49. {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/METADATA +11 -11
  50. {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/RECORD +55 -49
  51. xinference/web/ui/build/static/css/main.51a587ff.css +0 -2
  52. xinference/web/ui/build/static/css/main.51a587ff.css.map +0 -1
  53. xinference/web/ui/build/static/js/main.b0936c54.js +0 -3
  54. xinference/web/ui/build/static/js/main.b0936c54.js.map +0 -1
  55. xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +0 -1
  56. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +0 -1
  58. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/a7f1a71f6580dfe810c685a9c1d68e318f71e1fa258fbe50b87a6ac37cc0a598.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +0 -1
  64. /xinference/web/ui/build/static/js/{main.b0936c54.js.LICENSE.txt → main.ad42919c.js.LICENSE.txt} +0 -0
  65. {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/LICENSE +0 -0
  66. {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/WHEEL +0 -0
  67. {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/entry_points.txt +0 -0
  68. {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/top_level.txt +0 -0
xinference/deploy/cmdline.py

@@ -770,11 +770,17 @@ def remove_cache(
     type=int,
     help="The replica count of the model, default is 1.",
 )
+@click.option(
+    "--n-worker",
+    default=1,
+    type=int,
+    help="The number of workers used by the model, default is 1.",
+)
 @click.option(
     "--n-gpu",
     default="auto",
     type=str,
-    help='The number of GPUs used by the model, default is "auto".',
+    help='The number of GPUs used by the model, if n_worker>1, means number of GPUs per worker, default is "auto".',
 )
 @click.option(
     "--lora-modules",
@@ -815,6 +821,12 @@ def remove_cache(
     type=bool,
     help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
 )
+@click.option(
+    "--reasoning-content",
+    default=False,
+    type=bool,
+    help="Whether or not to enable reasoning content in model responses.",
+)
 @click.option(
     "--api-key",
     "-ak",
@@ -822,6 +834,7 @@ def remove_cache(
     type=str,
     help="Api-Key for access xinference api with authorization.",
 )
+@click.option("--model-path", "-mp", default=None, type=str, help="Model path to run.")
 @click.pass_context
 def model_launch(
     ctx,
@@ -834,6 +847,7 @@ def model_launch(
     model_format: str,
     quantization: str,
     replica: int,
+    n_worker: int,
     n_gpu: str,
     lora_modules: Optional[Tuple],
     image_lora_load_kwargs: Optional[Tuple],
@@ -841,15 +855,28 @@ def model_launch(
     worker_ip: Optional[str],
     gpu_idx: Optional[str],
     trust_remote_code: bool,
+    reasoning_content: bool,
     api_key: Optional[str],
+    model_path: Optional[str],
 ):
     kwargs = {}
     for i in range(0, len(ctx.args), 2):
         if not ctx.args[i].startswith("--"):
             raise ValueError(
-                f"You must specify extra kwargs with `--` prefix. There is an error in parameter passing that is {ctx.args[i]}."
+                f"You must specify extra kwargs with `--` prefix. "
+                f"There is an error in parameter passing that is {ctx.args[i]}."
             )
-        kwargs[ctx.args[i][2:]] = handle_click_args_type(ctx.args[i + 1])
+        param_name = ctx.args[i][2:]
+        param_value = handle_click_args_type(ctx.args[i + 1])
+        if param_name == "model_path":
+            # fix for --model_path which is the old fashion to set model_path,
+            # now model_path is a builtin option, try to make it compatible
+            if model_path is None:
+                model_path = param_value
+                continue
+            else:
+                raise ValueError("Cannot set both for --model-path and --model_path")
+        kwargs[param_name] = param_value
     print(f"Launch model name: {model_name} with kwargs: {kwargs}", file=sys.stderr)

     if model_type == "LLM" and model_engine is None:
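
The loop above walks ctx.args as --key value pairs, which is how Click hands through unrecognized extra options. A stripped-down illustration of the new dispatch, with handle_click_args_type's coercion omitted and placeholder values:

ctx_args = ["--temperature", "0.7", "--model_path", "/data/my-model"]
kwargs, model_path = {}, None
for i in range(0, len(ctx_args), 2):
    name, value = ctx_args[i][2:], ctx_args[i + 1]
    if name == "model_path":
        # The legacy --model_path spelling folds into the builtin option...
        if model_path is not None:
            # ...unless both spellings were given at once.
            raise ValueError("Cannot set both for --model-path and --model_path")
        model_path = value
        continue
    kwargs[name] = value
# kwargs == {"temperature": "0.7"}; model_path == "/data/my-model"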
@@ -914,11 +941,14 @@ def model_launch(
         model_format=model_format,
         quantization=quantization,
         replica=replica,
+        n_worker=n_worker,
         n_gpu=_n_gpu,
         peft_model_config=peft_model_config,
         worker_ip=worker_ip,
         gpu_idx=_gpu_idx,
         trust_remote_code=trust_remote_code,
+        model_path=model_path,
+        reasoning_content=reasoning_content,
         **kwargs,
     )

xinference/deploy/test/test_cmdline.py

@@ -147,6 +147,38 @@ def test_cmdline(setup, stream, model_uid):
     assert model_uid not in result.stdout


+def test_cmdline_model_path_error(setup):
+    endpoint, _ = setup
+    runner = CliRunner(mix_stderr=False)
+
+    # launch model
+    result = runner.invoke(
+        model_launch,
+        [
+            "--endpoint",
+            endpoint,
+            "--model-name",
+            "tiny-llama",
+            "--size-in-billions",
+            1,
+            "--model-format",
+            "ggufv2",
+            "--quantization",
+            "Q2_K",
+            "--model-path",
+            "/path/to/model",
+            "--model_path",
+            "/path/to/model",
+        ],
+    )
+    assert result.exit_code > 0
+    with pytest.raises(
+        ValueError, match="Cannot set both for --model-path and --model_path"
+    ):
+        t, e, tb = result.exc_info
+        raise e.with_traceback(tb)
+
+
 def test_cmdline_of_custom_model(setup):
     endpoint, _ = setup
     runner = CliRunner()
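
CliRunner captures exceptions instead of letting them propagate, so the new test re-raises result.exc_info inside pytest.raises to assert on the original ValueError rather than only on the non-zero exit code.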
xinference/device_utils.py

@@ -13,9 +13,9 @@
 # limitations under the License.

 import os
+from typing import Dict, Literal, Union

 import torch
-from typing_extensions import Literal, Union

 DeviceType = Literal["cuda", "mps", "xpu", "npu", "cpu"]
 DEVICE_TO_ENV_NAME = {
@@ -122,3 +122,45 @@ def gpu_count():
         return torch.npu.device_count()
     else:
         return 0
+
+
+def _get_nvidia_gpu_mem_info(gpu_id: int) -> Dict[str, float]:
+    from pynvml import (
+        nvmlDeviceGetHandleByIndex,
+        nvmlDeviceGetMemoryInfo,
+        nvmlDeviceGetName,
+        nvmlDeviceGetUtilizationRates,
+    )
+
+    handler = nvmlDeviceGetHandleByIndex(gpu_id)
+    gpu_name = nvmlDeviceGetName(handler)
+    mem_info = nvmlDeviceGetMemoryInfo(handler)
+    utilization = nvmlDeviceGetUtilizationRates(handler)
+    return {
+        "name": gpu_name,
+        "total": mem_info.total,
+        "used": mem_info.used,
+        "free": mem_info.free,
+        "util": utilization.gpu,
+    }
+
+
+def get_nvidia_gpu_info() -> Dict:
+    from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlShutdown
+
+    try:
+        nvmlInit()
+        device_count = nvmlDeviceGetCount()
+        res = {}
+        for i in range(device_count):
+            res[f"gpu-{i}"] = _get_nvidia_gpu_mem_info(i)
+        return res
+    except:
+        # TODO: add log here
+        # logger.debug(f"Cannot init nvml. Maybe due to lack of NVIDIA GPUs or incorrect installation of CUDA.")
+        return {}
+    finally:
+        try:
+            nvmlShutdown()
+        except:
+            pass
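
get_nvidia_gpu_info() keys devices as gpu-0, gpu-1, ..., with memory figures in bytes and utilization as a 0-100 percentage, as pynvml reports them; the bare except means hosts without an NVIDIA driver simply get {}. A hedged usage sketch:

from xinference.device_utils import get_nvidia_gpu_info

for gpu, stats in get_nvidia_gpu_info().items():
    # total/used/free are raw bytes from pynvml; util is a 0-100 busy percentage.
    print(f"{gpu}: {stats['name']} "
          f"{stats['used'] / 1024**3:.1f}/{stats['total'] / 1024**3:.1f} GiB "
          f"({stats['util']}% busy)")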
xinference/model/audio/kokoro.py

@@ -26,36 +26,6 @@ logger = logging.getLogger(__name__)


 class KokoroModel:
-    # The available voices, should keep sync with https://huggingface.co/hexgrad/Kokoro-82M/tree/main/voices
-    VOICES = [
-        "af_alloy",
-        "af_aoede",
-        "af_bella",
-        "af_jessica",
-        "af_kore",
-        "af_nicole",
-        "af_nova",
-        "af_river",
-        "af_sarah",
-        "af_sky",
-        "am_adam",
-        "am_echo",
-        "am_eric",
-        "am_fenrir",
-        "am_liam",
-        "am_michael",
-        "am_onyx",
-        "am_puck",
-        "bf_alice",
-        "bf_emma",
-        "bf_isabella",
-        "bf_lily",
-        "bm_daniel",
-        "bm_fable",
-        "bm_george",
-        "bm_lewis",
-    ]
-
     def __init__(
         self,
         model_uid: str,
@@ -89,10 +59,25 @@ class KokoroModel:
         config_path = os.path.join(self._model_path, "config.json")
         model_path = os.path.join(self._model_path, "kokoro-v1_0.pth")
         # LANG_CODES = dict(
+        #     # pip install misaki[en]
         #     a='American English',
         #     b='British English',
+        #
+        #     # espeak-ng
+        #     e='es',
+        #     f='fr-fr',
+        #     h='hi',
+        #     i='it',
+        #     p='pt-br',
+        #
+        #     # pip install misaki[ja]
+        #     j='Japanese',
+        #
+        #     # pip install misaki[zh]
+        #     z='Mandarin Chinese',
         # )
         lang_code = self._kwargs.get("lang_code", "a")
+        logger.info("Launching Kokoro model with language code: %s", lang_code)
         self._model = KPipeline(
             lang_code=lang_code,
             model=KModel(config=config_path, model=model_path),
@@ -114,14 +99,12 @@ class KokoroModel:
             raise Exception("Kokoro does not support stream mode.")
         assert self._model is not None
         if not voice:
-            voice = next(iter(self.VOICES))
+            voice = "af_alloy"
             logger.info("Auto select speaker: %s", voice)
-        elif not voice.endswith(".pt") and voice not in self.VOICES:
-            raise ValueError(
-                f"Invalid voice: {voice}, available speakers: {self.VOICES}"
-            )
-        else:
+        elif voice.endswith(".pt"):
             logger.info("Using custom voice pt: %s", voice)
+        else:
+            logger.info("Using voice: %s", voice)
         logger.info("Speech kwargs: %s", kwargs)
         generator = self._model(text=input, voice=voice, speed=speed, **kwargs)
         results = list(generator)
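
With the VOICES allow-list removed, voice ids are forwarded to KPipeline unvalidated, so an unknown name now fails inside Kokoro rather than up front, and anything ending in .pt is treated as a custom voice tensor. A hedged client-side sketch; the model uid and file path are placeholders:

model = client.get_model("Kokoro-82M")  # placeholder uid
# Built-in voice id: passed through as-is, no longer checked against a local list.
audio = model.speech(input="Hello there.", voice="af_bella")
# Custom voice tensor: names ending in ".pt" are used directly.
audio = model.speech(input="Hello there.", voice="/path/to/voice.pt")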
xinference/model/audio/model_spec.json

@@ -343,7 +343,7 @@
     "model_name": "Kokoro-82M",
     "model_family": "Kokoro",
     "model_id": "hexgrad/Kokoro-82M",
-    "model_revision": "7a29fcdf8e997bac6d6f5f6f0c2f0b92912f6102",
+    "model_revision": "7884269d6fd3f9beabc271b6f1308e5699281fa9",
     "model_ability": "text-to-audio",
     "multilingual": true
 }
xinference/model/image/stable_diffusion/core.py

@@ -22,7 +22,6 @@ import logging
 import os
 import re
 import sys
-import warnings
 from glob import glob
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

@@ -412,12 +411,22 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         else:
             raise ValueError(f"Unknown sampler: {sampler_name}")

-    @staticmethod
+    def _need_set_scheduler(self, scheduler: Any) -> bool:
+        """Determine whether it is necessary to set up a scheduler"""
+        if self._model_spec is None:
+            return False
+        if scheduler is None:
+            return False
+        if "FLUX" in self._model_spec.model_name:
+            logger.warning("FLUX model, skipping scheduler setup")
+            return False
+        return True
+
     @contextlib.contextmanager
-    def _reset_when_done(model: Any, sampler_name: str):
-        assert model is not None
+    def _reset_when_done(self, model: Any, sampler_name: str):
         scheduler = DiffusionModel._get_scheduler(model, sampler_name)
-        if scheduler:
+        if self._need_set_scheduler(scheduler):
+            logger.debug("Use scheduler %s", scheduler)
             default_scheduler = model.scheduler
             model.scheduler = scheduler
             try:
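
Making _reset_when_done an instance method lets it consult self._model_spec and skip the scheduler swap for FLUX models. The underlying pattern, temporarily replacing an attribute and guaranteeing restoration, is a standard contextmanager idiom; a self-contained sketch of the same shape (not xinference code):

import contextlib

@contextlib.contextmanager
def swap_attr(obj, name, value):
    """Temporarily replace obj.<name>, restoring it even if the body raises."""
    old = getattr(obj, name)
    setattr(obj, name, value)
    try:
        yield obj
    finally:
        setattr(obj, name, old)

# e.g. with swap_attr(pipeline, "scheduler", new_scheduler): pipeline(prompt)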
@@ -517,7 +526,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         for key in list(kwargs):
             allow_key = model_accept_param(key, model)
             if not allow_key:
-                warnings.warn(f"{type(model)} cannot accept `{key}`, will ignore it")
+                logger.warning(f"{type(model)} cannot accept `{key}`, will ignore it")
                 kwargs.pop(key)

     def text_to_image(