xinference 1.7.0.post1__py3-none-any.whl → 1.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +3 -4
- xinference/client/__init__.py +2 -0
- xinference/client/common.py +49 -2
- xinference/client/handlers.py +18 -0
- xinference/client/restful/async_restful_client.py +1760 -0
- xinference/client/restful/restful_client.py +74 -78
- xinference/core/media_interface.py +3 -1
- xinference/core/model.py +5 -4
- xinference/core/supervisor.py +10 -5
- xinference/core/worker.py +15 -14
- xinference/deploy/local.py +51 -9
- xinference/deploy/worker.py +5 -3
- xinference/device_utils.py +22 -3
- xinference/model/audio/fish_speech.py +23 -34
- xinference/model/audio/model_spec.json +4 -2
- xinference/model/audio/model_spec_modelscope.json +4 -2
- xinference/model/audio/utils.py +2 -2
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +8 -8
- xinference/model/embedding/custom.py +6 -1
- xinference/model/embedding/embed_family.py +0 -41
- xinference/model/embedding/model_spec.json +10 -1
- xinference/model/embedding/model_spec_modelscope.json +10 -1
- xinference/model/embedding/sentence_transformers/core.py +30 -15
- xinference/model/flexible/core.py +1 -1
- xinference/model/flexible/launchers/__init__.py +2 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -1
- xinference/model/flexible/launchers/modelscope_launcher.py +47 -0
- xinference/model/flexible/launchers/transformers_launcher.py +5 -5
- xinference/model/flexible/launchers/yolo_launcher.py +62 -0
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/core.py +18 -1
- xinference/model/llm/llama_cpp/core.py +1 -1
- xinference/model/llm/llm_family.json +41 -1
- xinference/model/llm/llm_family.py +6 -0
- xinference/model/llm/llm_family_modelscope.json +43 -1
- xinference/model/llm/mlx/core.py +271 -18
- xinference/model/llm/mlx/distributed_models/__init__.py +13 -0
- xinference/model/llm/mlx/distributed_models/core.py +164 -0
- xinference/model/llm/mlx/distributed_models/deepseek_v3.py +75 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +82 -0
- xinference/model/llm/mlx/distributed_models/qwen3.py +82 -0
- xinference/model/llm/mlx/distributed_models/qwen3_moe.py +76 -0
- xinference/model/llm/reasoning_parser.py +12 -6
- xinference/model/llm/sglang/core.py +8 -4
- xinference/model/llm/transformers/chatglm.py +4 -1
- xinference/model/llm/transformers/core.py +4 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +10 -4
- xinference/model/llm/transformers/multimodal/intern_vl.py +1 -1
- xinference/model/llm/utils.py +36 -17
- xinference/model/llm/vllm/core.py +142 -34
- xinference/model/llm/vllm/distributed_executor.py +96 -21
- xinference/model/llm/vllm/xavier/transfer.py +2 -2
- xinference/model/rerank/core.py +16 -9
- xinference/model/rerank/model_spec.json +3 -3
- xinference/model/rerank/model_spec_modelscope.json +3 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.9b12b7f9.js +3 -0
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0fd4820d93f99509e80d8702dc3f6f8272424acab5608fa7c0e82cb1d3250a87.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f75545479c17fdfe2a00235fa4a0e9da1ae95e6b3caafba87ded92de6b0240e4.json +1 -0
- xinference/web/ui/src/locales/en.json +3 -0
- xinference/web/ui/src/locales/ja.json +3 -0
- xinference/web/ui/src/locales/ko.json +3 -0
- xinference/web/ui/src/locales/zh.json +3 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.dist-info}/METADATA +4 -3
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.dist-info}/RECORD +77 -67
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +0 -3
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +0 -1
- /xinference/web/ui/build/static/js/{main.8a9e3ba0.js.LICENSE.txt → main.9b12b7f9.js.LICENSE.txt} +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.dist-info}/WHEEL +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.dist-info}/top_level.txt +0 -0
xinference/deploy/local.py
CHANGED
@@ -17,6 +17,8 @@ import logging
 import multiprocessing
 import signal
 import sys
+import traceback
+from multiprocessing.connection import Connection
 from typing import Dict, Optional
 
 import xoscar as xo
@@ -25,6 +27,7 @@ from xoscar.utils import get_next_port
 from ..constants import (
     XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD,
     XINFERENCE_HEALTH_CHECK_INTERVAL,
+    XINFERENCE_HEALTH_CHECK_TIMEOUT,
 )
 from ..core.supervisor import SupervisorActor
 from .utils import health_check
@@ -33,11 +36,15 @@ from .worker import start_worker_components
 logger = logging.getLogger(__name__)
 
 
+READY = "ok"
+
+
 async def _start_local_cluster(
     address: str,
     metrics_exporter_host: Optional[str] = None,
     metrics_exporter_port: Optional[int] = None,
     logging_conf: Optional[Dict] = None,
+    conn: Optional[Connection] = None,
 ):
     from .utils import create_worker_actor_pool
 
@@ -59,6 +66,13 @@ async def _start_local_cluster(
             metrics_exporter_host=metrics_exporter_host,
             metrics_exporter_port=metrics_exporter_port,
         )
+        if conn:
+            try:
+                conn.send(READY)
+            except BrokenPipeError:
+                # connection may be gc collected,
+                # just ignore this error
+                pass
         await pool.join()
     except asyncio.CancelledError:
         if pool is not None:
@@ -70,22 +84,36 @@ def run(
     metrics_exporter_host: Optional[str] = None,
     metrics_exporter_port: Optional[int] = None,
     logging_conf: Optional[Dict] = None,
+    conn: Optional[Connection] = None,
 ):
     def sigterm_handler(signum, frame):
         sys.exit(0)
 
     signal.signal(signal.SIGTERM, sigterm_handler)
 
-
-
-
-
-
-
-
+    try:
+        loop = asyncio.get_event_loop()
+        task = loop.create_task(
+            _start_local_cluster(
+                address=address,
+                metrics_exporter_host=metrics_exporter_host,
+                metrics_exporter_port=metrics_exporter_port,
+                logging_conf=logging_conf,
+                conn=conn,
+            )
         )
-
-
+        loop.run_until_complete(task)
+    except:
+        tb = traceback.format_exc()
+        if conn:
+            try:
+                conn.send(f"error: {tb}")
+            except BrokenPipeError:
+                # connection may be gc collected,
+                # just ignore this error
+                pass
+        # raise again in subprocess
+        raise
 
 
 def run_in_subprocess(
@@ -94,11 +122,25 @@ def run_in_subprocess(
     metrics_exporter_port: Optional[int] = None,
     logging_conf: Optional[Dict] = None,
 ) -> multiprocessing.Process:
+    parent_conn, child_conn = multiprocessing.Pipe()
     p = multiprocessing.Process(
         target=run,
         args=(address, metrics_exporter_host, metrics_exporter_port, logging_conf),
+        kwargs={"conn": child_conn},
     )
+    # Since Xoscar 0.7, we do not uses multiprocessing to create subpool any more,
+    # we should be able to use daemon here
+    p.daemon = True
     p.start()
+    if parent_conn.poll(timeout=XINFERENCE_HEALTH_CHECK_TIMEOUT):
+        msg = parent_conn.recv()
+        if msg != READY:
+            raise RuntimeError(f"Start service process failed during startup:\n{msg}")
+    else:
+        logger.info(
+            "No response from process after %s seconds", XINFERENCE_HEALTH_CHECK_TIMEOUT
+        )
+
     return p
 
 
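The interesting pattern here is the startup handshake: the child process reports "ok" or an "error: <traceback>" string over a multiprocessing Pipe, and the parent waits on it with a timeout before trusting the subprocess. Below is a minimal, standalone sketch of that pattern; the names and the timeout value are illustrative, not the xinference implementation.

import multiprocessing
import traceback
from multiprocessing.connection import Connection

READY = "ok"
STARTUP_TIMEOUT = 10  # seconds; xinference uses XINFERENCE_HEALTH_CHECK_TIMEOUT here


def child(conn: Connection) -> None:
    try:
        # ... bring the actual service up here ...
        conn.send(READY)
        # the child would normally keep running and serve requests from here on
    except Exception:
        conn.send(f"error: {traceback.format_exc()}")
        raise  # re-raise so the subprocess still exits with a failure


def start() -> multiprocessing.Process:
    parent_conn, child_conn = multiprocessing.Pipe()
    p = multiprocessing.Process(target=child, args=(child_conn,), daemon=True)
    p.start()
    if parent_conn.poll(timeout=STARTUP_TIMEOUT):
        msg = parent_conn.recv()
        if msg != READY:
            raise RuntimeError(f"Start service process failed during startup:\n{msg}")
    else:
        print(f"No response from process after {STARTUP_TIMEOUT} seconds")
    return p


if __name__ == "__main__":
    start().join()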
xinference/deploy/worker.py
CHANGED
@@ -21,7 +21,7 @@ import xoscar as xo
 from xoscar import MainActorPoolType
 
 from ..core.worker import WorkerActor
-from ..device_utils import gpu_count
+from ..device_utils import get_available_device_env_name, gpu_count
 
 logger = logging.getLogger(__name__)
 
@@ -34,8 +34,10 @@ async def start_worker_components(
     metrics_exporter_port: Optional[int],
 ):
     gpu_device_indices = []
-
-
+    env_name = get_available_device_env_name()
+    cuda_visible_devices = os.environ.get(env_name) if env_name else None
+
+    if cuda_visible_devices and cuda_visible_devices != "-1":
         gpu_device_indices.extend([int(i) for i in cuda_visible_devices.split(",")])
     else:
         gpu_device_indices = list(range(gpu_count()))
xinference/device_utils.py
CHANGED
@@ -17,10 +17,11 @@ from typing import Dict, Literal, Union
 
 import torch
 
-DeviceType = Literal["cuda", "mps", "xpu", "npu", "cpu"]
+DeviceType = Literal["cuda", "mps", "xpu", "npu", "mlu", "cpu"]
 DEVICE_TO_ENV_NAME = {
     "cuda": "CUDA_VISIBLE_DEVICES",
     "npu": "ASCEND_RT_VISIBLE_DEVICES",
+    "mlu": "MLU_VISIBLE_DEVICES",
 }
 
 
@@ -38,6 +39,16 @@ def is_npu_available() -> bool:
     return False
 
 
+def is_mlu_available() -> bool:
+    try:
+        import torch
+        import torch_mlu  # noqa: F401
+
+        return torch.mlu.is_available()
+    except ImportError:
+        return False
+
+
 def get_available_device() -> DeviceType:
     if torch.cuda.is_available():
         return "cuda"
@@ -47,6 +58,8 @@ def get_available_device() -> DeviceType:
         return "xpu"
     elif is_npu_available():
         return "npu"
+    elif is_mlu_available():
+        return "mlu"
     return "cpu"
 
 
@@ -59,6 +72,8 @@ def is_device_available(device: str) -> bool:
         return is_xpu_available()
     elif device == "npu":
         return is_npu_available()
+    elif device == "mlu":
+        return is_mlu_available()
     elif device == "cpu":
         return True
 
@@ -77,7 +92,7 @@ def move_model_to_available_device(model):
 def get_device_preferred_dtype(device: str) -> Union[torch.dtype, None]:
     if device == "cpu":
         return torch.float32
-    elif device == "cuda" or device == "mps" or device == "npu":
+    elif device == "cuda" or device == "mps" or device == "npu" or device == "mlu":
         return torch.float16
     elif device == "xpu":
         return torch.bfloat16
@@ -86,7 +101,7 @@ def get_device_preferred_dtype(device: str) -> Union[torch.dtype, None]:
 
 
 def is_hf_accelerate_supported(device: str) -> bool:
-    return device == "cuda" or device == "xpu" or device == "npu"
+    return device == "cuda" or device == "xpu" or device == "npu" or device == "mlu"
 
 
 def empty_cache():
@@ -98,6 +113,8 @@ def empty_cache():
         torch.xpu.empty_cache()
     if is_npu_available():
         torch.npu.empty_cache()
+    if is_mlu_available():
+        torch.mlu.empty_cache()
 
 
 def get_available_device_env_name():
@@ -120,6 +137,8 @@ def gpu_count():
         return torch.xpu.device_count()
     elif is_npu_available():
         return torch.npu.device_count()
+    elif is_mlu_available():
+        return torch.mlu.device_count()
     else:
         return 0
 
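For context, the new "mlu" entries slot into the existing device helpers; a hedged usage sketch follows (it assumes the torch_mlu plugin is installed on a Cambricon host, otherwise the calls fall back to the other backends or CPU):

from xinference.device_utils import (
    empty_cache,
    get_available_device,
    get_available_device_env_name,
    gpu_count,
)

device = get_available_device()             # "mlu" once torch_mlu reports availability
env_name = get_available_device_env_name()  # maps to "MLU_VISIBLE_DEVICES" for MLU
print(device, env_name, gpu_count())        # gpu_count() uses torch.mlu.device_count() on MLU
empty_cache()                               # now also clears the MLU allocator cache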
xinference/model/audio/fish_speech.py
CHANGED
@@ -123,9 +123,10 @@ class FishSpeechModel:
            logger.warning("Fish speech does not support setting voice: %s.", voice)
        if speed != 1.0:
            logger.warning("Fish speech does not support setting speed: %s.", speed)
-        import torchaudio
        from tools.schema import ServeReferenceAudio, ServeTTSRequest
 
+        from .utils import audio_stream_generator, audio_to_bytes
+
        prompt_speech = kwargs.get("prompt_speech")
        prompt_text = kwargs.get("prompt_text", kwargs.get("reference_text", ""))
        if prompt_speech is not None:
@@ -153,40 +154,28 @@
 
        if stream:
 
-            def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    trans_chunk = torch.from_numpy(chunk)
-                    writer.write_audio_chunk(i, trans_chunk)
-                    new_last_pos = out.tell()
-                    if new_last_pos != last_pos:
-                        out.seek(last_pos)
-                        encoded_bytes = out.read()
-                        yield encoded_bytes
-                        last_pos = new_last_pos
-
-            return _stream_generator()
+            def _gen_chunk():
+                for chunk in result:
+                    if chunk.code == "final":
+                        continue
+                    chunk = chunk.audio[1]
+                    if chunk is not None:
+                        yield chunk
+
+            return audio_stream_generator(
+                response_format=response_format,
+                sample_rate=self._model.spec_transform.sample_rate,
+                output_generator=_gen_chunk(),
+                output_chunk_transformer=lambda c: torch.from_numpy(
+                    c.reshape((c.shape[0], 1))
+                ),
+            )
        else:
            result = list(result)
            sample_rate, audio = result[0].audio
            audio = np.array([audio])
-
-
-
-
-
-            )
-            return out.getvalue()
+            return audio_to_bytes(
+                response_format=response_format,
+                sample_rate=sample_rate,
+                tensor=torch.from_numpy(audio),
+            )
xinference/model/audio/model_spec.json
CHANGED
@@ -280,7 +280,7 @@
      "hotword": "",
      "batch_size_s": 300
    }
-  },
+  },
  {
    "model_name": "ChatTTS",
    "model_family": "ChatTTS",
@@ -329,6 +329,7 @@
    "multilingual": true,
    "virtualenv": {
      "packages": [
+        "librosa",
        "tiktoken",
        "lightning>=2.0.0",
        "hydra-core>=1.3.2",
@@ -340,7 +341,8 @@
        "HyperPyYAML",
        "onnxruntime>=1.16.0",
        "pyworld>=0.3.4",
-        "
+        "WeTextProcessing<1.0.4",
+        "#system_numpy#",
        "#system_torch#"
      ]
    }
xinference/model/audio/model_spec_modelscope.json
CHANGED
@@ -129,7 +129,7 @@
      "hotword": "",
      "batch_size_s": 300
    }
-  },
+  },
  {
    "model_name": "ChatTTS",
    "model_family": "ChatTTS",
@@ -183,6 +183,7 @@
    "multilingual": true,
    "virtualenv": {
      "packages": [
+        "librosa",
        "tiktoken",
        "lightning>=2.0.0",
        "hydra-core>=1.3.2",
@@ -194,7 +195,8 @@
        "HyperPyYAML",
        "onnxruntime>=1.16.0",
        "pyworld>=0.3.4",
-        "
+        "WeTextProcessing<1.0.4",
+        "#system_numpy#",
        "#system_torch#"
      ]
    }
xinference/model/audio/utils.py
CHANGED
@@ -14,7 +14,7 @@
 
 import io
 import logging
-import
+import typing
 import wave
 from collections.abc import Callable
 
@@ -67,7 +67,7 @@ def ensure_sample_rate(
 def audio_stream_generator(
     response_format: str,
     sample_rate: int,
-    output_generator:
+    output_generator: typing.Generator[typing.Any, None, None],
     output_chunk_transformer: Callable,
 ):
     import torch
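audio_stream_generator takes a chunk generator plus a per-chunk transformer and yields encoded audio bytes; its body is not shown in this diff. The sketch below only illustrates that contract for a WAV stream using the standard-library wave module (it is an assumption, not the xinference helper; the streamed header leaves the length fields at zero, which most decoders ignore when reading to EOF):

import io
import wave
from collections.abc import Callable, Generator

import numpy as np


def wav_chunk_stream(
    sample_rate: int,
    output_generator: Generator,
    output_chunk_transformer: Callable,
) -> Generator[bytes, None, None]:
    # Emit a mono 16-bit WAV header once, then raw PCM for every transformed chunk.
    header = io.BytesIO()
    with wave.open(header, "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)  # 16-bit PCM
        w.setframerate(sample_rate)
    yield header.getvalue()

    for chunk in output_generator:
        waveform = np.asarray(output_chunk_transformer(chunk), dtype=np.float32)
        pcm = (np.clip(waveform, -1.0, 1.0) * 32767.0).astype("<i2")
        yield pcm.tobytes()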
xinference/model/core.py
CHANGED
xinference/model/embedding/__init__.py
CHANGED
@@ -119,14 +119,6 @@ def _install():
            generate_embedding_description(model_spec)
        )
 
-    register_custom_model()
-
-    # register model description
-    for ud_embedding in get_user_defined_embeddings():
-        EMBEDDING_MODEL_DESCRIPTIONS.update(
-            generate_embedding_description(ud_embedding)
-        )
-
    from .flag.core import FlagEmbeddingModel
    from .sentence_transformers.core import SentenceTransformerEmbeddingModel
    from .vllm.core import VLLMEmbeddingModel
@@ -144,5 +136,13 @@ def _install():
    for model_spec in model_infos.values():
        generate_engine_config_by_model_name(model_spec)
 
+    register_custom_model()
+
+    # register model description
+    for ud_embedding in get_user_defined_embeddings():
+        EMBEDDING_MODEL_DESCRIPTIONS.update(
+            generate_embedding_description(ud_embedding)
+        )
+
    del _model_spec_json
    del _model_spec_modelscope_json
xinference/model/embedding/custom.py
CHANGED
@@ -42,7 +42,11 @@ def get_user_defined_embeddings() -> List[EmbeddingModelSpec]:
 def register_embedding(model_spec: CustomEmbeddingModelSpec, persist: bool):
     from ...constants import XINFERENCE_MODEL_DIR
     from ..utils import is_valid_model_name, is_valid_model_uri
-    from . import
+    from . import (
+        BUILTIN_EMBEDDING_MODELS,
+        MODELSCOPE_EMBEDDING_MODELS,
+        generate_engine_config_by_model_name,
+    )
 
     if not is_valid_model_name(model_spec.model_name):
         raise ValueError(f"Invalid model name {model_spec.model_name}.")
@@ -63,6 +67,7 @@ register_embedding(model_spec: CustomEmbeddingModelSpec, persist: bool):
     )
 
     UD_EMBEDDINGS.append(model_spec)
+    generate_engine_config_by_model_name(model_spec)
 
     if persist:
         persist_path = os.path.join(
xinference/model/embedding/embed_family.py
CHANGED
@@ -13,11 +13,8 @@
 # limitations under the License.
 
 import logging
-from threading import Lock
 from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Type
 
-from ..utils import is_valid_model_name
-
 if TYPE_CHECKING:
     from .core import EmbeddingModel, EmbeddingModelSpec
 
@@ -71,44 +68,6 @@ def match_embedding(
 # { embedding model name -> { engine name -> engine params } }
 EMBEDDING_ENGINES: Dict[str, Dict[str, List[Dict[str, Type["EmbeddingModel"]]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type["EmbeddingModel"]]] = {}
-UD_EMBEDDING_FAMILIES_LOCK = Lock()
-# user defined embedding models
-UD_EMBEDDING_SPECS: Dict[str, "EmbeddingModelSpec"] = {}
-
-
-def register_embedding(custom_embedding_spec: "EmbeddingModelSpec", persist: bool):
-    from ..utils import is_valid_model_uri
-    from . import generate_engine_config_by_model_name
-
-    if not is_valid_model_name(custom_embedding_spec.model_name):
-        raise ValueError(f"Invalid model name {custom_embedding_spec.model_name}.")
-
-    model_uri = custom_embedding_spec.model_uri
-    if model_uri and not is_valid_model_uri(model_uri):
-        raise ValueError(f"Invalid model URI {model_uri}.")
-
-    with UD_EMBEDDING_FAMILIES_LOCK:
-        if (
-            custom_embedding_spec.model_name in BUILTIN_EMBEDDING_MODELS
-            or custom_embedding_spec.model_name in MODELSCOPE_EMBEDDING_MODELS
-            or custom_embedding_spec.model_name in UD_EMBEDDING_SPECS
-        ):
-            raise ValueError(
-                f"Model name conflicts with existing model {custom_embedding_spec.model_name}"
-            )
-
-        UD_EMBEDDING_SPECS[custom_embedding_spec.model_name] = custom_embedding_spec
-        generate_engine_config_by_model_name(custom_embedding_spec)
-
-
-# TODO: add persist feature
-def unregister_embedding(custom_embedding_spec: "EmbeddingModelSpec"):
-    with UD_EMBEDDING_FAMILIES_LOCK:
-        model_name = custom_embedding_spec.model_name
-        if model_name in UD_EMBEDDING_SPECS:
-            del UD_EMBEDDING_SPECS[model_name]
-        if model_name in EMBEDDING_ENGINES:
-            del EMBEDDING_ENGINES[model_name]
 
 
 def check_engine_by_model_name_and_engine(
xinference/model/embedding/model_spec.json
CHANGED
@@ -275,6 +275,15 @@
    "dimensions": 1024,
    "max_tokens": 8192,
    "language": ["89 languages supported"],
-    "model_id": "jinaai/jina-clip-v2"
+    "model_id": "jinaai/jina-clip-v2",
+    "virtualenv": {
+      "packages": [
+        "sentence_transformers",
+        "transformers==4.51.3",
+        "xformers",
+        "flash_attn==2.7.4 ; sys_platform=='linux'"
+      ],
+      "no_build_isolation": true
+    }
  }
]
xinference/model/embedding/model_spec_modelscope.json
CHANGED
@@ -279,6 +279,15 @@
    "max_tokens": 8192,
    "language": ["89 languages supported"],
    "model_id": "jinaai/jina-clip-v2",
-    "model_hub": "modelscope"
+    "model_hub": "modelscope",
+    "virtualenv": {
+      "packages": [
+        "sentence_transformers",
+        "transformers==4.51.3",
+        "xformers",
+        "flash_attn==2.7.3 ; sys_platform=='linux'"
+      ],
+      "no_build_isolation": true
+    }
  }
]
xinference/model/embedding/sentence_transformers/core.py
CHANGED
@@ -90,9 +90,10 @@ class SentenceTransformerEmbeddingModel(EmbeddingModel):
        elif "qwen3" in self._model_spec.model_name.lower():
            # qwen3 embedding
            flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+            flash_attn_enabled = self._kwargs.get("enable_flash_attn", True)
            model_kwargs = {"device_map": "auto"}
            tokenizer_kwargs = {}
-            if flash_attn_installed:
+            if flash_attn_installed and flash_attn_enabled:
                model_kwargs["attn_implementation"] = "flash_attention_2"
                model_kwargs["torch_dtype"] = "bfloat16"
                tokenizer_kwargs["padding_side"] = "left"
@@ -254,8 +255,14 @@ class SentenceTransformerEmbeddingModel(EmbeddingModel):
            # when batching, the attention mask 1 means there is a token
            # thus we just sum up it to get the total number of tokens
            if "clip" in self._model_spec.model_name.lower():
-
-
+                if "input_ids" in features and hasattr(
+                    features["input_ids"], "numel"
+                ):
+                    all_token_nums += features["input_ids"].numel()
+                if "pixel_values" in features and hasattr(
+                    features["pixel_values"], "numel"
+                ):
+                    all_token_nums += features["pixel_values"].numel()
            else:
                all_token_nums += features["attention_mask"].sum().item()
 
@@ -340,24 +347,32 @@ class SentenceTransformerEmbeddingModel(EmbeddingModel):
                img = Image.open(image_data)
                return img
 
-            objs: list[
-
-
-
-
-
-            if
-
-
+            objs: list[str] = []
+            if isinstance(sentences, str):
+                objs.append(sentences)
+            else:
+                for item in sentences:
+                    if isinstance(item, dict):
+                        if item.get("text") is not None:
+                            objs.append(item["text"])
+                        elif item.get("image") is not None:
+                            if re.match(r"^data:image/.+;base64,", item["image"]):
+                                image = base64_to_image(item["image"])
+                                objs.append(image)
+                            else:
+                                objs.append(item["image"])
                        else:
-
+                            raise ValueError("Please check the input data.")
+                    elif isinstance(item, str):
+                        objs.append(item)
                    else:
-
+                        raise ValueError("Please check the input data.")
+
            all_embeddings, all_token_nums = encode(
                self._model,
                objs,
                convert_to_numpy=False,
-                **
+                **kwargs,
            )
        else:
            all_embeddings, all_token_nums = encode(
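For reference, the normalization above means a jina-clip style embedding request can mix plain strings with dict items. A hedged example of the accepted input shapes (the base64 payload is deliberately truncated, and whether plain image paths or URLs resolve depends on the underlying model):

sentences = [
    "a photo of a cat",                                  # plain string -> text
    {"text": "a photo of a dog"},                        # dict with "text"
    {"image": "data:image/png;base64,iVBORw0KGgo..."},   # base64 data URI, decoded to a PIL image
    {"image": "/path/to/local/cat.png"},                 # other strings are passed through as-is
]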
xinference/model/flexible/launchers/__init__.py
CHANGED
@@ -13,4 +13,6 @@
 # limitations under the License.
 
 from .image_process_launcher import launcher as image_process
+from .modelscope_launcher import launcher as modelscope
 from .transformers_launcher import launcher as transformers
+from .yolo_launcher import launcher as yolo
xinference/model/flexible/launchers/image_process_launcher.py
CHANGED
@@ -23,7 +23,7 @@ from ..core import FlexibleModel, FlexibleModelSpec
 
 
 class ImageRemoveBackgroundModel(FlexibleModel):
-    def infer(self, **kwargs):
+    def infer(self, *args, **kwargs):
         invert = kwargs.get("invert", False)
         b64_image: str = kwargs.get("image")  # type: ignore
         only_mask = kwargs.pop("only_mask", True)
xinference/model/flexible/launchers/modelscope_launcher.py
ADDED
@@ -0,0 +1,47 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..core import FlexibleModel, FlexibleModelSpec
+
+
+class ModelScopePipelineModel(FlexibleModel):
+    def load(self):
+        # we have to move import here,
+        # modelscope cannot be compatible with datasets>3.2.0
+        # if put outside, it will just raise error
+        # when enabled virtualenv,
+        # we can make sure mdoelscope works well
+        from modelscope.pipelines import pipeline
+
+        config = dict(self.config or {})
+        if self._device:
+            config["device"] = self._device
+        self._pipeline = pipeline(model=self._model_path, **config)
+
+    def infer(self, *args, **kwargs):
+        return self._pipeline(*args, **kwargs)
+
+
+def launcher(model_uid: str, model_spec: FlexibleModelSpec, **kwargs) -> FlexibleModel:
+    device = kwargs.get("device")
+    if not kwargs.get("task"):
+        raise ValueError("modelscope launcher requires `task`")
+
+    model_path = model_spec.model_uri
+    if model_path is None:
+        raise ValueError("model_path required")
+
+    return ModelScopePipelineModel(
+        model_uid=model_uid, model_path=model_path, device=device, config=kwargs
+    )