xinference 1.7.0.post1__py3-none-any.whl → 1.7.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +3 -4
- xinference/client/__init__.py +2 -0
- xinference/client/common.py +49 -2
- xinference/client/handlers.py +18 -0
- xinference/client/restful/async_restful_client.py +1760 -0
- xinference/client/restful/restful_client.py +74 -78
- xinference/core/media_interface.py +3 -1
- xinference/core/model.py +5 -4
- xinference/core/supervisor.py +10 -5
- xinference/core/worker.py +15 -14
- xinference/deploy/local.py +51 -9
- xinference/deploy/worker.py +5 -3
- xinference/device_utils.py +22 -3
- xinference/model/audio/fish_speech.py +23 -34
- xinference/model/audio/model_spec.json +4 -2
- xinference/model/audio/model_spec_modelscope.json +4 -2
- xinference/model/audio/utils.py +2 -2
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +8 -8
- xinference/model/embedding/custom.py +6 -1
- xinference/model/embedding/embed_family.py +0 -41
- xinference/model/embedding/model_spec.json +10 -1
- xinference/model/embedding/model_spec_modelscope.json +10 -1
- xinference/model/embedding/sentence_transformers/core.py +30 -15
- xinference/model/flexible/core.py +1 -1
- xinference/model/flexible/launchers/__init__.py +2 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -1
- xinference/model/flexible/launchers/modelscope_launcher.py +47 -0
- xinference/model/flexible/launchers/transformers_launcher.py +5 -5
- xinference/model/flexible/launchers/yolo_launcher.py +62 -0
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/core.py +18 -1
- xinference/model/llm/llama_cpp/core.py +1 -1
- xinference/model/llm/llm_family.json +41 -1
- xinference/model/llm/llm_family.py +6 -0
- xinference/model/llm/llm_family_modelscope.json +43 -1
- xinference/model/llm/mlx/core.py +271 -18
- xinference/model/llm/mlx/distributed_models/__init__.py +13 -0
- xinference/model/llm/mlx/distributed_models/core.py +164 -0
- xinference/model/llm/mlx/distributed_models/deepseek_v3.py +75 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +82 -0
- xinference/model/llm/mlx/distributed_models/qwen3.py +82 -0
- xinference/model/llm/mlx/distributed_models/qwen3_moe.py +76 -0
- xinference/model/llm/reasoning_parser.py +12 -6
- xinference/model/llm/sglang/core.py +8 -4
- xinference/model/llm/transformers/chatglm.py +4 -1
- xinference/model/llm/transformers/core.py +4 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +10 -4
- xinference/model/llm/transformers/multimodal/intern_vl.py +1 -1
- xinference/model/llm/utils.py +36 -17
- xinference/model/llm/vllm/core.py +142 -34
- xinference/model/llm/vllm/distributed_executor.py +96 -21
- xinference/model/llm/vllm/xavier/transfer.py +2 -2
- xinference/model/rerank/core.py +16 -9
- xinference/model/rerank/model_spec.json +3 -3
- xinference/model/rerank/model_spec_modelscope.json +3 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.9b12b7f9.js +3 -0
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0fd4820d93f99509e80d8702dc3f6f8272424acab5608fa7c0e82cb1d3250a87.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f75545479c17fdfe2a00235fa4a0e9da1ae95e6b3caafba87ded92de6b0240e4.json +1 -0
- xinference/web/ui/src/locales/en.json +3 -0
- xinference/web/ui/src/locales/ja.json +3 -0
- xinference/web/ui/src/locales/ko.json +3 -0
- xinference/web/ui/src/locales/zh.json +3 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/METADATA +4 -3
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/RECORD +77 -67
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +0 -3
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +0 -1
- /xinference/web/ui/build/static/js/{main.8a9e3ba0.js.LICENSE.txt → main.9b12b7f9.js.LICENSE.txt} +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/WHEEL +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/mlx/core.py
CHANGED
@@ -11,14 +11,32 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import asyncio
+import concurrent.futures
+import importlib
 import importlib.util
 import logging
+import pathlib
 import platform
 import sys
+import threading
 import time
 import uuid
 from dataclasses import dataclass, field
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    TypedDict,
+    Union,
+)
+
+import xoscar as xo

 from ....fields import max_tokens_field
 from ....types import (
@@ -29,7 +47,7 @@ from ....types import (
     CompletionUsage,
     LoRA,
 )
-from ..core import LLM
+from ..core import LLM, chat_context_var
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
@@ -46,6 +64,10 @@ class MLXModelConfig(TypedDict, total=False):
     max_gpu_memory: str
     trust_remote_code: bool
     reasoning_content: bool
+    # distributed
+    address: Optional[str]
+    shard: Optional[int]
+    n_worker: Optional[int]


 class MLXGenerateConfig(TypedDict, total=False):
@@ -71,6 +93,8 @@ class PromptCache:


 class MLXModel(LLM):
+    _rank_to_addresses: Optional[Dict[int, str]]
+
     def __init__(
         self,
         model_uid: str,
@@ -84,10 +108,43 @@ class MLXModel(LLM):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._model_config: MLXModelConfig = self._sanitize_model_config(model_config)
+        # for distributed
+        assert model_config is not None
+        self._address = model_config.pop("address", None)
+        self._n_worker = model_config.pop("n_worker", 1)
+        self._shard = model_config.pop("shard", 0)
+        self._driver_info = model_config.pop("driver_info", None)  # type: ignore
+        self._rank_to_addresses = None
+        self._loading_thread = None
+        self._loading_error = None
+        self._all_worker_started = asyncio.Event()
         self._max_kv_size = None
         self._prompt_cache = None
         if peft_model is not None:
             raise ValueError("MLX engine has not supported lora yet")
+        # used to call async
+        self._loop = None
+
+    def set_loop(self, loop: asyncio.AbstractEventLoop):
+        # loop will be passed into ModelWrapper,
+        # to call aynsc method with asyncio.run_coroutine_threadsafe
+        self._loop = loop  # type: ignore
+
+    @property
+    def driver_info(self) -> Optional[dict]:
+        return self._driver_info
+
+    def set_shard_info(self, shard: int, address: str):
+        # set shard info to rank 0
+        if self._rank_to_addresses is None:
+            self._rank_to_addresses = {}
+        self._rank_to_addresses[shard] = address
+        if len(self._rank_to_addresses) == self._n_worker:
+            self._all_worker_started.set()
+
+    async def get_rank_addresses(self) -> Optional[Dict[int, str]]:
+        await self._all_worker_started.wait()
+        return self._rank_to_addresses

     def _sanitize_model_config(
         self, model_config: Optional[MLXModelConfig]
@@ -158,6 +215,97 @@ class MLXModel(LLM):
             tokenizer.add_eos_token(stop_token_id)
         return model, tokenizer

+    def _load_model_shard(self, **kwargs):
+        try:
+            import mlx.core as mx
+            from mlx_lm.utils import load_model, load_tokenizer
+        except ImportError:
+            error_message = "Failed to import module 'mlx_lm'"
+            installation_guide = [
+                "Please make sure 'mlx_lm' is installed. ",
+                "You can install it by `pip install mlx_lm`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        # Ensure some attributes correctly inited by model actor
+        assert (
+            self._loop is not None and self._rank_to_addresses is not None
+        ), "Service not started correctly"
+
+        tokenizer_config = dict(
+            use_fast=self._use_fast_tokenizer,
+            trust_remote_code=kwargs["trust_remote_code"],
+            revision=kwargs["revision"],
+        )
+        logger.debug(
+            "loading model with tokenizer config: %s, model config: %s, shard: %d, n_worker: %d",
+            tokenizer_config,
+            self._model_config,
+            self._shard,
+            self._n_worker,
+        )
+
+        cache_limit_gb = kwargs.get("cache_limit_gb", None)
+        if cache_limit_gb:
+            logger.debug(f"Setting cache limit to {cache_limit_gb} GB")
+            mx.metal.set_cache_limit(cache_limit_gb * 1024 * 1024 * 1024)
+
+        self._max_kv_size = kwargs.get("max_kv_size", None)
+        self._prompt_cache = PromptCache()
+
+        self._model, config = load_model(
+            pathlib.Path(self.model_path),
+            lazy=True,
+            get_model_classes=self._get_classes,
+        )
+        model = self._model.model
+        model.rank = self._shard
+        model.world_size = self._n_worker
+        model.model_uid = self.model_uid
+        model.loop = self._loop
+        model.address = self._address
+        model.rank_to_addresses = self._rank_to_addresses
+
+        # create actors and so forth
+        model.prepare()
+        # real load the partial weights
+        model.pipeline()
+        mx.eval(model.parameters())
+
+        self._tokenizer = load_tokenizer(
+            pathlib.Path(self.model_path),
+            tokenizer_config,
+            eos_token_ids=config.get("eos_token_id", None),
+        )
+
+    @staticmethod
+    def _get_classes(config: dict):
+        """
+        Retrieve the model and model args classes based on the configuration
+        that supported distributed inference.
+
+        Args:
+            config (dict): The model configuration.
+
+        Returns:
+            A tuple containing the Model class and the ModelArgs class.
+        """
+        from mlx_lm.utils import MODEL_REMAPPING
+
+        model_type = config["model_type"]
+        model_type = MODEL_REMAPPING.get(model_type, model_type)
+        try:
+            arch = importlib.import_module(
+                f"xinference.model.llm.mlx.distributed_models.{model_type}"
+            )
+        except ImportError:
+            msg = f"Model type {model_type} not supported for distributed inference."
+            logger.error(msg)
+            raise ValueError(msg)
+
+        return arch.Model, arch.ModelArgs
+
     def load(self):
         reasoning_content = self._model_config.pop("reasoning_content")
         enable_thinking = self._model_config.pop("enable_thinking", True)
@@ -172,7 +320,49 @@ class MLXModel(LLM):
         kwargs["trust_remote_code"] = self._model_config.get("trust_remote_code")
         kwargs["cache_limit_gb"] = self._model_config.pop("cache_limit_gb", None)

-
+        if self._n_worker <= 1:
+            self._model, self._tokenizer = self._load_model(**kwargs)
+        else:
+
+            def _load():
+                try:
+                    if self._shard == 0:
+                        self._driver_info = {"address": self._address}
+                        self.set_shard_info(0, self._address)
+                    else:
+                        assert self._driver_info is not None
+                        driver_address = self._driver_info["address"]
+
+                        async def wait_for_all_shards():
+                            model_ref = await xo.actor_ref(
+                                address=driver_address, uid=self.raw_model_uid
+                            )
+                            # set shard info
+                            await model_ref.set_shard_info(self._shard, self._address)
+                            # wait for all shards
+                            self._rank_to_addresses = (
+                                await model_ref.get_rank_addresses()
+                            )
+
+                        asyncio.run_coroutine_threadsafe(
+                            wait_for_all_shards(), self._loop
+                        ).result()
+
+                    self._load_model_shard(**kwargs)
+                except:
+                    logger.exception("Loading mlx shard model failed")
+                    self._loading_error = sys.exc_info()
+
+            # distributed inference
+            self._loading_thread = threading.Thread(target=_load)
+            self._loading_thread.start()
+
+    def wait_for_load(self):
+        if self._loading_thread:
+            self._loading_thread.join()
+        if self._loading_error:
+            _, err, tb = self._loading_error
+            raise err.with_traceback(tb)

     @classmethod
     def check_lib(cls) -> bool:
@@ -369,20 +559,57 @@ class MLXModel(LLM):
             )
             yield completion_chunk, completion_usage

+    def _run_non_drivers(
+        self, method: str, stream: bool, *args, **kwargs
+    ) -> Optional[concurrent.futures.Future]:
+        assert self._n_worker is not None and self._shard is not None
+        if self._n_worker == 1 or self._shard > 0:
+            # only run for distributed driver
+            return None
+
+        async def run_other_shard(shard: int):
+            assert self._rank_to_addresses is not None
+            address = self._rank_to_addresses[shard]
+            model_actor_ref = await xo.actor_ref(
+                address=address, uid=self.raw_model_uid
+            )
+            # we don't actually need to get the result from shard >= 1
+            if stream:
+                async for _ in await getattr(model_actor_ref, method)(*args, **kwargs):
+                    pass
+            else:
+                await getattr(model_actor_ref, method)(*args, **kwargs)
+
+        async def run_non_driver_shards():
+            logger.debug("Start to run non driver %s", method)
+            coros = []
+            for rank in range(1, self._n_worker):
+                coros.append(run_other_shard(rank))
+            await asyncio.gather(*coros)
+
+        assert self._loop is not None
+        return asyncio.run_coroutine_threadsafe(run_non_driver_shards(), self._loop)
+
     def generate(
         self,
         prompt: Union[str, Dict[str, Any]],
         generate_config: Optional[MLXGenerateConfig] = None,
+        from_chat: bool = False,
     ) -> Union[Completion, Iterator[CompletionChunk]]:
         def generator_wrapper(
-            prompt: Union[str, Dict[str, Any]],
+            prompt: Union[str, Dict[str, Any]],
+            generate_config: MLXGenerateConfig,
+            cb: Callable,
         ) -> Iterator[CompletionChunk]:
-
-
-
-
-
-
+            try:
+                for completion_chunk, completion_usage in self._generate_stream(
+                    prompt,
+                    generate_config,
+                ):
+                    completion_chunk["usage"] = completion_usage
+                    yield completion_chunk
+            finally:
+                cb()

         logger.debug(
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
@@ -394,6 +621,9 @@ class MLXModel(LLM):
         assert self._tokenizer is not None

         stream = generate_config.get("stream", False)
+        fut = self._run_non_drivers(
+            "generate", stream, prompt, generate_config=generate_config
+        )
         if not stream:
             for completion_chunk, completion_usage in self._generate_stream(
                 prompt,
@@ -408,9 +638,18 @@ class MLXModel(LLM):
                 choices=completion_chunk["choices"],
                 usage=completion_usage,
             )
-
+            try:
+                return completion
+            finally:
+                if fut:
+                    fut.result()
         else:
-
+
+            def finish_callback():
+                if fut:
+                    fut.result()
+
+            return generator_wrapper(prompt, generate_config, finish_callback)


 class MLXChatModel(MLXModel, ChatModelMixin):
@@ -452,9 +691,14 @@ class MLXChatModel(MLXModel, ChatModelMixin):
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
-
-            self._get_chat_template_kwargs_from_generate_config(
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         if tools:
             if (
                 model_family in QWEN_TOOL_CALL_FAMILY
@@ -470,11 +714,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):

         stream = generate_config.get("stream", False)
         if stream:
-            it = self.generate(full_prompt, generate_config)
+            it = self.generate(full_prompt, generate_config, from_chat=True)
             assert isinstance(it, Iterator)
             return self._to_chat_completion_chunks(it, self.reasoning_parser)
         else:
-            c = self.generate(full_prompt, generate_config)
+            c = self.generate(full_prompt, generate_config, from_chat=True)
             assert not isinstance(c, Iterator)
             if tools:
                 return self._post_process_completion(
@@ -518,6 +762,11 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         return load(self.model_path)

     def load(self):
+        if self._n_worker > 1:
+            raise NotImplementedError(
+                "Distributed inference is not supported for vision models"
+            )
+
         kwargs = {}
         kwargs["revision"] = self._model_config.get(
             "revision", self.model_spec.model_revision
@@ -636,10 +885,14 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         if "internvl2" not in model_family.lower():
             from qwen_vl_utils import process_vision_info

-
-            self._get_chat_template_kwargs_from_generate_config(
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         if tools and model_family in QWEN_TOOL_CALL_FAMILY:
             full_context_kwargs["tools"] = tools
         assert self.model_family.chat_template is not None
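
The core.py hunks above wire distributed MLX inference around a simple handshake: shard 0 acts as the driver and records its own address, every other shard calls set_shard_info on the rank-0 model actor, and get_rank_addresses blocks until all n_worker shards have reported in. A minimal sketch of that handshake with plain asyncio (no xoscar actors; ShardRegistry and register_shard are illustrative names, not xinference APIs):

import asyncio
from typing import Dict


class ShardRegistry:
    """Stands in for the rank-0 driver that collects every shard's address."""

    def __init__(self, n_worker: int):
        self.n_worker = n_worker
        self.rank_to_addresses: Dict[int, str] = {}
        self._all_started = asyncio.Event()

    def set_shard_info(self, shard: int, address: str) -> None:
        # mirrors MLXModel.set_shard_info above
        self.rank_to_addresses[shard] = address
        if len(self.rank_to_addresses) == self.n_worker:
            self._all_started.set()

    async def get_rank_addresses(self) -> Dict[int, str]:
        # mirrors MLXModel.get_rank_addresses: block until every shard checked in
        await self._all_started.wait()
        return self.rank_to_addresses


async def register_shard(registry: ShardRegistry, shard: int, address: str) -> Dict[int, str]:
    # every shard (driver included) registers itself, then waits for the full map
    registry.set_shard_info(shard, address)
    return await registry.get_rank_addresses()


async def main() -> None:
    registry = ShardRegistry(n_worker=2)
    rank_maps = await asyncio.gather(
        register_shard(registry, 0, "127.0.0.1:34567"),
        register_shard(registry, 1, "127.0.0.1:34568"),
    )
    print(rank_maps[0])  # {0: '127.0.0.1:34567', 1: '127.0.0.1:34568'}


asyncio.run(main())

In the release itself the registry state lives on the rank-0 MLXModel actor and the calls travel over xoscar actor references, with model loading pushed onto a background thread and surfaced via wait_for_load.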

xinference/model/llm/mlx/distributed_models/__init__.py
ADDED
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

xinference/model/llm/mlx/distributed_models/core.py
ADDED
@@ -0,0 +1,164 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+import os
+from typing import TYPE_CHECKING, Dict, Optional
+
+import xoscar as xo
+from xoscar.utils import lazy_import
+
+if TYPE_CHECKING:
+    import mlx.core as mx
+else:
+    mx = lazy_import("mlx.core")
+logger = logging.getLogger(__name__)
+
+DEBUG_DISTRIBUTED_MLX = bool(int(os.getenv("XINFERENCE_DEBUG_DISTRIBUTED_MLX", "0")))
+
+
+class ReceiverActor(xo.StatelessActor):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._recv_queue = asyncio.Queue()
+
+    @classmethod
+    def gen_uid(cls, uid: str, rank: int):
+        return f"Receiver-{uid}-{rank}"
+
+    async def send(self, data: "mx.array"):
+        # no need to use async function,
+        # but make it more convenient to patch this function for test purpose
+        if not isinstance(data, mx.array):
+            data = mx.array(data)
+        self._recv_queue.put_nowait(data)
+
+    async def recv(self):
+        return await self._recv_queue.get()


+class DistributedModelMixin:
+    rank: int
+    world_size: int
+    model_uid: Optional[str]
+    address: Optional[str]
+    _receiver_ref: Optional[xo.ActorRefType[ReceiverActor]]
+    rank_to_addresses: Optional[Dict[int, str]]
+
+    layers: list
+
+    def __init__(self):
+        self.rank = 0
+        self.world_size = 1
+        self.model_uid = None
+        self.loop = None
+        self.address = None
+        # actor ref
+        self._receiver_ref = None
+        self.rank_to_addresses = None
+
+    def prepare(self):
+        coro = xo.create_actor(
+            ReceiverActor,
+            uid=ReceiverActor.gen_uid(self.model_uid, self.rank),
+            address=self.address,
+        )
+        self._receiver_ref = asyncio.run_coroutine_threadsafe(coro, self.loop).result()
+        if DEBUG_DISTRIBUTED_MLX:
+            logger.debug("Finish preparing distributed env for rank %s", self.rank)
+
+    def _send_stage_result(self, result: "mx.array"):
+        assert self.rank > 0
+        assert self.rank_to_addresses is not None
+        assert self.model_uid is not None
+        last_rank = self.rank - 1
+        if DEBUG_DISTRIBUTED_MLX:
+            logger.debug(
+                "Start to send %s partial result to rank %d", self.model_uid, last_rank
+            )
+
+        async def send():
+            receiver_ref = await xo.actor_ref(
+                uid=ReceiverActor.gen_uid(self.model_uid, last_rank),
+                address=self.rank_to_addresses[last_rank],
+            )
+            return await receiver_ref.send(result)
+
+        asyncio.run_coroutine_threadsafe(send(), self.loop).result()
+        if DEBUG_DISTRIBUTED_MLX:
+            logger.debug(
+                "Finish send %s partial result to rank %d, shape %s",
+                self.model_uid,
+                last_rank,
+                result.shape,
+            )
+
+    def _wait_prev_stage_result(self):
+        if DEBUG_DISTRIBUTED_MLX:
+            logger.debug("Wait for partial result from prev shard %d", self.rank + 1)
+        coro = self._receiver_ref.recv()
+        result = asyncio.run_coroutine_threadsafe(coro, self.loop).result()
+        if DEBUG_DISTRIBUTED_MLX:
+            logger.debug(
+                "Received partial result from prev shard %d, shape %s",
+                self.rank + 1,
+                result.shape,
+            )
+        return result
+
+    def _broadcast_result(self, result: "mx.array"):
+        if DEBUG_DISTRIBUTED_MLX:
+            logger.debug("broadcast result from driver")
+
+        async def broadcast(rank: int):
+            assert self.model_uid is not None
+            assert self.rank_to_addresses is not None
+
+            receiver = await xo.actor_ref(
+                uid=ReceiverActor.gen_uid(self.model_uid, rank),
+                address=self.rank_to_addresses[rank],
+            )
+            await receiver.send(result)
+
+        async def broadcast_all():
+            coros = []
+            for rank in range(1, self.world_size):
+                coros.append(broadcast(rank))
+            await asyncio.gather(*coros)
+
+        return asyncio.run_coroutine_threadsafe(broadcast_all(), self.loop).result()
+
+    def _get_result(self) -> "mx.array":
+        if DEBUG_DISTRIBUTED_MLX:
+            logger.debug("Get result from broadcasted data on self receiver")
+        assert self.model_uid is not None
+        coro = xo.actor_ref(
+            uid=ReceiverActor.gen_uid(self.model_uid, self.rank), address=self.address
+        )
+        ref = asyncio.run_coroutine_threadsafe(coro, self.loop).result()
+        return asyncio.run_coroutine_threadsafe(ref.recv(), loop=self.loop).result()
+
+    def pipeline(self):
+        pipeline_size, rank = self.world_size, self.rank
+        layers_per_rank = len(self.layers) // pipeline_size
+        extra = len(self.layers) - layers_per_rank * pipeline_size
+        if self.rank < extra:
+            layers_per_rank += 1
+        self.start_idx = (pipeline_size - rank - 1) * layers_per_rank
+        self.end_idx = self.start_idx + layers_per_rank
+        self.layers = self.layers[: self.end_idx]
+        self.layers[: self.start_idx] = [None] * self.start_idx
+        self.num_layers = len(self.layers) - self.start_idx
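
pipeline() above splits the decoder layers into contiguous blocks, one per shard, with lower ranks owning the later layers (rank 0 keeps the tail of the network). A standalone sketch of the same arithmetic for an evenly divisible case (split_layers is an illustrative helper, not an xinference function):

def split_layers(n_layers: int, world_size: int, rank: int) -> range:
    # same arithmetic as DistributedModelMixin.pipeline() above:
    # rank r keeps layers [start_idx, start_idx + layers_per_rank)
    layers_per_rank = n_layers // world_size
    extra = n_layers - layers_per_rank * world_size
    if rank < extra:
        layers_per_rank += 1
    start_idx = (world_size - rank - 1) * layers_per_rank
    return range(start_idx, start_idx + layers_per_rank)


# e.g. a 36-layer model split across 2 workers
print(split_layers(36, 2, 0))  # range(18, 36) -> rank 0 owns the last 18 layers
print(split_layers(36, 2, 1))  # range(0, 18)  -> rank 1 owns the first 18 layers

This reversed ordering matches the __call__ implementation in deepseek_v3.py below: every rank except the highest waits for the previous stage's hidden states, and every rank except 0 sends its output down to rank - 1.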

xinference/model/llm/mlx/distributed_models/deepseek_v3.py
ADDED
@@ -0,0 +1,75 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import create_attention_mask
+from mlx_lm.models.deepseek_v3 import DeepseekV3Model as _DeepseekV3Model
+from mlx_lm.models.deepseek_v3 import Model as _Model
+from mlx_lm.models.deepseek_v3 import ModelArgs
+
+from .core import DistributedModelMixin
+
+
+class DeepseekV3Model(_DeepseekV3Model, DistributedModelMixin):
+    def __init__(self, *args, **kwargs):
+        _DeepseekV3Model.__init__(self, *args, **kwargs)
+        DistributedModelMixin.__init__(self)
+
+    def __call__(
+        self,
+        x: mx.array,
+        cache: Optional[Any] = None,
+        mask: Optional[mx.array] = None,
+    ) -> mx.array:
+        h = self.embed_tokens(x)
+
+        pipeline_rank = self.rank
+        pipeline_size = self.world_size
+        if mask is None:
+            mask = create_attention_mask(h, cache)
+
+        if cache is None:
+            cache = [None] * self.num_layers
+
+        # Receive from the previous process in the pipeline
+
+        if pipeline_rank < pipeline_size - 1:
+            # wait for previous result
+            h = self._wait_prev_stage_result()
+
+        for i in range(self.num_layers):
+            h = self.layers[self.start_idx + i](h, mask, cache[i])
+        mx.eval(h)
+
+        if pipeline_rank != 0:
+            # Send to the next process in the pipeline
+            self._send_stage_result(h)
+            # wait for the final result
+            h = self._get_result()
+        else:
+            self._set_result(h)
+
+        return self.norm(h)
+
+
+class Model(_Model):
+    def __init__(self, config: ModelArgs):
+        nn.Module.__init__(self)
+        self.args = config
+        self.model_type = config.model_type
+        self.model = DeepseekV3Model(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)