xinference 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +4 -7
- xinference/client/handlers.py +3 -0
- xinference/core/chat_interface.py +6 -1
- xinference/core/model.py +2 -0
- xinference/core/scheduler.py +4 -7
- xinference/core/supervisor.py +114 -23
- xinference/core/worker.py +70 -4
- xinference/deploy/local.py +2 -1
- xinference/model/audio/core.py +11 -0
- xinference/model/audio/cosyvoice.py +16 -5
- xinference/model/audio/kokoro.py +139 -0
- xinference/model/audio/melotts.py +110 -0
- xinference/model/audio/model_spec.json +80 -0
- xinference/model/audio/model_spec_modelscope.json +18 -0
- xinference/model/audio/whisper.py +35 -10
- xinference/model/llm/llama_cpp/core.py +21 -14
- xinference/model/llm/llm_family.json +527 -1
- xinference/model/llm/llm_family.py +4 -1
- xinference/model/llm/llm_family_modelscope.json +495 -3
- xinference/model/llm/memory.py +1 -1
- xinference/model/llm/mlx/core.py +24 -6
- xinference/model/llm/transformers/core.py +9 -1
- xinference/model/llm/transformers/qwen2_audio.py +3 -1
- xinference/model/llm/transformers/qwen2_vl.py +20 -3
- xinference/model/llm/transformers/utils.py +22 -11
- xinference/model/llm/utils.py +115 -1
- xinference/model/llm/vllm/core.py +14 -4
- xinference/model/llm/vllm/xavier/block.py +3 -4
- xinference/model/llm/vllm/xavier/block_tracker.py +71 -58
- xinference/model/llm/vllm/xavier/collective.py +74 -0
- xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
- xinference/model/llm/vllm/xavier/executor.py +18 -16
- xinference/model/llm/vllm/xavier/scheduler.py +79 -63
- xinference/model/llm/vllm/xavier/test/test_xavier.py +60 -35
- xinference/model/llm/vllm/xavier/transfer.py +53 -32
- xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
- xinference/thirdparty/melo/__init__.py +0 -0
- xinference/thirdparty/melo/api.py +135 -0
- xinference/thirdparty/melo/app.py +61 -0
- xinference/thirdparty/melo/attentions.py +459 -0
- xinference/thirdparty/melo/commons.py +160 -0
- xinference/thirdparty/melo/configs/config.json +94 -0
- xinference/thirdparty/melo/data/example/metadata.list +20 -0
- xinference/thirdparty/melo/data_utils.py +413 -0
- xinference/thirdparty/melo/download_utils.py +67 -0
- xinference/thirdparty/melo/infer.py +25 -0
- xinference/thirdparty/melo/init_downloads.py +14 -0
- xinference/thirdparty/melo/losses.py +58 -0
- xinference/thirdparty/melo/main.py +36 -0
- xinference/thirdparty/melo/mel_processing.py +174 -0
- xinference/thirdparty/melo/models.py +1030 -0
- xinference/thirdparty/melo/modules.py +598 -0
- xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
- xinference/thirdparty/melo/monotonic_align/core.py +46 -0
- xinference/thirdparty/melo/preprocess_text.py +135 -0
- xinference/thirdparty/melo/split_utils.py +174 -0
- xinference/thirdparty/melo/text/__init__.py +35 -0
- xinference/thirdparty/melo/text/chinese.py +199 -0
- xinference/thirdparty/melo/text/chinese_bert.py +107 -0
- xinference/thirdparty/melo/text/chinese_mix.py +253 -0
- xinference/thirdparty/melo/text/cleaner.py +36 -0
- xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
- xinference/thirdparty/melo/text/cmudict.rep +129530 -0
- xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
- xinference/thirdparty/melo/text/english.py +284 -0
- xinference/thirdparty/melo/text/english_bert.py +39 -0
- xinference/thirdparty/melo/text/english_utils/__init__.py +0 -0
- xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
- xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
- xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
- xinference/thirdparty/melo/text/es_phonemizer/__init__.py +0 -0
- xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
- xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
- xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
- xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
- xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
- xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
- xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
- xinference/thirdparty/melo/text/fr_phonemizer/__init__.py +0 -0
- xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
- xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
- xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
- xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
- xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
- xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
- xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
- xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
- xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
- xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
- xinference/thirdparty/melo/text/french.py +94 -0
- xinference/thirdparty/melo/text/french_bert.py +39 -0
- xinference/thirdparty/melo/text/japanese.py +647 -0
- xinference/thirdparty/melo/text/japanese_bert.py +49 -0
- xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
- xinference/thirdparty/melo/text/korean.py +192 -0
- xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
- xinference/thirdparty/melo/text/spanish.py +122 -0
- xinference/thirdparty/melo/text/spanish_bert.py +39 -0
- xinference/thirdparty/melo/text/symbols.py +290 -0
- xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
- xinference/thirdparty/melo/train.py +635 -0
- xinference/thirdparty/melo/train.sh +19 -0
- xinference/thirdparty/melo/transforms.py +209 -0
- xinference/thirdparty/melo/utils.py +424 -0
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.1eb206d1.js → main.b0936c54.js} +3 -3
- xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/METADATA +37 -27
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/RECORD +122 -45
- xinference/web/ui/build/static/js/main.1eb206d1.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +0 -1
- /xinference/web/ui/build/static/js/{main.1eb206d1.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/LICENSE +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/WHEEL +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/entry_points.txt +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/qwen2_vl.py
CHANGED
@@ -45,9 +45,13 @@ class Qwen2VLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
            return True
+        if "qwen2.5-vl-instruct".lower() in llm_family.lower():
+            return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
         return False
@@ -55,6 +59,11 @@ class Qwen2VLChatModel(PytorchChatModel):
     def load(self):
         from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 
+        try:
+            from transformers import Qwen2_5_VLForConditionalGeneration
+        except ImportError:
+            Qwen2_5_VLForConditionalGeneration = None
+
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
         self._device = device
@@ -66,8 +75,16 @@ class Qwen2VLChatModel(PytorchChatModel):
         )
         self._tokenizer = self._processor.tokenizer
         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        llm_family = self.model_family.model_family or self.model_family.model_name
+        model_cls = (
+            Qwen2_5_VLForConditionalGeneration
+            if "qwen2.5" in llm_family
+            else Qwen2VLForConditionalGeneration
+        )
+        if model_cls is None:
+            raise ImportError("`transformers` version is too old, please upgrade it")
         if flash_attn_installed:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 torch_dtype="bfloat16",
                 device_map=device,
@@ -76,14 +93,14 @@ class Qwen2VLChatModel(PytorchChatModel):
             ).eval()
         elif is_npu_available():
             # Ascend do not support bf16
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 device_map="auto",
                 trust_remote_code=True,
                 torch_dtype="float16",
             ).eval()
         else:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path, device_map=device, trust_remote_code=True
             ).eval()
 
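The load() change above resolves the model class once and reuses it across all three from_pretrained branches. A minimal standalone sketch of the same optional-import pattern (names mirror the hunk; this snippet is illustrative, not package API, and assumes transformers is installed):

import importlib.util

# Qwen2_5_VLForConditionalGeneration only exists in newer transformers
# releases, so fall back to None and fail fast only when it is selected.
try:
    from transformers import Qwen2_5_VLForConditionalGeneration
except ImportError:
    Qwen2_5_VLForConditionalGeneration = None

def resolve_model_cls(llm_family: str):
    from transformers import Qwen2VLForConditionalGeneration

    model_cls = (
        Qwen2_5_VLForConditionalGeneration
        if "qwen2.5" in llm_family
        else Qwen2VLForConditionalGeneration
    )
    if model_cls is None:
        raise ImportError("`transformers` version is too old, please upgrade it")
    return model_cls

# flash-attention is detected the same way the hunk does:
flash_attn_installed = importlib.util.find_spec("flash_attn") is not None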
xinference/model/llm/transformers/utils.py
CHANGED
@@ -193,16 +193,14 @@ def _get_pad_param(seq_len_idx: int, pad_len: int) -> Tuple:
 
 def _merge_kv_cache(
     xinf_model_obj: "PytorchModel",
-    past_kv,
-    new_kv,
-):
+    past_cache: DynamicCache,
+    new_cache: DynamicCache,
+) -> DynamicCache:
     from torch.nn.functional import pad
 
     _, seq_len_idx = xinf_model_obj.get_batch_size_and_seq_len_indexes_from_kv()
-
-
-    past_seq_len = past_kv[0][0].shape[seq_len_idx]
-    new_seq_len = new_kv[0][0].shape[seq_len_idx]
+    past_seq_len = past_cache[0][0].shape[seq_len_idx]
+    new_seq_len = new_cache[0][0].shape[seq_len_idx]
     if past_seq_len != new_seq_len:
         padding_target = new_cache if past_seq_len > new_seq_len else past_cache
         padding_len = abs(past_seq_len - new_seq_len)
@@ -219,8 +217,12 @@ def _merge_kv_cache(
     for idx in range(len(past_cache)):
         k1, k2 = new_cache.key_cache[idx], past_cache.key_cache[idx]
         v1, v2 = new_cache.value_cache[idx], past_cache.value_cache[idx]
-        ret_kv.update(
-            …
+        ret_kv.update(
+            torch.cat((k1, k2), 0).contiguous(),
+            torch.cat((v1, v2), 0).contiguous(),
+            idx,
+        )
+    return ret_kv
 
 
 def get_batch_size_and_seq_len_from_kv_cache(kv, xinf_model_obj: "PytorchModel"):
@@ -228,6 +230,15 @@ def get_batch_size_and_seq_len_from_kv_cache(kv, xinf_model_obj: "PytorchModel")
     return kv[0][0].shape[bs_idx], kv[0][0].shape[seq_len_idx] + 1
 
 
+def convert_to_cache_cls(cache) -> DynamicCache:
+    """
+    Compatible with some old models
+    """
+    if isinstance(cache, tuple):
+        return DynamicCache.from_legacy_cache(cache)
+    return cache
+
+
 @torch.inference_mode()
 def _batch_inference_one_step_internal(
     xinf_model_obj: "PytorchModel",
@@ -269,7 +280,7 @@ def _batch_inference_one_step_internal(
         out = model(**prefill_kws, use_cache=True)
 
         logits = out.logits
-        past_key_values = out.past_key_values
+        past_key_values = convert_to_cache_cls(out.past_key_values)
 
         for i, r in enumerate(prefill_reqs):
             (
@@ -317,7 +328,7 @@ def _batch_inference_one_step_internal(
         )
         out = model(**inf_kws, use_cache=True, past_key_values=past_key_values)
         logits = out.logits
-        past_key_values = out.past_key_values
+        past_key_values = convert_to_cache_cls(out.past_key_values)
 
         for i, r in enumerate(valid_req_list):
             (
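For context on convert_to_cache_cls above: transformers moved past_key_values from a legacy tuple of per-layer (key, value) pairs to cache classes, and older model forwards may still return the tuple form. A small standalone check of the conversion the helper relies on (assumes a recent transformers; shapes are arbitrary):

import torch
from transformers.cache_utils import DynamicCache

# Legacy format: one (key, value) pair per layer, each [batch, heads, seq, dim].
legacy = tuple((torch.zeros(1, 2, 4, 8), torch.zeros(1, 2, 4, 8)) for _ in range(2))
cache = DynamicCache.from_legacy_cache(legacy)

# DynamicCache keeps tuple-style indexing, so cache[0][0] is layer 0's keys,
# exactly what _merge_kv_cache reads with past_cache[0][0].shape[seq_len_idx].
assert cache[0][0].shape == (1, 2, 4, 8)
assert len(cache) == 2  # one entry per layer, as range(len(past_cache)) expects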
xinference/model/llm/utils.py
CHANGED
@@ -11,16 +11,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import base64
 import functools
 import json
 import logging
 import os
+import re
 import time
 import typing
 import uuid
 from io import BytesIO
-from typing import Any, AsyncGenerator, Dict, Iterator, List, Optional, Tuple, cast
+from typing import (
+    Any,
+    AsyncGenerator,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    cast,
+)
 
 import requests
 from PIL import Image
@@ -64,6 +76,18 @@ LLAMA3_TOOL_CALL_FAMILY = [
     "llama-3.1-instruct",
 ]
 
+DEEPSEEK_TOOL_CALL_FAMILY = [
+    "deepseek-r1-distill-qwen",
+    "deepseek-r1-distill-llama",
+]
+
+TOOL_CALL_FAMILY = (
+    QWEN_TOOL_CALL_FAMILY
+    + GLM4_TOOL_CALL_FAMILY
+    + LLAMA3_TOOL_CALL_FAMILY
+    + DEEPSEEK_TOOL_CALL_FAMILY
+)
+
 QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
@@ -104,6 +128,10 @@ class ChatModelMixin:
         tokenize=False,
         **kwargs,
     ):
+        if "vision" not in self.model_family.model_ability:  # type: ignore
+            messages = self.convert_messages_with_content_list_to_str_conversion(
+                messages
+            )
         if tokenizer is not None:
             try:
                 full_context = tokenizer.apply_chat_template(
@@ -304,6 +332,35 @@ class ChatModelMixin:
         else:
             yield cls._to_chat_completion_chunk(chunk)
 
+    @classmethod
+    def _tools_to_messages_for_deepseek(
+        cls, messages: List[dict], tools: Iterable[dict]
+    ):
+        # deepseek integrates tool calls into messages
+        # we follow the chat template rule to integrate tools into messages
+        tool_call_message: Dict[str, Any] = {
+            "role": "assistant",
+            "content": None,
+            "tool_calls": [],
+        }
+
+        for tool in tools:
+            function_name = tool["function"]["name"]
+            parameters = tool["function"].get("parameters", {}).get("properties", {})
+            function_args_json = json.dumps(parameters)
+
+            tool_call_message["tool_calls"].append(
+                {
+                    "type": "function",
+                    "function": {
+                        "name": function_name,
+                        "arguments": function_args_json,
+                    },
+                }
+            )
+
+        messages.append(tool_call_message)
+
     @classmethod
     async def _async_to_chat_completion_chunks(
         cls,
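To see what _tools_to_messages_for_deepseek produces, here is a hypothetical input/output pair (the weather tool is an example schema, not from the package):

messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "parameters": {"properties": {"location": {"type": "string"}}},
        },
    }
]

# After ChatModelMixin._tools_to_messages_for_deepseek(messages, tools),
# messages ends with a synthetic assistant turn describing the tool:
# {
#     "role": "assistant",
#     "content": None,
#     "tool_calls": [
#         {
#             "type": "function",
#             "function": {
#                 "name": "get_current_weather",
#                 "arguments": '{"location": {"type": "string"}}',
#             },
#         }
#     ],
# }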
@@ -397,6 +454,61 @@ class ChatModelMixin:
         except Exception:
             return [(text, None, None)]
 
+    @classmethod
+    def _eval_deepseek_chat_arguments(cls, c) -> List[Tuple]:
+        """
+        Parses tool calls from deepseek-r1 format and removes duplicates.
+
+        Returns:
+            List[Tuple[Optional[str], Optional[str], Optional[dict]]]
+            - (None, function_name, arguments) if successfully parsed.
+            - (content, None, None) if parsing failed (content is raw JSON text).
+
+        Example input:
+        <|tool▁call|>get_current_weather
+        ```json
+        {"location": "tokyo", "unit": "fahrenheit"}
+        ```
+
+        Output:
+        [
+            (None, "get_current_weather", {"location": "tokyo", "unit": "fahrenheit"})
+        ]
+        """
+
+        text = c["choices"][0]["text"]
+
+        pattern = r"<|tool▁call|>(\w+)\s*```json\s*(.*?)\s*```"
+        matches = re.findall(pattern, text, re.DOTALL)
+
+        if not matches:
+            return [(text, None, None)]
+
+        tool_calls = set()  # Used for deduplication
+        results = []
+
+        for function_name, args_json in matches:
+            try:
+                arguments = json.loads(args_json)
+                # Convert dictionary to frozenset for deduplication
+                arguments_hashable = frozenset(arguments.items())
+                tool_call_tuple = (None, function_name, arguments)
+            except json.JSONDecodeError:
+                tool_call_tuple = (
+                    args_json,
+                    None,
+                    None,
+                )  # If parsing fails, treat as raw content
+                arguments_hashable = None  # No need for hashing
+
+            # Avoid duplicate entries
+            dedup_key = (function_name, arguments_hashable)
+            if dedup_key not in tool_calls:
+                tool_calls.add(dedup_key)
+                results.append(tool_call_tuple)
+
+        return results
+
     @classmethod
     def _eval_tool_arguments(cls, model_family, c):
         family = model_family.model_family or model_family.model_name
@@ -406,6 +518,8 @@ class ChatModelMixin:
             result = cls._eval_qwen_chat_arguments(c)
         elif family in LLAMA3_TOOL_CALL_FAMILY:
             result = cls._eval_llama3_chat_arguments(c)
+        elif family in DEEPSEEK_TOOL_CALL_FAMILY:
+            result = cls._eval_deepseek_chat_arguments(c)
         else:
             raise Exception(
                 f"Model {model_family.model_name} is not support tool calls."
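One subtlety in _eval_deepseek_chat_arguments worth noting: the <|tool▁call|> delimiter is not regex-escaped, so the two | characters split the pattern into three alternatives. A standalone check (illustrative, not package code) of how re.findall behaves on the docstring's example input:

import re

pattern = r"<|tool▁call|>(\w+)\s*```json\s*(.*?)\s*```"
text = (
    '<|tool▁call|>get_current_weather\n'
    '```json\n{"location": "tokyo", "unit": "fahrenheit"}\n```'
)

# The alternatives are "<", "tool▁call", and ">(\w+)...```"; the first two
# match the bare delimiter characters with empty capture groups, and only
# the third captures the function name and JSON arguments.
print(re.findall(pattern, text, re.DOTALL))
# [('', ''), ('', ''), ('get_current_weather', '{"location": "tokyo", "unit": "fahrenheit"}')]

The empty matches hit the json.JSONDecodeError branch, so after deduplication a single ('', None, None) entry can precede the parsed call in the returned list.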
xinference/model/llm/vllm/core.py
CHANGED
@@ -44,6 +44,7 @@ from ....types import (
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
 from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_SYMBOLS,
     ChatModelMixin,
@@ -157,7 +158,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
     VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
-
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -185,6 +186,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.1")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.3-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-llama")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
@@ -198,6 +200,12 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("QvQ-72B-Preview")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.7.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("internlm3-instruct")
+
+if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -804,12 +812,14 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-        messages = self.convert_messages_with_content_list_to_str_conversion(messages)
         tools = generate_config.pop("tools", []) if generate_config else None
         model_family = self.model_family.model_family or self.model_family.model_name
         full_context_kwargs = {}
-        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            full_context_kwargs["tools"] = tools
+        if tools:
+            if model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
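A side note on the version gates above: vllm.__version__ is compared as a plain string, which is lexicographic rather than numeric, so such gates need care once a version component reaches two digits. A quick illustration (observation only, not from the package):

# String comparison orders character by character:
print("0.7.2" >= "0.7.0")    # True, as intended
print("0.10.0" >= "0.7.2")   # False, although 0.10.0 is the newer release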
xinference/model/llm/vllm/xavier/block.py
CHANGED
@@ -76,12 +76,11 @@ class XavierPrefixCachingBlockAllocator(PrefixCachingBlockAllocator):
         self._xavier_config = v
 
     async def _get_block_tracker_ref(self):
-        from .block_tracker import VLLMBlockTracker
-
         if self._block_tracker_ref is None:
             block_tracker_address = self.xavier_config.get("block_tracker_address")
+            block_tracker_uid = self.xavier_config.get("block_tracker_uid")
             self._block_tracker_ref = await xo.actor_ref(
-                address=block_tracker_address, uid=VLLMBlockTracker.default_uid()
+                address=block_tracker_address, uid=block_tracker_uid
             )
         return self._block_tracker_ref
 
@@ -90,7 +89,7 @@ class XavierPrefixCachingBlockAllocator(PrefixCachingBlockAllocator):
         tracker_ref = await self._get_block_tracker_ref()
         await tracker_ref.unregister_block(
             self.xavier_config.get("virtual_engine"),
-            self.xavier_config.get("…"),
+            self.xavier_config.get("rank"),
             block_id,
         )
 
xinference/model/llm/vllm/xavier/block_tracker.py
CHANGED
@@ -24,81 +24,75 @@ class VLLMBlockTracker(xo.StatelessActor):
 
     def __init__(self):
         super().__init__()
-        # engine -> hash -> (address, block_id)
-        self._hash_to_address_and_block_id: Dict[
-            int, Dict[int, Set[Tuple[str, int]]]
-        ] = {}
-
-        self._address_to_hash_and_block_id: Dict[
-            int, Dict[str, Set[Tuple[int, int]]]
-        ] = {}
+        # engine -> hash -> (rank, block_id)
+        self._hash_to_rank_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}
+        # engine -> rank -> (hash, block_id)
+        self._rank_to_hash_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}
+        self._unavailable_ranks: Set[int] = set()
 
     def register_blocks(
-        self, virtual_engine: int, block_infos: List[Tuple[int, int]], address: str
+        self, virtual_engine: int, block_infos: List[Tuple[int, int]], rank: int
     ):
         # Update query meta
-        if virtual_engine not in self._hash_to_address_and_block_id:
-            self._hash_to_address_and_block_id[virtual_engine] = {}
-        hash_to_address_and_block_id = self._hash_to_address_and_block_id[
-            virtual_engine
-        ]
+        if virtual_engine not in self._hash_to_rank_and_block_id:
+            self._hash_to_rank_and_block_id[virtual_engine] = {}
+        hash_to_rank_and_block_id = self._hash_to_rank_and_block_id[virtual_engine]
         for hash_content, block_id in block_infos:
-            if hash_content not in hash_to_address_and_block_id:
-                hash_to_address_and_block_id[hash_content] = {
-                    (address, block_id),
+            if hash_content not in hash_to_rank_and_block_id:
+                hash_to_rank_and_block_id[hash_content] = {
+                    (rank, block_id),
                 }
             else:
-                hash_to_address_and_block_id[hash_content].add((address, block_id))
+                hash_to_rank_and_block_id[hash_content].add((rank, block_id))
 
         # Update remove meta
-        if virtual_engine not in self._address_to_hash_and_block_id:
-            self._address_to_hash_and_block_id[virtual_engine] = {}
-        address_to_hash_and_block_id = self._address_to_hash_and_block_id[
-            virtual_engine
-        ]
-        if address not in address_to_hash_and_block_id:
-            address_to_hash_and_block_id[address] = set()
-        address_to_hash_and_block_id[address].update(block_infos)
+        if virtual_engine not in self._rank_to_hash_and_block_id:
+            self._rank_to_hash_and_block_id[virtual_engine] = {}
+        rank_to_hash_and_block_id = self._rank_to_hash_and_block_id[virtual_engine]
+        if rank not in rank_to_hash_and_block_id:
+            rank_to_hash_and_block_id[rank] = set()
+        rank_to_hash_and_block_id[rank].update(block_infos)
 
     def query_blocks(
         self, virtual_engine: int, hash_contents: List[Tuple[int, int]]
-    ) -> Dict[str, Set[Tuple[int, int, int]]]:
-        if virtual_engine not in self._hash_to_address_and_block_id:
+    ) -> Dict[int, Set[Tuple[int, int, int]]]:
+        if virtual_engine not in self._hash_to_rank_and_block_id:
             return {}
-        hash_to_address_and_block_id = self._hash_to_address_and_block_id[
-            virtual_engine
-        ]
-        remote: Dict[str, Set[Tuple[int, int, int]]] = {}
+        hash_to_rank_and_block_id = self._hash_to_rank_and_block_id[virtual_engine]
+        remote: Dict[int, Set[Tuple[int, int, int]]] = {}
         for hash_content, _id in hash_contents:
             if (
-                hash_content in hash_to_address_and_block_id
-            ) and hash_to_address_and_block_id[hash_content]:
-                # …
-                address, block_id = random.choice(
-                    list(hash_to_address_and_block_id[hash_content])
-                )
-                if address not in remote:
-                    remote[address] = {
-                        (hash_content, block_id, _id),
-                    }
-                else:
-                    remote[address].add((hash_content, block_id, _id))
+                hash_content in hash_to_rank_and_block_id
+            ) and hash_to_rank_and_block_id[hash_content]:
+                # exclude ranks that are in the recovery process
+                rank_and_block_id = [
+                    (r, b)
+                    for r, b in hash_to_rank_and_block_id[hash_content]
+                    if r not in self._unavailable_ranks
+                ]
+                if rank_and_block_id:
+                    # TODO: Randomly select here, and try to distribute requests as evenly as possible.
+                    # There may be better methods in the future.
+                    rank, block_id = random.choice(rank_and_block_id)
+                    if rank not in remote:
+                        remote[rank] = {
+                            (hash_content, block_id, _id),
+                        }
+                    else:
+                        remote[rank].add((hash_content, block_id, _id))
         return remote
 
-    def unregister_block(self, virtual_engine: int, address: str, block_id: int):
-        if (virtual_engine not in self._address_to_hash_and_block_id) or (
-            virtual_engine not in self._hash_to_address_and_block_id
+    def unregister_block(self, virtual_engine: int, rank: int, block_id: int):
+        if (virtual_engine not in self._rank_to_hash_and_block_id) or (
+            virtual_engine not in self._hash_to_rank_and_block_id
         ):
             return
 
         # Update remove meta
-        address_to_hash_and_block_id = self._address_to_hash_and_block_id[
-            virtual_engine
-        ]
-        if address not in address_to_hash_and_block_id:
+        rank_to_hash_and_block_id = self._rank_to_hash_and_block_id[virtual_engine]
+        if rank not in rank_to_hash_and_block_id:
             return
-        hash_and_block_id = address_to_hash_and_block_id[address]
+        hash_and_block_id = rank_to_hash_and_block_id[rank]
         detail: Optional[Tuple[int, int]] = None
         for hash_content, _id in hash_and_block_id.copy():
             if _id == block_id:
@@ -108,9 +102,28 @@ class VLLMBlockTracker(xo.StatelessActor):
 
         # Update query meta
         if detail is not None:
-            hash_to_address_and_block_id = self._hash_to_address_and_block_id[
-                virtual_engine
-            ]
+            hash_to_rank_and_block_id = self._hash_to_rank_and_block_id[virtual_engine]
             _hash = detail[0]
-            if _hash in hash_to_address_and_block_id:
-                hash_to_address_and_block_id[_hash].discard((address, detail[1]))
+            if _hash in hash_to_rank_and_block_id:
+                hash_to_rank_and_block_id[_hash].discard((rank, detail[1]))
+
+    def unregister_rank(self, rank: int):
+        """
+        This rank is in the recovery process, and its query results will be excluded.
+        """
+        self._unavailable_ranks.add(rank)
+
+    def register_rank(self, rank: int):
+        """
+        After recovery is successful, clear all stale data of the rank and mark the rank as available.
+        """
+        for _, rank_to_hash_and_block_id in self._rank_to_hash_and_block_id.items():
+            rank_to_hash_and_block_id.pop(rank, None)
+
+        for _, hash_to_rank_and_block_id in self._hash_to_rank_and_block_id.items():
+            for _, rank_and_block_id in hash_to_rank_and_block_id.items():
+                to_delete = [(r, b) for r, b in rank_and_block_id if r == rank]
+                if to_delete:
+                    rank_and_block_id.difference_update(to_delete)
+
+        self._unavailable_ranks.discard(rank)
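A hypothetical walk-through of the rank-keyed bookkeeping (direct instantiation outside an xoscar actor pool, purely illustrative; hash and block numbers are made up):

tracker = VLLMBlockTracker()
tracker.register_blocks(virtual_engine=0, block_infos=[(1234, 7)], rank=0)

# hash 1234 lives in rank 0's block 7; the caller wants it in local block 42.
assert tracker.query_blocks(0, [(1234, 42)]) == {0: {(1234, 7, 42)}}

tracker.unregister_rank(0)                           # rank 0 enters recovery
assert tracker.query_blocks(0, [(1234, 42)]) == {}   # its blocks are hidden

tracker.register_rank(0)                             # recovery done: stale meta purged
assert tracker.query_blocks(0, [(1234, 42)]) == {}   # rank 0 must re-register blocks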
xinference/model/llm/vllm/xavier/collective.py
ADDED
@@ -0,0 +1,74 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class CollectiveRank:
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        rank_address: str,
+        store_address: str,
+        store_port: int,
+        world_addresses: List[str],
+    ):
+        self._rank = rank
+        self._world_size = world_size
+        self._rank_address = rank_address
+        self._world_addresses = world_addresses
+        self._store_address = store_address
+        self._store_port = store_port
+        self._device = None
+        self._tcp_store = None
+        self._context = None
+
+    def init_rank(self):
+        from xoscar.collective import xoscar_pygloo as xp
+
+        self._context = xp.rendezvous.Context(self._rank, self._world_size)
+
+        attr = xp.transport.tcp.attr(self._rank_address.split(":")[0])
+        self._device = xp.transport.tcp.CreateDevice(attr)
+
+        opt = xp.rendezvous.TCPStoreOptions()
+        opt.port = self._store_port
+        opt.numWorkers = self._world_size
+        opt.isServer = self._rank == 0
+        opt.waitWorkers = False
+
+        self._tcp_store = xp.rendezvous.TCPStore(self._store_address, opt)
+        if self._world_addresses:
+            self.connect_full_mesh()
+
+    def connect_full_mesh(
+        self, prefix: Optional[str] = None, world_addresses: Optional[List[str]] = None
+    ):
+        from xoscar.collective import xoscar_pygloo as xp
+
+        assert self._device is not None
+        assert self._tcp_store is not None
+        assert self._context is not None
+        if world_addresses is not None:
+            self._world_addresses = world_addresses
+        prefix_store = xp.rendezvous.PrefixStore(
+            prefix or str(self._world_size), self._tcp_store
+        )
+        self._context.connectFullMesh(prefix_store, self._device)
+        logger.debug(
+            f"Rank {self._rank} arrives successfully, world addresses: {self._world_addresses}"
+        )