xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (137)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +107 -11
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/constants.py +5 -1
  5. xinference/core/media_interface.py +758 -0
  6. xinference/core/model.py +49 -9
  7. xinference/core/supervisor.py +1 -1
  8. xinference/core/utils.py +1 -1
  9. xinference/core/worker.py +33 -39
  10. xinference/deploy/cmdline.py +17 -0
  11. xinference/deploy/utils.py +0 -3
  12. xinference/model/audio/__init__.py +16 -27
  13. xinference/model/audio/core.py +2 -1
  14. xinference/model/audio/cosyvoice.py +4 -2
  15. xinference/model/audio/model_spec.json +63 -46
  16. xinference/model/audio/model_spec_modelscope.json +31 -14
  17. xinference/model/embedding/__init__.py +16 -24
  18. xinference/model/image/__init__.py +15 -25
  19. xinference/model/llm/__init__.py +40 -115
  20. xinference/model/llm/core.py +29 -6
  21. xinference/model/llm/llama_cpp/core.py +30 -347
  22. xinference/model/llm/llm_family.json +1674 -2203
  23. xinference/model/llm/llm_family.py +71 -7
  24. xinference/model/llm/llm_family_csghub.json +0 -32
  25. xinference/model/llm/llm_family_modelscope.json +1838 -2016
  26. xinference/model/llm/llm_family_openmind_hub.json +19 -325
  27. xinference/model/llm/lmdeploy/core.py +7 -2
  28. xinference/model/llm/mlx/core.py +23 -7
  29. xinference/model/llm/reasoning_parser.py +281 -5
  30. xinference/model/llm/sglang/core.py +39 -11
  31. xinference/model/llm/transformers/chatglm.py +9 -2
  32. xinference/model/llm/transformers/cogagent.py +10 -12
  33. xinference/model/llm/transformers/cogvlm2.py +6 -3
  34. xinference/model/llm/transformers/cogvlm2_video.py +3 -6
  35. xinference/model/llm/transformers/core.py +58 -60
  36. xinference/model/llm/transformers/deepseek_v2.py +4 -2
  37. xinference/model/llm/transformers/deepseek_vl.py +10 -4
  38. xinference/model/llm/transformers/deepseek_vl2.py +9 -4
  39. xinference/model/llm/transformers/gemma3.py +4 -5
  40. xinference/model/llm/transformers/glm4v.py +3 -21
  41. xinference/model/llm/transformers/glm_edge_v.py +3 -20
  42. xinference/model/llm/transformers/intern_vl.py +3 -6
  43. xinference/model/llm/transformers/internlm2.py +1 -1
  44. xinference/model/llm/transformers/minicpmv25.py +4 -2
  45. xinference/model/llm/transformers/minicpmv26.py +5 -3
  46. xinference/model/llm/transformers/omnilmm.py +1 -1
  47. xinference/model/llm/transformers/opt.py +1 -1
  48. xinference/model/llm/transformers/ovis2.py +302 -0
  49. xinference/model/llm/transformers/qwen-omni.py +8 -1
  50. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  51. xinference/model/llm/transformers/qwen2_vl.py +5 -1
  52. xinference/model/llm/transformers/qwen_vl.py +5 -2
  53. xinference/model/llm/utils.py +96 -45
  54. xinference/model/llm/vllm/core.py +108 -24
  55. xinference/model/llm/vllm/distributed_executor.py +8 -7
  56. xinference/model/llm/vllm/xavier/allocator.py +1 -1
  57. xinference/model/llm/vllm/xavier/block_manager.py +1 -1
  58. xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
  59. xinference/model/llm/vllm/xavier/executor.py +1 -1
  60. xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
  61. xinference/model/rerank/__init__.py +13 -24
  62. xinference/model/video/__init__.py +15 -25
  63. xinference/model/video/core.py +3 -3
  64. xinference/model/video/diffusers.py +157 -13
  65. xinference/model/video/model_spec.json +100 -0
  66. xinference/model/video/model_spec_modelscope.json +104 -0
  67. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  68. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  69. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  70. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  71. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  74. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  75. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  76. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  77. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  78. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  79. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  80. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  81. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  84. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  85. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  86. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  87. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  88. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  89. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  90. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  91. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  92. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  93. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  94. xinference/types.py +2 -71
  95. xinference/web/ui/build/asset-manifest.json +6 -6
  96. xinference/web/ui/build/index.html +1 -1
  97. xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
  98. xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
  99. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  100. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  101. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  102. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  103. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  109. xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
  110. xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
  111. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
  112. xinference/web/ui/src/locales/en.json +7 -4
  113. xinference/web/ui/src/locales/zh.json +7 -4
  114. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
  115. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
  116. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
  117. xinference/core/image_interface.py +0 -377
  118. xinference/model/llm/transformers/compression.py +0 -258
  119. xinference/model/llm/transformers/yi_vl.py +0 -239
  120. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  121. xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
  122. xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
  123. xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
  124. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  125. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
  126. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
  132. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  133. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
  134. /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  135. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
  136. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
  137. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
@@ -42,6 +42,7 @@ from ...types import (
     ChatCompletion,
     ChatCompletionChoice,
     ChatCompletionChunk,
+    ChatCompletionChunkChoice,
     ChatCompletionChunkDelta,
     ChatCompletionMessage,
     Completion,
@@ -68,6 +69,11 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen2-moe-instruct",
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
+    "XiYanSQL-QwenCoder-2504",
+    "QwQ-32B",
+    "qwen3",
+    "HuatuoGPT-o1-Qwen2.5",
+    "DianJin-R1",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -77,6 +83,7 @@ GLM4_TOOL_CALL_FAMILY = [
 
 LLAMA3_TOOL_CALL_FAMILY = [
     "llama-3.1-instruct",
+    "HuatuoGPT-o1-LLaMA-3.1",
 ]
 
 DEEPSEEK_TOOL_CALL_FAMILY = [
@@ -143,6 +150,7 @@ class ChatModelMixin:
                     add_generation_prompt=True,
                     **kwargs,
                 )
+                logger.debug("Prompt: %s", full_context)
                 return full_context
             except Exception as e:
                 logger.warning(
@@ -154,6 +162,36 @@ class ChatModelMixin:
         # Compilation function uses a cache to avoid recompiling the same template
         return self._build_from_raw_template(messages, chat_template, **kwargs)
 
+    @staticmethod
+    def _get_chat_template_kwargs_from_generate_config(
+        generate_config: Optional[Union[dict, Any]],
+        reasoning_parser: Optional[ReasoningParser] = None,
+    ) -> Optional[dict]:
+        if reasoning_parser and not reasoning_parser.enable_thinking:
+            # hybrid model like qwen3,
+            # disabled thinking
+            return {"enable_thinking": False}
+        if not generate_config:
+            return None
+        if "chat_template_kwargs" in generate_config:
+            kwargs = generate_config["chat_template_kwargs"]
+            if isinstance(kwargs, str):
+                try:
+                    return json.loads(kwargs)
+                except json.JSONDecodeError:
+                    raise TypeError(
+                        f"`chat_template_kwargs` should be json parsable, "
+                        f"got: {kwargs}"
+                    )
+            elif isinstance(kwargs, dict):
+                return kwargs
+            else:
+                raise TypeError(
+                    f"`chat_template_kwargs` but be a JSON parsable str "
+                    f"or dict, got: {kwargs}"
+                )
+        return None
+
     @staticmethod
     def convert_messages_with_content_list_to_str_conversion(
         messages: List[Dict],
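
The new `_get_chat_template_kwargs_from_generate_config` helper forwards extra chat-template variables to prompt rendering: `chat_template_kwargs` may be given in `generate_config` as a dict or a JSON-parsable string, and thinking is forced off when a hybrid reasoning model was launched with thinking disabled. A minimal client-side sketch of how this surfaces, assuming a locally running endpoint and a Qwen3-style model UID (both placeholders):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint
    model = client.get_model("qwen3")  # hypothetical model UID

    # `chat_template_kwargs` may be a dict or a JSON string; here it disables
    # the "thinking" block that a hybrid reasoning model emits by default.
    completion = model.chat(
        messages=[{"role": "user", "content": "Hello"}],
        generate_config={"chat_template_kwargs": {"enable_thinking": False}},
    )
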
@@ -257,7 +295,7 @@ class ChatModelMixin:
                 and "delta" in choices[0]
             ):
                 if choices[0]["finish_reason"] is None:
-                    if reasoning_parser is not None:
+                    if reasoning_parser and reasoning_parser.check_content_parser():
                         # process parsing reasoning content
                         assert previous_texts is not None
                         delta = choices[0]["delta"]  # type: ignore
@@ -274,7 +312,7 @@ class ChatModelMixin:
                 delta = choices[0]["delta"]  # type: ignore
                 if "content" not in delta:
                     delta["content"] = ""  # type: ignore
-                if reasoning_parser is not None:
+                if reasoning_parser and reasoning_parser.check_content_parser():
                     delta["reasoning_content"] = None  # type: ignore
             # Already a ChatCompletionChunk, we don't need to convert chunk.
             return cast(ChatCompletionChunk, chunk)
@@ -283,7 +321,7 @@ class ChatModelMixin:
         for i, choice in enumerate(choices):  # type: ignore
             delta = ChatCompletionChunkDelta()
             if "text" in choice and choice["finish_reason"] is None:
-                if reasoning_parser is None:
+                if not reasoning_parser or not reasoning_parser.check_content_parser():
                     delta["content"] = choice["text"]
                 else:
                     assert previous_texts is not None
@@ -296,7 +334,7 @@ class ChatModelMixin:
                     previous_texts[-1] = current_text
             elif "text" in choice and choice["finish_reason"] is not None:
                 delta["content"] = choice["text"]
-                if reasoning_parser is not None:
+                if reasoning_parser and reasoning_parser.check_content_parser():
                     delta["reasoning_content"] = None
             elif "tool_calls" in choice:
                 delta["tool_calls"] = choice["tool_calls"]
@@ -310,7 +348,9 @@ class ChatModelMixin:
         assert choices is not None
         usage = (
             chunk["usage"]
-            if choices[0]["finish_reason"] is not None and reasoning_parser is not None
+            if choices[0]["finish_reason"] is not None
+            and reasoning_parser
+            and reasoning_parser.check_content_parser()
             else None
         )
         chat_chunk = {
@@ -328,28 +368,32 @@ class ChatModelMixin:
         cls,
         chunk: CompletionChunk,
         reasoning_parser: Optional[ReasoningParser] = None,
-    ) -> ChatCompletionChunk:
-        choices_list = []
+    ) -> List[ChatCompletionChunk]:
+        choices_list: List[ChatCompletionChunkChoice] = []
+        chunks: List[ChatCompletionChunk] = []
         for i, choice in enumerate(chunk["choices"]):
             delta = ChatCompletionChunkDelta(role="assistant", content="")
-            if reasoning_parser is not None:
+            if reasoning_parser and reasoning_parser.check_content_parser():
                 delta["content"] = None
                 delta["reasoning_content"] = ""
             choices_list.append(
-                {
-                    "index": i,
-                    "delta": delta,
-                    "finish_reason": None,
-                }
+                ChatCompletionChunkChoice(
+                    index=i,
+                    delta=delta,
+                    finish_reason=None,
+                )
             )
-        chat_chunk = {
-            "id": "chat" + chunk["id"],
-            "model": chunk["model"],
-            "created": chunk["created"],
-            "object": "chat.completion.chunk",
-            "choices": choices_list,
-        }
-        return cast(ChatCompletionChunk, chat_chunk)
+        chat_chunk = ChatCompletionChunk(
+            id="chat" + chunk["id"],
+            model=chunk["model"],
+            created=chunk["created"],
+            object="chat.completion.chunk",
+            choices=choices_list,
+        )
+        chunks.append(chat_chunk)
+        if reasoning_parser:
+            chunks.extend(reasoning_parser.prepare_first_reasoning_content_chunk(chunk))
+        return chunks
 
     @classmethod
     def _get_final_chat_completion_chunk(
@@ -374,6 +418,8 @@ class ChatModelMixin:
         reasoning_parse: Optional[ReasoningParser] = None,
     ) -> Iterator[ChatCompletionChunk]:
         previous_texts = [""]
+        if reasoning_parse:
+            chunks = reasoning_parse.prepare_reasoning_content_sync(chunks)
         for _, chunk in enumerate(chunks):
             # usage
             choices = chunk.get("choices")
@@ -421,6 +467,9 @@ class ChatModelMixin:
         reasoning_parser: Optional[ReasoningParser] = None,
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         previous_texts = [""]
+        # Process chunks
+        if reasoning_parser:
+            chunks = reasoning_parser.prepare_reasoning_content_streaming(chunks)
         async for chunk in chunks:
             choices = chunk.get("choices")
             if not choices:
@@ -436,19 +485,25 @@ class ChatModelMixin:
     def _to_chat_completion(
         completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
     ) -> ChatCompletion:
+        # prepare reasoning content
+        if reasoning_parser:
+            completion = reasoning_parser.prepare_reasoning_content(completion)
+
         if completion.get("object") == "chat.completion" and completion.get("choices"):
             # Already a ChatCompletion
-            if reasoning_parser is not None:
-                for choice in completion["choices"]:
-                    message = choice["message"]  # type: ignore
-                    text = message["content"]
+            for choice in completion["choices"]:
+                message = choice["message"]  # type: ignore
+                text = message["content"]  # Original content from the message
+
+                if reasoning_parser and reasoning_parser.check_content_parser():
+                    # Parse into reasoning and content parts
                     (
-                        reasoning_content,
-                        content,
+                        reasoning_val,
+                        content_val,
                     ) = reasoning_parser.extract_reasoning_content(text)
-                    message["content"] = content
-                    if reasoning_content is not None:
-                        message["reasoning_content"] = reasoning_content
+                    message["content"] = content_val
+                    if reasoning_val is not None:
+                        message["reasoning_content"] = reasoning_val
             return cast(ChatCompletion, completion)
 
         choices = []
@@ -456,7 +511,7 @@ class ChatModelMixin:
             content = choice["text"]
             reasoning_content = None
 
-            if reasoning_parser is not None:
+            if reasoning_parser and reasoning_parser.check_content_parser():
                 reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
                     choice
                 )
@@ -653,20 +708,12 @@ class ChatModelMixin:
                 failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
 
-        reasoning_content = None
         content = ". ".join(failed_contents) if failed_contents else None
-        if reasoning_parser is not None:
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
-                content
-            )
         d = {
             "role": "assistant",
            "content": content,
            "tool_calls": tool_calls,
         }
-        # add only reasoning_content is None
-        if reasoning_content is not None:
-            d["reasoning_content"] = reasoning_content
 
         try:
             usage = c.get("usage")
@@ -701,7 +748,17 @@ class ChatModelMixin:
         c,
         reasoning_parser: Optional[ReasoningParser] = None,
     ):
+        if reasoning_parser:
+            c = reasoning_parser.prepare_reasoning_content(c)
         _id = str(uuid.uuid4())
+        reasoning_content = None
+        if reasoning_parser and reasoning_parser.check_content_parser():
+            text = c["choices"][0]["text"]
+            reasoning_content, content = reasoning_parser.extract_reasoning_content(
+                text
+            )
+            c["choices"][0]["text"] = content
+
         tool_result = cls._eval_tool_arguments(model_family, c)
 
         tool_calls = []
@@ -722,12 +779,6 @@ class ChatModelMixin:
                 failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
 
-        reasoning_content = None
-        content = ". ".join(failed_contents) if failed_contents else None
-        if reasoning_parser is not None:
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
-                content
-            )
         m = {
             "role": "assistant",
             "content": content,
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import asyncio
+import importlib.util
 import itertools
 import json
 import logging
@@ -50,7 +51,7 @@ from ....types import (
     LoRA,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
-from ..llm_family import CustomLLMFamilyV1
+from ..llm_family import CustomLLMFamilyV1, cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -169,6 +170,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
     VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("XiYanSQL-QwenCoder-2504")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B")
     VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
@@ -176,6 +178,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("fin-r1")
     VLLM_SUPPORTED_CHAT_MODELS.append("seallms-v3")
     VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1-preview")
+    VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1")
+    VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-Qwen2.5")
+    VLLM_SUPPORTED_CHAT_MODELS.append("DianJin-R1")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -206,6 +211,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.3-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-llama")
+    VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-LLaMA-3.1")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
@@ -239,6 +245,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.8.0":
 if VLLM_INSTALLED and vllm.__version__ >= "0.8.4":
     VLLM_SUPPORTED_CHAT_MODELS.append("glm4-0414")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.8.5":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -330,8 +339,10 @@ class VLLMModel(LLM):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
-        if vllm.__version__ >= "0.3.1":
-            # from vllm v0.3.1, it uses cupy as NCCL backend
+        from ..llm_family import LlamaCppLLMSpecV1
+
+        if "0.3.1" <= vllm.__version__ <= "0.3.3":
+            # from vllm v0.3.1 to v0.3.3, it uses cupy as NCCL backend
             # in which cupy will fork a process
             # only for xoscar >= 0.3.0, new process is allowed in subpool
             # besides, xinference set start method as forkserver for unix
@@ -341,8 +352,17 @@ class VLLMModel(LLM):
         self._device_count = self._get_cuda_count()
         self._model_config = self._sanitize_model_config(self._model_config)
         reasoning_content = self._model_config.pop("reasoning_content")
+        enable_thinking = self._model_config.pop("enable_thinking", False)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )
 
-        self.prepare_parse_reasoning_content(reasoning_content)
+        if (
+            isinstance(self.model_spec, LlamaCppLLMSpecV1)
+            and self.model_spec.model_format == "ggufv2"
+        ):
+            # gguf
+            self._preprocess_load_gguf()
 
         if self.lora_modules is None:
             self.lora_requests = []
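
On the vLLM side, `reasoning_content` and the new `enable_thinking` flag are now popped from the model config when the engine loads, so reasoning parsing is configured once at launch time rather than per request. A hedged sketch (endpoint, model name, and engine string are placeholders; extra keyword arguments to `launch_model` are what end up in this model config):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint
    model_uid = client.launch_model(
        model_name="qwen3",
        model_engine="vLLM",
        # consumed by VLLMModel.load() via self._model_config.pop(...)
        reasoning_content=True,
        enable_thinking=True,
    )
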
@@ -482,6 +502,45 @@ class VLLMModel(LLM):
             _, err, tb = self._loading_error
             raise err.with_traceback(tb)
 
+    def _preprocess_load_gguf(self):
+        # check if it is multi gguf files
+        if (
+            not os.path.isfile(self.model_path)
+            and self.model_spec.quantization_parts
+            and self.quantization in self.model_spec.quantization_parts
+        ):
+            raise RuntimeError(
+                "vllm does not support multiple gguf files, please merge them first and "
+                "provide `model_path` with merged file"
+            )
+
+        if "tokenizer" not in self._model_config:
+            # find pytorch format without quantization
+            non_quant_spec = next(
+                spec
+                for spec in self.model_family.model_specs
+                if spec.model_format == "pytorch"
+                and "none" in spec.quantizations
+                and spec.model_size_in_billions
+                == self.model_spec.model_size_in_billions
+            )
+
+            path = cache_model_tokenizer_and_config(self.model_family, non_quant_spec)
+            # other than gguf file, vllm requires to provide tokenizer and hf_config_path
+            self._model_config["tokenizer"] = self._model_config[
+                "hf_config_path"
+            ] = path
+
+        if not os.path.isfile(self.model_path):
+            self.model_path = os.path.realpath(
+                os.path.join(
+                    self.model_path,
+                    self.model_spec.model_file_name_template.format(
+                        quantization=self.quantization
+                    ),
+                )
+            )
+
     def stop(self):
         # though the vLLM engine will shutdown when deleted,
         # but some issue e.g. GH#1682 reported
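
Together with the `ggufv2` branch added to `VLLMChatModel.match_json` further down, `_preprocess_load_gguf` lets the vLLM engine serve a single-file GGUF quantization (vLLM >= 0.8.2): multi-part GGUF downloads must be merged first, and the tokenizer plus HF config are cached from the matching unquantized pytorch spec. A hedged launch sketch (model name and quantization are illustrative placeholders):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint
    model_uid = client.launch_model(
        model_name="qwen2.5-instruct",
        model_engine="vLLM",
        model_format="ggufv2",  # routed to vLLM via the new match_json branch
        quantization="q4_k_m",  # placeholder quantization name
    )
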
@@ -642,7 +701,11 @@ class VLLMModel(LLM):
         return sanitized
 
     @classmethod
-    def match(
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("vllm") is not None
+
+    @classmethod
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
@@ -755,10 +818,6 @@ class VLLMModel(LLM):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
         sanitized_generate_config = self._sanitize_generate_config(generate_config)
-        if self.reasoning_parser:
-            # For reasoning model, the </think> we be split into multiple words,
-            # if `stop` param is passed, so we pop it from config.
-            sanitized_generate_config.pop("stop")
         logger.debug(
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
@@ -935,10 +994,10 @@
 
 class VLLMChatModel(VLLMModel, ChatModelMixin):
     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "ggufv2"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -954,6 +1013,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             else:
                 if "4" not in quantization:
                     return False
+        if llm_spec.model_format == "ggufv2":
+            if not (VLLM_INSTALLED and vllm.__version__ >= "0.8.2"):
+                return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
                 return False
@@ -970,13 +1032,19 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> Dict:
         if not generate_config:
             generate_config = {}
-        if not generate_config.get("stop") and self.model_family.stop:
-            generate_config["stop"] = self.model_family.stop.copy()
-        if (
-            not generate_config.get("stop_token_ids")
-            and self.model_family.stop_token_ids
-        ):
-            generate_config["stop_token_ids"] = self.model_family.stop_token_ids.copy()
+        if "reasoning" in getattr(self.model_family, "model_ability", []):
+            generate_config.pop("stop", None)
+            generate_config.pop("stop_token_ids", None)
+        else:
+            if not generate_config.get("stop") and self.model_family.stop:
+                generate_config["stop"] = self.model_family.stop.copy()
+            if (
+                not generate_config.get("stop_token_ids")
+                and self.model_family.stop_token_ids
+            ):
+                generate_config[
+                    "stop_token_ids"
+                ] = self.model_family.stop_token_ids.copy()
         return generate_config
 
     @staticmethod
@@ -988,11 +1056,15 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         chunks: AsyncGenerator[CompletionChunk, None],
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         i = 0
+        previous_texts = [""]
+        if self.reasoning_parser:
+            chunks = self.reasoning_parser.prepare_reasoning_content(chunks)
         async for chunk in chunks:
             if i == 0:
-                yield self._get_first_chat_completion_chunk(
+                for first_chunk in self._get_first_chat_completion_chunk(
                     chunk, self.reasoning_parser
-                )
+                ):
+                    yield first_chunk
             # usage
             choices = chunk.get("choices")
             if not choices:
@@ -1006,7 +1078,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                     reasoning_parser=self.reasoning_parser,
                 )
             else:
-                yield self._to_chat_completion_chunk(chunk, self.reasoning_parser)
+                yield self._to_chat_completion_chunk(
+                    chunk, self.reasoning_parser, previous_texts
+                )
             i += 1
 
     @vllm_check
@@ -1018,7 +1092,12 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         tools = generate_config.pop("tools", []) if generate_config else None
         model_family = self.model_family.model_family or self.model_family.model_name
-        full_context_kwargs = {}
+        full_context_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
         if tools:
             if (
                 model_family in QWEN_TOOL_CALL_FAMILY
@@ -1055,7 +1134,7 @@
 
 class VLLMVisionModel(VLLMModel, ChatModelMixin):
     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
@@ -1136,7 +1215,12 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         if "internvl2" not in model_family.lower():
             from qwen_vl_utils import process_vision_info
 
-        full_context_kwargs = {}
+        full_context_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
         if tools and model_family in QWEN_TOOL_CALL_FAMILY:
             full_context_kwargs["tools"] = tools
         assert self.model_family.chat_template is not None
@@ -54,13 +54,14 @@ class WorkerActor(xo.StatelessActor):
         return f"VllmWorker_{rank}"
 
     def execute_method(self, method: Union[str, Callable], *args, **kwargs):
-        logger.debug(
-            "Calling method %s in vllm worker %s, args: %s, kwargs: %s",
-            method,
-            self.uid,
-            args,
-            kwargs,
-        )
+        # NOTE: too many logs, but useful for debug
+        # logger.debug(
+        #     "Calling method %s in vllm worker %s, args: %s, kwargs: %s",
+        #     method,
+        #     self.uid,
+        #     args,
+        #     kwargs,
+        # )
         if isinstance(method, str):
             return getattr(self._worker, method)(*args, **kwargs)
         else:
@@ -24,7 +24,7 @@ from .block import XavierPrefixCachingBlockAllocator
 class XavierCpuGpuBlockAllocator(CpuGpuBlockAllocator):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self._xavier_config: Optional[Dict[str, Any]] = None
+        self._xavier_config: Optional[Dict[str, Any]] = None  # type: ignore
 
     @property
     def xavier_config(self):
@@ -30,7 +30,7 @@ class XavierBlockManager(SelfAttnBlockSpaceManager):
         # Monkey patch
         CpuGpuBlockAllocator.create = XavierCpuGpuBlockAllocator.create
         super().__init__(*args, **kwargs)
-        self._xavier_config: Optional[Dict[str, Any]] = None
+        self._xavier_config: Optional[Dict[str, Any]] = None  # type: ignore
         logger.debug("Init xavier block manager done.")
 
     @property
@@ -25,10 +25,10 @@ class VLLMBlockTracker(xo.StatelessActor):
     def __init__(self):
         super().__init__()
         # engine -> hash -> (rank, block_id)
-        self._hash_to_rank_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}
+        self._hash_to_rank_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}  # type: ignore
         # engine -> rank -> (hash, block_id)
-        self._rank_to_hash_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}
-        self._unavailable_ranks: Set[int] = set()
+        self._rank_to_hash_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}  # type: ignore
+        self._unavailable_ranks: Set[int] = set()  # type: ignore
 
     def register_blocks(
         self, virtual_engine: int, block_infos: List[Tuple[int, int]], rank: int
@@ -38,7 +38,7 @@ class XavierExecutor(MultiprocessingDistributedExecutor):
         Retrieve the necessary transmission information from the `cache_engine`.
         """
         transfer_ref = await self._get_transfer_ref()
-        ref_cache_engine: CacheEngine = self.driver_worker.cache_engine[0]
+        ref_cache_engine: CacheEngine = self.driver_worker.cache_engine[0]  # type: ignore
         buffer_dtype = ref_cache_engine.dtype
         buffer_device = "cpu"
         buffer_pin_memory = is_pin_memory_available()
@@ -11,8 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-import sys
 
 import pytest
 import xoscar as xo
@@ -30,14 +28,7 @@ class ExtendedBlockTracker(VLLMBlockTracker):
 
 @pytest.fixture
 async def actor_pool_context():
-    start_method = (
-        os.environ.get("POOL_START_METHOD", "forkserver")
-        if sys.platform != "win32"
-        else None
-    )
-    pool = await xo.create_actor_pool(
-        "127.0.0.1", n_process=2, subprocess_start_method=start_method
-    )
+    pool = await xo.create_actor_pool("127.0.0.1", n_process=2)
     async with pool:
        yield pool
 
@@ -46,7 +37,7 @@ async def actor_pool_context():
 async def test_block_tracker(actor_pool_context):
     actor_pool = actor_pool_context
     addr = actor_pool.external_address
-    tracker_ref: xo.ActorRefType[ExtendedBlockTracker] = await xo.create_actor(
+    tracker_ref: xo.ActorRefType[ExtendedBlockTracker] = await xo.create_actor(  # type: ignore
         ExtendedBlockTracker,
         address=addr,
         uid=VLLMBlockTracker.default_uid(),