xinference 1.5.1__py3-none-any.whl → 1.6.0.post1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.

Note: this version of xinference has been flagged as a potentially problematic release.

Files changed (96)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +97 -8
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/core/media_interface.py +758 -0
  5. xinference/core/model.py +49 -9
  6. xinference/core/worker.py +31 -37
  7. xinference/deploy/utils.py +0 -3
  8. xinference/model/audio/__init__.py +16 -27
  9. xinference/model/audio/core.py +1 -0
  10. xinference/model/audio/cosyvoice.py +4 -2
  11. xinference/model/audio/model_spec.json +20 -3
  12. xinference/model/audio/model_spec_modelscope.json +18 -1
  13. xinference/model/embedding/__init__.py +16 -24
  14. xinference/model/image/__init__.py +15 -25
  15. xinference/model/llm/__init__.py +37 -110
  16. xinference/model/llm/core.py +15 -6
  17. xinference/model/llm/llama_cpp/core.py +25 -353
  18. xinference/model/llm/llm_family.json +613 -89
  19. xinference/model/llm/llm_family.py +9 -1
  20. xinference/model/llm/llm_family_modelscope.json +540 -90
  21. xinference/model/llm/mlx/core.py +6 -3
  22. xinference/model/llm/reasoning_parser.py +281 -5
  23. xinference/model/llm/sglang/core.py +16 -3
  24. xinference/model/llm/transformers/chatglm.py +2 -2
  25. xinference/model/llm/transformers/cogagent.py +1 -1
  26. xinference/model/llm/transformers/cogvlm2.py +1 -1
  27. xinference/model/llm/transformers/core.py +9 -3
  28. xinference/model/llm/transformers/glm4v.py +1 -1
  29. xinference/model/llm/transformers/minicpmv26.py +1 -1
  30. xinference/model/llm/transformers/qwen-omni.py +6 -0
  31. xinference/model/llm/transformers/qwen_vl.py +1 -1
  32. xinference/model/llm/utils.py +68 -45
  33. xinference/model/llm/vllm/core.py +38 -18
  34. xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -10
  35. xinference/model/rerank/__init__.py +13 -24
  36. xinference/model/video/__init__.py +15 -25
  37. xinference/model/video/core.py +3 -3
  38. xinference/model/video/diffusers.py +133 -16
  39. xinference/model/video/model_spec.json +54 -0
  40. xinference/model/video/model_spec_modelscope.json +56 -0
  41. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  42. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  43. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  44. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  45. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  46. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  47. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  48. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  49. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  50. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  51. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  52. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  53. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  54. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  55. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  56. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  57. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  58. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  59. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  60. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  61. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  62. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  63. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  64. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  65. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  66. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  67. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  68. xinference/types.py +0 -71
  69. xinference/web/ui/build/asset-manifest.json +3 -3
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  72. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  79. xinference/web/ui/src/locales/en.json +6 -4
  80. xinference/web/ui/src/locales/zh.json +6 -4
  81. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/METADATA +59 -39
  82. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/RECORD +87 -87
  83. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/WHEEL +1 -1
  84. xinference/core/image_interface.py +0 -377
  85. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  86. xinference/web/ui/build/static/js/main.91e77b5c.js +0 -3
  87. xinference/web/ui/build/static/js/main.91e77b5c.js.map +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.91e77b5c.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  94. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/entry_points.txt +0 -0
  95. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/licenses/LICENSE +0 -0
  96. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py

@@ -42,6 +42,7 @@ from ...types import (
  ChatCompletion,
  ChatCompletionChoice,
  ChatCompletionChunk,
+ ChatCompletionChunkChoice,
  ChatCompletionChunkDelta,
  ChatCompletionMessage,
  Completion,
@@ -68,8 +69,11 @@ QWEN_TOOL_CALL_FAMILY = [
  "qwen2-moe-instruct",
  "qwen2.5-instruct",
  "qwen2.5-coder-instruct",
+ "XiYanSQL-QwenCoder-2504",
  "QwQ-32B",
  "qwen3",
+ "HuatuoGPT-o1-Qwen2.5",
+ "DianJin-R1",
  ]

  GLM4_TOOL_CALL_FAMILY = [
@@ -79,6 +83,7 @@ GLM4_TOOL_CALL_FAMILY = [

  LLAMA3_TOOL_CALL_FAMILY = [
  "llama-3.1-instruct",
+ "HuatuoGPT-o1-LLaMA-3.1",
  ]

  DEEPSEEK_TOOL_CALL_FAMILY = [
@@ -160,7 +165,12 @@ class ChatModelMixin:
  @staticmethod
  def _get_chat_template_kwargs_from_generate_config(
  generate_config: Optional[Union[dict, Any]],
+ reasoning_parser: Optional[ReasoningParser] = None,
  ) -> Optional[dict]:
+ if reasoning_parser and not reasoning_parser.enable_thinking:
+ # hybrid model like qwen3,
+ # disabled thinking
+ return {"enable_thinking": False}
  if not generate_config:
  return None
  if "chat_template_kwargs" in generate_config:
@@ -285,7 +295,7 @@ class ChatModelMixin:
  and "delta" in choices[0]
  ):
  if choices[0]["finish_reason"] is None:
- if reasoning_parser is not None:
+ if reasoning_parser and reasoning_parser.check_content_parser():
  # process parsing reasoning content
  assert previous_texts is not None
  delta = choices[0]["delta"] # type: ignore
@@ -302,7 +312,7 @@ class ChatModelMixin:
  delta = choices[0]["delta"] # type: ignore
  if "content" not in delta:
  delta["content"] = "" # type: ignore
- if reasoning_parser is not None:
+ if reasoning_parser and reasoning_parser.check_content_parser():
  delta["reasoning_content"] = None # type: ignore
  # Already a ChatCompletionChunk, we don't need to convert chunk.
  return cast(ChatCompletionChunk, chunk)
@@ -311,7 +321,7 @@ class ChatModelMixin:
  for i, choice in enumerate(choices): # type: ignore
  delta = ChatCompletionChunkDelta()
  if "text" in choice and choice["finish_reason"] is None:
- if reasoning_parser is None:
+ if not reasoning_parser or not reasoning_parser.check_content_parser():
  delta["content"] = choice["text"]
  else:
  assert previous_texts is not None
@@ -324,7 +334,7 @@ class ChatModelMixin:
  previous_texts[-1] = current_text
  elif "text" in choice and choice["finish_reason"] is not None:
  delta["content"] = choice["text"]
- if reasoning_parser is not None:
+ if reasoning_parser and reasoning_parser.check_content_parser():
  delta["reasoning_content"] = None
  elif "tool_calls" in choice:
  delta["tool_calls"] = choice["tool_calls"]
@@ -338,7 +348,9 @@ class ChatModelMixin:
  assert choices is not None
  usage = (
  chunk["usage"]
- if choices[0]["finish_reason"] is not None and reasoning_parser is not None
+ if choices[0]["finish_reason"] is not None
+ and reasoning_parser
+ and reasoning_parser.check_content_parser()
  else None
  )
  chat_chunk = {
@@ -356,28 +368,32 @@ class ChatModelMixin:
  cls,
  chunk: CompletionChunk,
  reasoning_parser: Optional[ReasoningParser] = None,
- ) -> ChatCompletionChunk:
- choices_list = []
+ ) -> List[ChatCompletionChunk]:
+ choices_list: List[ChatCompletionChunkChoice] = []
+ chunks: List[ChatCompletionChunk] = []
  for i, choice in enumerate(chunk["choices"]):
  delta = ChatCompletionChunkDelta(role="assistant", content="")
- if reasoning_parser is not None:
+ if reasoning_parser and reasoning_parser.check_content_parser():
  delta["content"] = None
  delta["reasoning_content"] = ""
  choices_list.append(
- {
- "index": i,
- "delta": delta,
- "finish_reason": None,
- }
+ ChatCompletionChunkChoice(
+ index=i,
+ delta=delta,
+ finish_reason=None,
+ )
  )
- chat_chunk = {
- "id": "chat" + chunk["id"],
- "model": chunk["model"],
- "created": chunk["created"],
- "object": "chat.completion.chunk",
- "choices": choices_list,
- }
- return cast(ChatCompletionChunk, chat_chunk)
+ chat_chunk = ChatCompletionChunk(
+ id="chat" + chunk["id"],
+ model=chunk["model"],
+ created=chunk["created"],
+ object="chat.completion.chunk",
+ choices=choices_list,
+ )
+ chunks.append(chat_chunk)
+ if reasoning_parser:
+ chunks.extend(reasoning_parser.prepare_first_reasoning_content_chunk(chunk))
+ return chunks

  @classmethod
  def _get_final_chat_completion_chunk(
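
_get_first_chat_completion_chunk now returns a list instead of a single chunk, so the parser's prepare_first_reasoning_content_chunk hook can append additional chunks; callers iterate over the result (see the VLLMChatModel hunk further down). For reference, the role-priming chunk the loop above builds for a single choice, when the parser's content parser is active, has this shape (the id, model, and timestamp values here are made up):

    priming_chunk = {
        "id": "chat" + "cmpl-xxxx",            # "chat" prefix as in the hunk
        "model": "qwen3",                      # example model name
        "created": 1700000000,                 # example timestamp
        "object": "chat.completion.chunk",
        "choices": [
            {
                "index": 0,
                "delta": {
                    "role": "assistant",
                    "content": None,           # content withheld for now
                    "reasoning_content": "",   # reasoning streams first
                },
                "finish_reason": None,
            }
        ],
    }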
@@ -402,6 +418,8 @@ class ChatModelMixin:
  reasoning_parse: Optional[ReasoningParser] = None,
  ) -> Iterator[ChatCompletionChunk]:
  previous_texts = [""]
+ if reasoning_parse:
+ chunks = reasoning_parse.prepare_reasoning_content_sync(chunks)
  for _, chunk in enumerate(chunks):
  # usage
  choices = chunk.get("choices")
@@ -449,6 +467,9 @@ class ChatModelMixin:
  reasoning_parser: Optional[ReasoningParser] = None,
  ) -> AsyncGenerator[ChatCompletionChunk, None]:
  previous_texts = [""]
+ # Process chunks
+ if reasoning_parser:
+ chunks = reasoning_parser.prepare_reasoning_content_streaming(chunks)
  async for chunk in chunks:
  choices = chunk.get("choices")
  if not choices:
@@ -464,19 +485,25 @@ class ChatModelMixin:
  def _to_chat_completion(
  completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
  ) -> ChatCompletion:
+ # prepare reasoning content
+ if reasoning_parser:
+ completion = reasoning_parser.prepare_reasoning_content(completion)
+
  if completion.get("object") == "chat.completion" and completion.get("choices"):
  # Already a ChatCompletion
- if reasoning_parser is not None:
- for choice in completion["choices"]:
- message = choice["message"] # type: ignore
- text = message["content"]
+ for choice in completion["choices"]:
+ message = choice["message"] # type: ignore
+ text = message["content"] # Original content from the message
+
+ if reasoning_parser and reasoning_parser.check_content_parser():
+ # Parse into reasoning and content parts
  (
- reasoning_content,
- content,
+ reasoning_val,
+ content_val,
  ) = reasoning_parser.extract_reasoning_content(text)
- message["content"] = content
- if reasoning_content is not None:
- message["reasoning_content"] = reasoning_content
+ message["content"] = content_val
+ if reasoning_val is not None:
+ message["reasoning_content"] = reasoning_val
  return cast(ChatCompletion, completion)

  choices = []
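
_to_chat_completion now normalizes every choice and only splits reasoning out of the message when the parser's content parser is active. The real extraction lives in reasoning_parser.py (heavily extended in this release, +281 lines per the file list); a toy stand-in for extract_reasoning_content, assuming <think>...</think> delimiters, illustrates the contract:

    from typing import Optional, Tuple

    def split_reasoning(text: str) -> Tuple[Optional[str], str]:
        # Toy substitute for ReasoningParser.extract_reasoning_content:
        # returns (reasoning_content, content); the delimiters are an
        # assumption for illustration.
        start, end = "<think>", "</think>"
        if start in text and end in text:
            head, _, rest = text.partition(start)
            reasoning, _, tail = rest.partition(end)
            return reasoning.strip(), (head + tail).strip()
        return None, text

    assert split_reasoning("<think>2 + 2 = 4</think>The answer is 4.") == (
        "2 + 2 = 4",
        "The answer is 4.",
    )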
@@ -484,7 +511,7 @@ class ChatModelMixin:
  content = choice["text"]
  reasoning_content = None

- if reasoning_parser is not None:
+ if reasoning_parser and reasoning_parser.check_content_parser():
  reasoning_content, content = reasoning_parser.extract_reasoning_content( # type: ignore
  choice
  )
@@ -681,20 +708,12 @@ class ChatModelMixin:
  failed_contents.append(content)
  finish_reason = "tool_calls" if tool_calls else "stop"

- reasoning_content = None
  content = ". ".join(failed_contents) if failed_contents else None
- if reasoning_parser is not None:
- reasoning_content, content = reasoning_parser.extract_reasoning_content( # type: ignore
- content
- )
  d = {
  "role": "assistant",
  "content": content,
  "tool_calls": tool_calls,
  }
- # add only reasoning_content is None
- if reasoning_content is not None:
- d["reasoning_content"] = reasoning_content

  try:
  usage = c.get("usage")
@@ -729,7 +748,17 @@ class ChatModelMixin:
  c,
  reasoning_parser: Optional[ReasoningParser] = None,
  ):
+ if reasoning_parser:
+ c = reasoning_parser.prepare_reasoning_content(c)
  _id = str(uuid.uuid4())
+ reasoning_content = None
+ if reasoning_parser and reasoning_parser.check_content_parser():
+ text = c["choices"][0]["text"]
+ reasoning_content, content = reasoning_parser.extract_reasoning_content(
+ text
+ )
+ c["choices"][0]["text"] = content
+
  tool_result = cls._eval_tool_arguments(model_family, c)

  tool_calls = []
@@ -750,12 +779,6 @@ class ChatModelMixin:
  failed_contents.append(content)
  finish_reason = "tool_calls" if tool_calls else "stop"

- reasoning_content = None
- content = ". ".join(failed_contents) if failed_contents else None
- if reasoning_parser is not None:
- reasoning_content, content = reasoning_parser.extract_reasoning_content( # type: ignore
- content
- )
  m = {
  "role": "assistant",
  "content": content,
xinference/model/llm/vllm/core.py

@@ -170,6 +170,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
  VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
  VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
  VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
+ VLLM_SUPPORTED_CHAT_MODELS.append("XiYanSQL-QwenCoder-2504")
  VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
  VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B")
  VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
@@ -177,6 +178,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
  VLLM_SUPPORTED_CHAT_MODELS.append("fin-r1")
  VLLM_SUPPORTED_CHAT_MODELS.append("seallms-v3")
  VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1-preview")
+ VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1")
+ VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-Qwen2.5")
+ VLLM_SUPPORTED_CHAT_MODELS.append("DianJin-R1")

  if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
  VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -207,6 +211,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
  VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
  VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.3-instruct")
  VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-llama")
+ VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-LLaMA-3.1")

  if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
  VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
@@ -347,8 +352,10 @@ class VLLMModel(LLM):
  self._device_count = self._get_cuda_count()
  self._model_config = self._sanitize_model_config(self._model_config)
  reasoning_content = self._model_config.pop("reasoning_content")
-
- self.prepare_parse_reasoning_content(reasoning_content)
+ enable_thinking = self._model_config.pop("enable_thinking", False)
+ self.prepare_parse_reasoning_content(
+ reasoning_content, enable_thinking=enable_thinking
+ )

  if (
  isinstance(self.model_spec, LlamaCppLLMSpecV1)
@@ -811,10 +818,6 @@ class VLLMModel(LLM):
  raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

  sanitized_generate_config = self._sanitize_generate_config(generate_config)
- if self.reasoning_parser:
- # For reasoning model, the </think> we be split into multiple words,
- # if `stop` param is passed, so we pop it from config.
- sanitized_generate_config.pop("stop")
  logger.debug(
  "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
  )
@@ -1029,13 +1032,19 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  ) -> Dict:
  if not generate_config:
  generate_config = {}
- if not generate_config.get("stop") and self.model_family.stop:
- generate_config["stop"] = self.model_family.stop.copy()
- if (
- not generate_config.get("stop_token_ids")
- and self.model_family.stop_token_ids
- ):
- generate_config["stop_token_ids"] = self.model_family.stop_token_ids.copy()
+ if "reasoning" in getattr(self.model_family, "model_ability", []):
+ generate_config.pop("stop", None)
+ generate_config.pop("stop_token_ids", None)
+ else:
+ if not generate_config.get("stop") and self.model_family.stop:
+ generate_config["stop"] = self.model_family.stop.copy()
+ if (
+ not generate_config.get("stop_token_ids")
+ and self.model_family.stop_token_ids
+ ):
+ generate_config[
+ "stop_token_ids"
+ ] = self.model_family.stop_token_ids.copy()
  return generate_config

  @staticmethod
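
The stop-word handling moves here from generate(): instead of unconditionally popping `stop` whenever a reasoning parser exists (the deleted block in the previous hunk, whose comment notes that a stop string can cut `</think>` into pieces), _sanitize_chat_config now drops both `stop` and `stop_token_ids` only for families whose model_ability includes "reasoning", and otherwise falls back to the family defaults. A compact sketch of the new rule (model_ability and family_stop stand in for the real model-family fields):

    def sanitize_chat_config(generate_config: dict, model_ability, family_stop):
        # Reasoning models: never honor stop words, so </think> survives.
        if "reasoning" in model_ability:
            generate_config.pop("stop", None)
            generate_config.pop("stop_token_ids", None)
        # Everyone else: fall back to the family's default stop words.
        elif not generate_config.get("stop") and family_stop:
            generate_config["stop"] = list(family_stop)
        return generate_config

    assert "stop" not in sanitize_chat_config(
        {"stop": ["\n\n"]},
        model_ability=["chat", "reasoning"],
        family_stop=["<|im_end|>"],
    )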
@@ -1047,11 +1056,15 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  chunks: AsyncGenerator[CompletionChunk, None],
  ) -> AsyncGenerator[ChatCompletionChunk, None]:
  i = 0
+ previous_texts = [""]
+ if self.reasoning_parser:
+ chunks = self.reasoning_parser.prepare_reasoning_content(chunks)
  async for chunk in chunks:
  if i == 0:
- yield self._get_first_chat_completion_chunk(
+ for first_chunk in self._get_first_chat_completion_chunk(
  chunk, self.reasoning_parser
- )
+ ):
+ yield first_chunk
  # usage
  choices = chunk.get("choices")
  if not choices:
@@ -1065,7 +1078,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  reasoning_parser=self.reasoning_parser,
  )
  else:
- yield self._to_chat_completion_chunk(chunk, self.reasoning_parser)
+ yield self._to_chat_completion_chunk(
+ chunk, self.reasoning_parser, previous_texts
+ )
  i += 1

  @vllm_check
@@ -1078,7 +1093,10 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  tools = generate_config.pop("tools", []) if generate_config else None
  model_family = self.model_family.model_family or self.model_family.model_name
  full_context_kwargs = (
- self._get_chat_template_kwargs_from_generate_config(generate_config) or {}
+ self._get_chat_template_kwargs_from_generate_config(
+ generate_config, self.reasoning_parser
+ )
+ or {}
  )
  if tools:
  if (
@@ -1198,7 +1216,9 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
  from qwen_vl_utils import process_vision_info

  full_context_kwargs = (
- self._get_chat_template_kwargs_from_generate_config(generate_config)
+ self._get_chat_template_kwargs_from_generate_config(
+ generate_config, self.reasoning_parser
+ )
  or {}
  )
  if tools and model_family in QWEN_TOOL_CALL_FAMILY:
xinference/model/llm/vllm/xavier/test/test_xavier.py

@@ -11,8 +11,6 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- import os
- import sys

  import pytest
  import xoscar as xo
@@ -30,14 +28,7 @@ class ExtendedBlockTracker(VLLMBlockTracker):

  @pytest.fixture
  async def actor_pool_context():
- start_method = (
- os.environ.get("POOL_START_METHOD", "forkserver")
- if sys.platform != "win32"
- else None
- )
- pool = await xo.create_actor_pool(
- "127.0.0.1", n_process=2, subprocess_start_method=start_method
- )
+ pool = await xo.create_actor_pool("127.0.0.1", n_process=2)
  async with pool:
  yield pool
xinference/model/rerank/__init__.py

@@ -56,29 +56,8 @@ def register_custom_model():


  def _install():
- _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
- _model_spec_modelscope_json = os.path.join(
- os.path.dirname(__file__), "model_spec_modelscope.json"
- )
- BUILTIN_RERANK_MODELS.update(
- dict(
- (spec["model_name"], RerankModelSpec(**spec))
- for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
- )
- )
- for model_name, model_spec in BUILTIN_RERANK_MODELS.items():
- MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
-
- MODELSCOPE_RERANK_MODELS.update(
- dict(
- (spec["model_name"], RerankModelSpec(**spec))
- for spec in json.load(
- codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
- )
- )
- )
- for model_name, model_spec in MODELSCOPE_RERANK_MODELS.items():
- MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+ load_model_family_from_json("model_spec.json", BUILTIN_RERANK_MODELS)
+ load_model_family_from_json("model_spec_modelscope.json", MODELSCOPE_RERANK_MODELS)

  # register model description after recording model revision
  for model_spec_info in [BUILTIN_RERANK_MODELS, MODELSCOPE_RERANK_MODELS]:
@@ -94,5 +73,15 @@ def _install():
  for ud_rerank in get_user_defined_reranks():
  RERANK_MODEL_DESCRIPTIONS.update(generate_rerank_description(ud_rerank))

+
+ def load_model_family_from_json(json_filename, target_families):
+ _model_spec_json = os.path.join(os.path.dirname(__file__), json_filename)
+ target_families.update(
+ dict(
+ (spec["model_name"], RerankModelSpec(**spec))
+ for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
+ )
+ )
+ for model_name, model_spec in target_families.items():
+ MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
  del _model_spec_json
- del _model_spec_modelscope_json
xinference/model/video/__init__.py

@@ -30,29 +30,8 @@ from .core import (


  def _install():
- _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
- _model_spec_modelscope_json = os.path.join(
- os.path.dirname(__file__), "model_spec_modelscope.json"
- )
- BUILTIN_VIDEO_MODELS.update(
- dict(
- (spec["model_name"], VideoModelFamilyV1(**spec))
- for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
- )
- )
- for model_name, model_spec in BUILTIN_VIDEO_MODELS.items():
- MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
-
- MODELSCOPE_VIDEO_MODELS.update(
- dict(
- (spec["model_name"], VideoModelFamilyV1(**spec))
- for spec in json.load(
- codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
- )
- )
- )
- for model_name, model_spec in MODELSCOPE_VIDEO_MODELS.items():
- MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+ load_model_family_from_json("model_spec.json", BUILTIN_VIDEO_MODELS)
+ load_model_family_from_json("model_spec_modelscope.json", MODELSCOPE_VIDEO_MODELS)

  # register model description
  for model_name, model_spec in chain(
@@ -60,5 +39,16 @@ def _install():
  ):
  VIDEO_MODEL_DESCRIPTIONS.update(generate_video_description(model_spec))

- del _model_spec_json
- del _model_spec_modelscope_json
+
+ def load_model_family_from_json(json_filename, target_families):
+ json_path = os.path.join(os.path.dirname(__file__), json_filename)
+ target_families.update(
+ dict(
+ (spec["model_name"], VideoModelFamilyV1(**spec))
+ for spec in json.load(codecs.open(json_path, "r", encoding="utf-8"))
+ )
+ )
+ for model_name, model_spec in target_families.items():
+ MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+
+ del json_path
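
The rerank and video registries each replace their duplicated JSON-loading blocks with a local load_model_family_from_json helper defined inside _install(); judging by the similar +/- counts in the file list, the audio, embedding, image, and llm registries receive the same treatment. A generalized form of the helper, with the spec class and revision map lifted into parameters (the in-tree helpers instead close over RerankModelSpec/VideoModelFamilyV1 and MODEL_NAME_TO_REVISION):

    import codecs
    import json
    import os

    def load_model_family_from_json(json_filename, target_families, spec_cls, revisions):
        # Load a spec file that sits next to this module and index it by name.
        json_path = os.path.join(os.path.dirname(__file__), json_filename)
        target_families.update(
            (spec["model_name"], spec_cls(**spec))
            for spec in json.load(codecs.open(json_path, "r", encoding="utf-8"))
        )
        # Record each model's pinned revision; `revisions` is expected to be
        # a defaultdict(list), like MODEL_NAME_TO_REVISION.
        for model_name, model_spec in target_families.items():
            revisions[model_name].append(model_spec.model_revision)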
xinference/model/video/core.py

@@ -19,7 +19,7 @@ from typing import Any, Dict, List, Literal, Optional, Tuple
  from ...constants import XINFERENCE_CACHE_DIR
  from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
  from ..utils import valid_model_revision
- from .diffusers import DiffUsersVideoModel
+ from .diffusers import DiffusersVideoModel

  logger = logging.getLogger(__name__)

@@ -169,13 +169,13 @@ def create_video_model_instance(
  ] = None,
  model_path: Optional[str] = None,
  **kwargs,
- ) -> Tuple[DiffUsersVideoModel, VideoModelDescription]:
+ ) -> Tuple[DiffusersVideoModel, VideoModelDescription]:
  model_spec = match_diffusion(model_name, download_hub)
  if not model_path:
  model_path = cache(model_spec)
  assert model_path is not None

- model = DiffUsersVideoModel(
+ model = DiffusersVideoModel(
  model_uid,
  model_path,
  model_spec,