xinference 1.7.1.post1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/client/restful/async_restful_client.py +8 -13
- xinference/client/restful/restful_client.py +6 -2
- xinference/core/chat_interface.py +6 -4
- xinference/core/media_interface.py +5 -0
- xinference/core/model.py +1 -5
- xinference/core/supervisor.py +117 -68
- xinference/core/worker.py +49 -37
- xinference/deploy/test/test_cmdline.py +2 -6
- xinference/model/audio/__init__.py +26 -23
- xinference/model/audio/chattts.py +3 -2
- xinference/model/audio/core.py +49 -98
- xinference/model/audio/cosyvoice.py +3 -2
- xinference/model/audio/custom.py +28 -73
- xinference/model/audio/f5tts.py +3 -2
- xinference/model/audio/f5tts_mlx.py +3 -2
- xinference/model/audio/fish_speech.py +3 -2
- xinference/model/audio/funasr.py +17 -4
- xinference/model/audio/kokoro.py +3 -2
- xinference/model/audio/megatts.py +3 -2
- xinference/model/audio/melotts.py +3 -2
- xinference/model/audio/model_spec.json +572 -171
- xinference/model/audio/utils.py +0 -6
- xinference/model/audio/whisper.py +3 -2
- xinference/model/audio/whisper_mlx.py +3 -2
- xinference/model/cache_manager.py +141 -0
- xinference/model/core.py +6 -49
- xinference/model/custom.py +174 -0
- xinference/model/embedding/__init__.py +67 -56
- xinference/model/embedding/cache_manager.py +35 -0
- xinference/model/embedding/core.py +104 -84
- xinference/model/embedding/custom.py +55 -78
- xinference/model/embedding/embed_family.py +80 -31
- xinference/model/embedding/flag/core.py +21 -5
- xinference/model/embedding/llama_cpp/__init__.py +0 -0
- xinference/model/embedding/llama_cpp/core.py +234 -0
- xinference/model/embedding/model_spec.json +968 -103
- xinference/model/embedding/sentence_transformers/core.py +30 -20
- xinference/model/embedding/vllm/core.py +11 -5
- xinference/model/flexible/__init__.py +8 -2
- xinference/model/flexible/core.py +26 -119
- xinference/model/flexible/custom.py +69 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -0
- xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
- xinference/model/flexible/launchers/transformers_launcher.py +15 -3
- xinference/model/flexible/launchers/yolo_launcher.py +5 -1
- xinference/model/image/__init__.py +20 -20
- xinference/model/image/cache_manager.py +62 -0
- xinference/model/image/core.py +70 -182
- xinference/model/image/custom.py +28 -72
- xinference/model/image/model_spec.json +402 -119
- xinference/model/image/ocr/got_ocr2.py +3 -2
- xinference/model/image/stable_diffusion/core.py +22 -7
- xinference/model/image/stable_diffusion/mlx.py +6 -6
- xinference/model/image/utils.py +2 -2
- xinference/model/llm/__init__.py +71 -94
- xinference/model/llm/cache_manager.py +292 -0
- xinference/model/llm/core.py +37 -111
- xinference/model/llm/custom.py +88 -0
- xinference/model/llm/llama_cpp/core.py +5 -7
- xinference/model/llm/llm_family.json +16260 -8151
- xinference/model/llm/llm_family.py +138 -839
- xinference/model/llm/lmdeploy/core.py +5 -7
- xinference/model/llm/memory.py +3 -4
- xinference/model/llm/mlx/core.py +6 -8
- xinference/model/llm/reasoning_parser.py +3 -1
- xinference/model/llm/sglang/core.py +32 -14
- xinference/model/llm/transformers/chatglm.py +3 -7
- xinference/model/llm/transformers/core.py +49 -27
- xinference/model/llm/transformers/deepseek_v2.py +2 -2
- xinference/model/llm/transformers/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
- xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
- xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
- xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
- xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
- xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
- xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
- xinference/model/llm/transformers/opt.py +3 -7
- xinference/model/llm/utils.py +34 -49
- xinference/model/llm/vllm/core.py +77 -27
- xinference/model/llm/vllm/xavier/engine.py +5 -3
- xinference/model/llm/vllm/xavier/scheduler.py +10 -6
- xinference/model/llm/vllm/xavier/transfer.py +1 -1
- xinference/model/rerank/__init__.py +26 -25
- xinference/model/rerank/core.py +47 -87
- xinference/model/rerank/custom.py +25 -71
- xinference/model/rerank/model_spec.json +158 -33
- xinference/model/rerank/utils.py +2 -2
- xinference/model/utils.py +115 -54
- xinference/model/video/__init__.py +13 -17
- xinference/model/video/core.py +44 -102
- xinference/model/video/diffusers.py +4 -3
- xinference/model/video/model_spec.json +90 -21
- xinference/types.py +5 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
- xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
- xinference/web/ui/src/locales/en.json +0 -1
- xinference/web/ui/src/locales/ja.json +0 -1
- xinference/web/ui/src/locales/ko.json +0 -1
- xinference/web/ui/src/locales/zh.json +0 -1
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
- xinference/model/audio/model_spec_modelscope.json +0 -231
- xinference/model/embedding/model_spec_modelscope.json +0 -293
- xinference/model/embedding/utils.py +0 -18
- xinference/model/image/model_spec_modelscope.json +0 -375
- xinference/model/llm/llama_cpp/memory.py +0 -457
- xinference/model/llm/llm_family_csghub.json +0 -56
- xinference/model/llm/llm_family_modelscope.json +0 -8700
- xinference/model/llm/llm_family_openmind_hub.json +0 -1019
- xinference/model/rerank/model_spec_modelscope.json +0 -85
- xinference/model/video/model_spec_modelscope.json +0 -184
- xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
- /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/lmdeploy/core.py CHANGED

@@ -20,7 +20,7 @@ import torch

 from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA
 from ..core import LLM
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin, generate_chat_completion, generate_completion_chunk

 logger = logging.getLogger(__name__)

@@ -76,14 +76,12 @@ class LMDeployModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         model_config: Optional[LMDeployModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
     ):
-        super().__init__(model_uid, model_family,
+        super().__init__(model_uid, model_family, model_path)
         self._model_config: LMDeployModelConfig = self._sanitize_model_config(
             model_config
         )

@@ -119,7 +117,7 @@ class LMDeployModel(LLM):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         return False

@@ -172,7 +170,7 @@ class LMDeployChatModel(LMDeployModel, ChatModelMixin):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format == "awq":
             # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
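Note: the same constructor rewrite repeats across every backend in this diff (LMDeploy above, then MLX, SGLang and the Transformers family below): model_spec and quantization leave the parameter list, and the LLMFamilyV2 object becomes the one constructor argument carrying family metadata. A minimal sketch of the new call shape, using hypothetical stand-in classes rather than the real xinference types:

from dataclasses import dataclass, field
from typing import List

@dataclass
class SpecV2:
    # hypothetical stand-in for a model spec entry
    model_format: str
    quantization: str = "none"

@dataclass
class FamilyV2:
    # hypothetical stand-in for LLMFamilyV2; the real class carries much more
    model_name: str
    model_specs: List[SpecV2] = field(default_factory=list)

class BackendModel:
    # 1.8.0-style constructor: spec/quantization now ride inside the family object
    def __init__(self, model_uid: str, model_family: FamilyV2, model_path: str):
        self.model_uid = model_uid
        self.model_family = model_family
        self.model_path = model_path

# 1.7.x call shape, for contrast: BackendModel(uid, family, spec, quantization, path)
model = BackendModel(
    "qwen-0",
    FamilyV2("qwen2.5", [SpecV2("pytorch")]),
    "/models/qwen2.5",
)
print(model.model_family.model_specs[0].model_format)  # pytorch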
xinference/model/llm/memory.py CHANGED

@@ -214,16 +214,15 @@ def get_model_layers_info(
         logger.debug("get_model_layers_info by default size=%s", model_size_in_billions)
         size_in_billions = convert_model_size_to_float(model_size_in_billions)
         return _get_default_layers_from_size(size_in_billions)
-
+    llm_family = match_llm(
         model_name=model_name,
         model_format=model_format,
         model_size_in_billions=model_size_in_billions,
         quantization=quantization,
     )
-    if not
+    if not llm_family:
         return None
-
-    config_path = cache_model_config(llm_family, llm_spec)
+    config_path = cache_model_config(llm_family)
     return load_model_config_json(config_path)


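Note: the removed lines are truncated in this rendering, but the surviving fragments (if not …, cache_model_config(llm_family, llm_spec)) suggest the 1.7.x flow unpacked a match result into family plus spec before caching, while 1.8.0 passes only the matched family. A hedged sketch of the new control flow with stub helpers (the real match_llm and cache_model_config take more parameters):

from typing import Optional

def match_llm_stub(model_name: str) -> Optional[dict]:
    # stand-in: the real match_llm also filters by format, size and quantization
    known = {"qwen2.5": {"config": {"num_hidden_layers": 28}}}
    return known.get(model_name)

def get_model_config(model_name: str) -> Optional[dict]:
    llm_family = match_llm_stub(model_name)
    if not llm_family:
        return None
    # real code: config_path = cache_model_config(llm_family)
    #            return load_model_config_json(config_path)
    return llm_family["config"]

print(get_model_config("qwen2.5"))  # {'num_hidden_layers': 28}
print(get_model_config("missing"))  # None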
xinference/model/llm/mlx/core.py CHANGED

@@ -48,7 +48,7 @@ from ....types import (
     LoRA,
 )
 from ..core import LLM, chat_context_var
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,

@@ -98,14 +98,12 @@ class MLXModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         model_config: Optional[MLXModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
     ):
-        super().__init__(model_uid, model_family,
+        super().__init__(model_uid, model_family, model_path)
         self._use_fast_tokenizer = True
         self._model_config: MLXModelConfig = self._sanitize_model_config(model_config)
         # for distributed

@@ -370,7 +368,7 @@ class MLXModel(LLM):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
             return False

@@ -670,7 +668,7 @@ class MLXChatModel(MLXModel, ChatModelMixin):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
             return False

@@ -734,7 +732,7 @@ class MLXVisionModel(MLXModel, ChatModelMixin):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
             return False
xinference/model/llm/reasoning_parser.py CHANGED

@@ -165,7 +165,9 @@ class ReasoningParser:
         Returns:
             bool: True if reasoning content should be extracted, False otherwise
         """
-
+        if self.is_enable_thinking():
+            return self.reasoning_content
+        return False

     def _create_chat_completion_chunk(
         self, chunk: Union[Dict[str, Any], CompletionChunk], content: str
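Note: the three added lines gate reasoning extraction on two flags at once. A standalone sketch of that behavior with a stub class (field names mirror the diff; this is not the real ReasoningParser):

class ParserStub:
    def __init__(self, enable_thinking: bool, reasoning_content: bool):
        self._enable_thinking = enable_thinking
        self.reasoning_content = reasoning_content

    def is_enable_thinking(self) -> bool:
        return self._enable_thinking

    def should_extract(self) -> bool:
        # mirrors the added lines: reasoning_content only matters when thinking is on
        if self.is_enable_thinking():
            return self.reasoning_content
        return False

for thinking in (True, False):
    for content in (True, False):
        print(thinking, content, "->", ParserStub(thinking, content).should_extract())
# only (True, True) prints True: extraction requires both flags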
xinference/model/llm/sglang/core.py CHANGED

@@ -32,10 +32,15 @@ from ....types import (
     CompletionChunk,
     CompletionUsage,
 )
-from .. import LLM,
+from .. import LLM, LLMFamilyV2, LLMSpecV1
 from ..core import chat_context_var
-from ..llm_family import
-from ..utils import
+from ..llm_family import CustomLLMFamilyV2
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+    generate_completion_chunk,
+)

 logger = logging.getLogger(__name__)

@@ -131,13 +136,11 @@ class SGLANGModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         model_config: Optional[SGLANGModelConfig],
     ):
-        super().__init__(model_uid, model_family,
+        super().__init__(model_uid, model_family, model_path)
         self._model_config = model_config
         self._engine = None
         self._address = model_config.pop("address", None)  # type: ignore

@@ -319,7 +322,7 @@ class SGLANGModel(LLM):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
             return False

@@ -330,7 +333,7 @@ class SGLANGModel(LLM):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if isinstance(llm_family,
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in SGLANG_SUPPORTED_MODELS:
                 return False
         else:

@@ -547,14 +550,14 @@ class SGLANGModel(LLM):
 class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if isinstance(llm_family,
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in SGLANG_SUPPORTED_CHAT_MODELS:
                 return False
         else:

@@ -583,6 +586,9 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         assert self.model_family.chat_template is not None
+        # fix: Object of type list_iterator is not JSON serializable
+        tools = list(generate_config.pop("tools", [])) if generate_config else None
+        model_family = self.model_family.model_family or self.model_family.model_name
         chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser

@@ -591,6 +597,12 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         )
         chat_context_var.set(chat_template_kwargs)
         full_context_kwargs = chat_template_kwargs.copy()
+        if tools:
+            if (
+                model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in DEEPSEEK_TOOL_CALL_FAMILY
+            ):
+                full_context_kwargs["tools"] = tools
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
         )

@@ -599,17 +611,23 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         if stream:
             agen = await self.async_generate(full_prompt, generate_config=generate_config)  # type: ignore
             assert isinstance(agen, AsyncGenerator)
-            return self._async_to_chat_completion_chunks(
+            return self._async_to_chat_completion_chunks(
+                agen, self.reasoning_parser, chat_template_kwargs
+            )
         else:
             c = await self.async_generate(full_prompt, generate_config=generate_config)  # type: ignore
             assert not isinstance(c, AsyncGenerator)
+            if tools:
+                return self._post_process_completion(
+                    self.model_family, self.model_uid, c, self.reasoning_parser
+                )
             return self._to_chat_completion(c, self.reasoning_parser)


 class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
             return False

@@ -620,7 +638,7 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if isinstance(llm_family,
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in SGLANG_SUPPORTED_VISION_MODEL_LIST:
                 return False
         else:
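Note: the tools = list(generate_config.pop("tools", [])) line above works around the exact Python pitfall named in its comment: an iterator cannot be JSON-serialized, so tools supplied as an iterator would fail once the request is serialized. A minimal reproduction of the error and the fix:

import json

tools = iter([{"type": "function", "function": {"name": "get_weather"}}])

try:
    json.dumps({"tools": tools})
except TypeError as err:
    print(err)  # Object of type list_iterator is not JSON serializable

# materializing the iterator first makes it serializable (and reusable)
tools = list(tools)
print(json.dumps({"tools": tools}))

As the hunks above show, the materialized list is then injected into the chat template kwargs only for the Qwen and DeepSeek tool-call families, and a non-streaming completion produced with tools goes through _post_process_completion instead of the plain chat-completion conversion.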
xinference/model/llm/transformers/chatglm.py CHANGED

@@ -23,7 +23,7 @@ import torch
 from ....core.scheduler import InferenceRequest
 from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
 from ..core import chat_context_var
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..utils import (
     GLM4_TOOL_CALL_FAMILY,
     generate_chat_completion,

@@ -40,9 +40,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,

@@ -50,8 +48,6 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         super().__init__(
             model_uid,
             model_family,
-            model_spec,
-            quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
             peft_model=peft_model,

@@ -88,7 +84,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
xinference/model/llm/transformers/core.py CHANGED

@@ -38,7 +38,7 @@ from ....types import (
 )
 from ...utils import select_device
 from ..core import LLM, chat_context_var
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     LLAMA3_TOOL_CALL_FAMILY,

@@ -92,14 +92,12 @@ class PytorchModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
     ):
-        super().__init__(model_uid, model_family,
+        super().__init__(model_uid, model_family, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
             pytorch_model_config

@@ -345,7 +343,7 @@ class PytorchModel(LLM):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
@@ -367,14 +365,26 @@ class PytorchModel(LLM):
         data = []
         for r in reqs:
             real_len = seq_length - r.padding_len
-            x = torch.cat(
-                [
-                    torch.full((r.padding_len,), 0, dtype=torch.long),
-                    torch.ones((real_len,), dtype=torch.long),
-                ]
-            )
-            data.append(x)
             r.extra_kwargs["attention_mask_seq_len"] = real_len
+
+            if self._tokenizer.padding_side == "left":
+                # [PAD][PAD]...[TOKEN]
+                x = torch.cat(
+                    [
+                        torch.full((r.padding_len,), 0, dtype=torch.long),
+                        torch.ones((real_len,), dtype=torch.long),
+                    ]
+                )
+            else:  # right padding
+                # [TOKEN]...[PAD][PAD]
+                x = torch.cat(
+                    [
+                        torch.ones((real_len,), dtype=torch.long),
+                        torch.full((r.padding_len,), 0, dtype=torch.long),
+                    ]
+                )
+            data.append(x)
+
         return torch.stack(data).to(self._device)

     def build_decode_attention_mask(

@@ -388,14 +398,30 @@ class PytorchModel(LLM):
         data = []
         for r in reqs:
             r.extra_kwargs["attention_mask_seq_len"] += 1
-
-
-
-
-
-
-
-
+            if self._tokenizer.padding_side == "left":
+                attention_mask_seq_len = r.extra_kwargs["attention_mask_seq_len"]
+                pad_len = seq_length - attention_mask_seq_len
+                assert pad_len > 0, (
+                    f"pad_len must be greater than 0, got {pad_len} = "
+                    f"seq_length({seq_length}) - attention_mask_seq_len({attention_mask_seq_len})"
+                )
+                x = torch.cat(
+                    [
+                        torch.full((pad_len,), 0, dtype=torch.long),
+                        torch.ones((attention_mask_seq_len,), dtype=torch.long),
+                    ]
+                )
+            else:
+                max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs)
+                real_len = r.extra_kwargs["attention_mask_seq_len"]
+                pad_len = max_len - real_len
+
+                x = torch.cat(
+                    [
+                        torch.ones((real_len,), dtype=torch.long),
+                        torch.full((pad_len,), 0, dtype=torch.long),
+                    ]
+                )
             data.append(x)
         return torch.stack(data).to(self._device)

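Note: both mask builders now branch on the tokenizer's padding_side instead of assuming left padding. A self-contained sketch of the two layouts for a single request with three real tokens and two pads, independent of the xinference request plumbing:

import torch

pad_len, real_len = 2, 3

# left padding: [PAD][PAD][TOKEN][TOKEN][TOKEN]
left = torch.cat(
    [
        torch.full((pad_len,), 0, dtype=torch.long),
        torch.ones((real_len,), dtype=torch.long),
    ]
)

# right padding: [TOKEN][TOKEN][TOKEN][PAD][PAD]
right = torch.cat(
    [
        torch.ones((real_len,), dtype=torch.long),
        torch.full((pad_len,), 0, dtype=torch.long),
    ]
)

print(left.tolist())   # [0, 0, 1, 1, 1]
print(right.tolist())  # [1, 1, 1, 0, 0]

Note that in the decode hunk the right-padding branch pads each row to the batch maximum of attention_mask_seq_len, not to seq_length as the left-padding branch does.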
@@ -668,9 +694,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,

@@ -678,8 +702,6 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         super().__init__(
             model_uid,
             model_family,
-            model_spec,
-            quantization,
             model_path,
             pytorch_model_config,
             peft_model,

@@ -702,7 +724,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
xinference/model/llm/transformers/deepseek_v2.py CHANGED

@@ -15,7 +15,7 @@ import logging

 import torch

-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from .core import PytorchChatModel, register_non_default_model

 logger = logging.getLogger(__name__)

@@ -61,7 +61,7 @@ class DeepSeekV2PytorchChatModel(PytorchChatModel):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
xinference/model/llm/transformers/gemma3.py CHANGED

@@ -15,7 +15,7 @@ import logging
 from typing import Dict, List, Set

 from ....core.scheduler import InferenceRequest
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from .core import PytorchChatModel, register_non_default_model

 logger = logging.getLogger(__name__)

@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 class Gemma3TextChatModel(PytorchChatModel):
     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
xinference/model/llm/transformers/multimodal/cogagent.py CHANGED

@@ -21,7 +21,7 @@ import torch

 from .....model.utils import select_device
 from ...core import chat_context_var
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ...utils import _decode_image, parse_messages
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel

@@ -47,7 +47,7 @@ class CogAgentChatModel(PytorchMultiModalModel):

     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
         if "cogagent" in family.lower():
xinference/model/llm/transformers/multimodal/deepseek_vl2.py CHANGED

@@ -23,7 +23,7 @@ import requests
 import torch

 from .....model.utils import select_device
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel

@@ -39,7 +39,7 @@ class DeepSeekVL2ChatModel(PytorchMultiModalModel):

     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         llm_family = model_family.model_family or model_family.model_name
         if "deepseek-vl2" == llm_family.lower():
xinference/model/llm/transformers/multimodal/gemma3.py CHANGED

@@ -17,7 +17,7 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple

 from .....model.utils import select_device
 from .....types import PytorchModelConfig
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel

@@ -29,7 +29,7 @@ logger = logging.getLogger(__name__)
 class Gemma3ChatModel(PytorchMultiModalModel):
     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
|