xinference 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (136)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/async_restful_client.py +8 -13
  3. xinference/client/restful/restful_client.py +6 -2
  4. xinference/core/chat_interface.py +6 -4
  5. xinference/core/media_interface.py +5 -0
  6. xinference/core/model.py +1 -5
  7. xinference/core/supervisor.py +117 -68
  8. xinference/core/worker.py +49 -37
  9. xinference/deploy/test/test_cmdline.py +2 -6
  10. xinference/model/audio/__init__.py +26 -23
  11. xinference/model/audio/chattts.py +3 -2
  12. xinference/model/audio/core.py +49 -98
  13. xinference/model/audio/cosyvoice.py +3 -2
  14. xinference/model/audio/custom.py +28 -73
  15. xinference/model/audio/f5tts.py +3 -2
  16. xinference/model/audio/f5tts_mlx.py +3 -2
  17. xinference/model/audio/fish_speech.py +3 -2
  18. xinference/model/audio/funasr.py +17 -4
  19. xinference/model/audio/kokoro.py +3 -2
  20. xinference/model/audio/megatts.py +3 -2
  21. xinference/model/audio/melotts.py +3 -2
  22. xinference/model/audio/model_spec.json +572 -171
  23. xinference/model/audio/utils.py +0 -6
  24. xinference/model/audio/whisper.py +3 -2
  25. xinference/model/audio/whisper_mlx.py +3 -2
  26. xinference/model/cache_manager.py +141 -0
  27. xinference/model/core.py +6 -49
  28. xinference/model/custom.py +174 -0
  29. xinference/model/embedding/__init__.py +67 -56
  30. xinference/model/embedding/cache_manager.py +35 -0
  31. xinference/model/embedding/core.py +104 -84
  32. xinference/model/embedding/custom.py +55 -78
  33. xinference/model/embedding/embed_family.py +80 -31
  34. xinference/model/embedding/flag/core.py +21 -5
  35. xinference/model/embedding/llama_cpp/__init__.py +0 -0
  36. xinference/model/embedding/llama_cpp/core.py +234 -0
  37. xinference/model/embedding/model_spec.json +968 -103
  38. xinference/model/embedding/sentence_transformers/core.py +30 -20
  39. xinference/model/embedding/vllm/core.py +11 -5
  40. xinference/model/flexible/__init__.py +8 -2
  41. xinference/model/flexible/core.py +26 -119
  42. xinference/model/flexible/custom.py +69 -0
  43. xinference/model/flexible/launchers/image_process_launcher.py +1 -0
  44. xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
  45. xinference/model/flexible/launchers/transformers_launcher.py +15 -3
  46. xinference/model/flexible/launchers/yolo_launcher.py +5 -1
  47. xinference/model/image/__init__.py +20 -20
  48. xinference/model/image/cache_manager.py +62 -0
  49. xinference/model/image/core.py +70 -182
  50. xinference/model/image/custom.py +28 -72
  51. xinference/model/image/model_spec.json +402 -119
  52. xinference/model/image/ocr/got_ocr2.py +3 -2
  53. xinference/model/image/stable_diffusion/core.py +22 -7
  54. xinference/model/image/stable_diffusion/mlx.py +6 -6
  55. xinference/model/image/utils.py +2 -2
  56. xinference/model/llm/__init__.py +71 -94
  57. xinference/model/llm/cache_manager.py +292 -0
  58. xinference/model/llm/core.py +37 -111
  59. xinference/model/llm/custom.py +88 -0
  60. xinference/model/llm/llama_cpp/core.py +5 -7
  61. xinference/model/llm/llm_family.json +16260 -8151
  62. xinference/model/llm/llm_family.py +138 -839
  63. xinference/model/llm/lmdeploy/core.py +5 -7
  64. xinference/model/llm/memory.py +3 -4
  65. xinference/model/llm/mlx/core.py +6 -8
  66. xinference/model/llm/reasoning_parser.py +3 -1
  67. xinference/model/llm/sglang/core.py +32 -14
  68. xinference/model/llm/transformers/chatglm.py +3 -7
  69. xinference/model/llm/transformers/core.py +49 -27
  70. xinference/model/llm/transformers/deepseek_v2.py +2 -2
  71. xinference/model/llm/transformers/gemma3.py +2 -2
  72. xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
  73. xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
  74. xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
  75. xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
  76. xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
  77. xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
  78. xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
  79. xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
  80. xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
  81. xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
  82. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  83. xinference/model/llm/transformers/opt.py +3 -7
  84. xinference/model/llm/utils.py +34 -49
  85. xinference/model/llm/vllm/core.py +77 -27
  86. xinference/model/llm/vllm/xavier/engine.py +5 -3
  87. xinference/model/llm/vllm/xavier/scheduler.py +10 -6
  88. xinference/model/llm/vllm/xavier/transfer.py +1 -1
  89. xinference/model/rerank/__init__.py +26 -25
  90. xinference/model/rerank/core.py +47 -87
  91. xinference/model/rerank/custom.py +25 -71
  92. xinference/model/rerank/model_spec.json +158 -33
  93. xinference/model/rerank/utils.py +2 -2
  94. xinference/model/utils.py +115 -54
  95. xinference/model/video/__init__.py +13 -17
  96. xinference/model/video/core.py +44 -102
  97. xinference/model/video/diffusers.py +4 -3
  98. xinference/model/video/model_spec.json +90 -21
  99. xinference/types.py +5 -3
  100. xinference/web/ui/build/asset-manifest.json +3 -3
  101. xinference/web/ui/build/index.html +1 -1
  102. xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
  103. xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
  109. xinference/web/ui/src/locales/en.json +0 -1
  110. xinference/web/ui/src/locales/ja.json +0 -1
  111. xinference/web/ui/src/locales/ko.json +0 -1
  112. xinference/web/ui/src/locales/zh.json +0 -1
  113. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
  114. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
  115. xinference/model/audio/model_spec_modelscope.json +0 -231
  116. xinference/model/embedding/model_spec_modelscope.json +0 -293
  117. xinference/model/embedding/utils.py +0 -18
  118. xinference/model/image/model_spec_modelscope.json +0 -375
  119. xinference/model/llm/llama_cpp/memory.py +0 -457
  120. xinference/model/llm/llm_family_csghub.json +0 -56
  121. xinference/model/llm/llm_family_modelscope.json +0 -8700
  122. xinference/model/llm/llm_family_openmind_hub.json +0 -1019
  123. xinference/model/rerank/model_spec_modelscope.json +0 -85
  124. xinference/model/video/model_spec_modelscope.json +0 -184
  125. xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
  126. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
  132. /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
  133. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
  134. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
  135. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
  136. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0

xinference/model/llm/lmdeploy/core.py

@@ -20,7 +20,7 @@ import torch
 
 from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA
 from ..core import LLM
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin, generate_chat_completion, generate_completion_chunk
 
 logger = logging.getLogger(__name__)
@@ -76,14 +76,12 @@ class LMDeployModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         model_config: Optional[LMDeployModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
     ):
-        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+        super().__init__(model_uid, model_family, model_path)
         self._model_config: LMDeployModelConfig = self._sanitize_model_config(
             model_config
         )
@@ -119,7 +117,7 @@ class LMDeployModel(LLM):
 
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         return False
 
@@ -172,7 +170,7 @@ class LMDeployChatModel(LMDeployModel, ChatModelMixin):
 
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format == "awq":
             # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
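
This signature change repeats across every backend touched by this release (LMDeploy, MLX, SGLang, Transformers): model_spec and quantization are no longer separate constructor arguments, so the LLMFamilyV2 object evidently carries the selected spec and quantization itself. A minimal toy sketch of the refactor, using stand-in classes rather than the real xinference types:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class ToySpec:                  # stand-in for LLMSpecV1
        model_format: str
        quantization: str

    @dataclass
    class ToyFamilyV2:              # stand-in for LLMFamilyV2
        model_name: str
        model_specs: List[ToySpec]  # the chosen spec now travels with the family

    class ToyModel:
        # 1.7.x shape: __init__(self, uid, family, spec, quantization, path)
        # 1.8.x shape: spec and quantization are read off the family object
        def __init__(self, model_uid: str, model_family: ToyFamilyV2, model_path: str):
            self.model_uid = model_uid
            self.model_family = model_family
            self.model_spec = model_family.model_specs[0]
            self.model_path = model_path

    family = ToyFamilyV2("qwen2.5", [ToySpec("pytorch", "none")])
    model = ToyModel("my-model-0", family, "/tmp/qwen")
    print(model.model_spec.model_format)  # -> pytorch
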

xinference/model/llm/memory.py

@@ -214,16 +214,15 @@ def get_model_layers_info(
         logger.debug("get_model_layers_info by default size=%s", model_size_in_billions)
         size_in_billions = convert_model_size_to_float(model_size_in_billions)
         return _get_default_layers_from_size(size_in_billions)
-    match_result = match_llm(
+    llm_family = match_llm(
        model_name=model_name,
        model_format=model_format,
        model_size_in_billions=model_size_in_billions,
        quantization=quantization,
    )
-    if not match_result:
+    if not llm_family:
         return None
-    llm_family, llm_spec, _quant = match_result
-    config_path = cache_model_config(llm_family, llm_spec)
+    config_path = cache_model_config(llm_family)
     return load_model_config_json(config_path)
 
 
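match_llm previously returned a (family, spec, quantization) tuple that every caller had to unpack; here it returns the matched family object (or None) directly, so cache_model_config takes just the family. A toy illustration of the caller-side simplification, with invented stand-in functions:

    from typing import Optional, Tuple

    # 1.7.x-style lookup: a tuple out, unpacked at every call site
    def match_llm_v1(name: str) -> Optional[Tuple[str, str, str]]:
        return ("qwen2.5", "pytorch", "none") if name == "qwen2.5" else None

    # 1.8.x-style lookup: the family alone (here just a string), or None
    def match_llm_v2(name: str) -> Optional[str]:
        return "qwen2.5" if name == "qwen2.5" else None

    result = match_llm_v1("qwen2.5")
    if result:
        family, spec, _quant = result   # before: three-way unpack

    family = match_llm_v2("qwen2.5")    # after: one value to thread through
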

xinference/model/llm/mlx/core.py

@@ -48,7 +48,7 @@ from ....types import (
     LoRA,
 )
 from ..core import LLM, chat_context_var
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -98,14 +98,12 @@ class MLXModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         model_config: Optional[MLXModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
     ):
-        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+        super().__init__(model_uid, model_family, model_path)
         self._use_fast_tokenizer = True
         self._model_config: MLXModelConfig = self._sanitize_model_config(model_config)
         # for distributed
@@ -370,7 +368,7 @@
 
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
             return False
@@ -670,7 +668,7 @@ class MLXChatModel(MLXModel, ChatModelMixin):
 
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
             return False
@@ -734,7 +732,7 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
 
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
             return False

xinference/model/llm/reasoning_parser.py

@@ -165,7 +165,9 @@ class ReasoningParser:
         Returns:
             bool: True if reasoning content should be extracted, False otherwise
         """
-        return self.reasoning_content
+        if self.is_enable_thinking():
+            return self.reasoning_content
+        return False
 
     def _create_chat_completion_chunk(
         self, chunk: Union[Dict[str, Any], CompletionChunk], content: str
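
The new body gates extraction on the thinking switch: previously the reasoning_content flag was returned unconditionally, so reasoning could be extracted even when thinking mode was off. A self-contained toy model of the gate (only the two method names come from the diff; the class body is illustrative):

    class ToyReasoningParser:
        def __init__(self, reasoning_content: bool, enable_thinking: bool):
            self.reasoning_content = reasoning_content
            self._enable_thinking = enable_thinking

        def is_enable_thinking(self) -> bool:
            return self._enable_thinking

        def should_extract_reasoning(self) -> bool:
            # 1.8.0 behavior: the flag only counts when thinking is enabled
            if self.is_enable_thinking():
                return self.reasoning_content
            return False

    # reasoning_content=True no longer forces extraction when thinking is off
    assert ToyReasoningParser(True, enable_thinking=False).should_extract_reasoning() is False
    assert ToyReasoningParser(True, enable_thinking=True).should_extract_reasoning() is True
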

xinference/model/llm/sglang/core.py

@@ -32,10 +32,15 @@ from ....types import (
     CompletionChunk,
     CompletionUsage,
 )
-from .. import LLM, LLMFamilyV1, LLMSpecV1
+from .. import LLM, LLMFamilyV2, LLMSpecV1
 from ..core import chat_context_var
-from ..llm_family import CustomLLMFamilyV1
-from ..utils import ChatModelMixin, generate_completion_chunk
+from ..llm_family import CustomLLMFamilyV2
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+    generate_completion_chunk,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -131,13 +136,11 @@ class SGLANGModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         model_config: Optional[SGLANGModelConfig],
     ):
-        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+        super().__init__(model_uid, model_family, model_path)
         self._model_config = model_config
         self._engine = None
         self._address = model_config.pop("address", None)  # type: ignore
@@ -319,7 +322,7 @@ class SGLANGModel(LLM):
 
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
             return False
@@ -330,7 +333,7 @@ class SGLANGModel(LLM):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if isinstance(llm_family, CustomLLMFamilyV1):
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in SGLANG_SUPPORTED_MODELS:
                 return False
         else:
@@ -547,14 +550,14 @@ class SGLANGModel(LLM):
 class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if isinstance(llm_family, CustomLLMFamilyV1):
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in SGLANG_SUPPORTED_CHAT_MODELS:
                 return False
         else:
@@ -583,6 +586,9 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         assert self.model_family.chat_template is not None
+        # fix: Object of type list_iterator is not JSON serializable
+        tools = list(generate_config.pop("tools", [])) if generate_config else None
+        model_family = self.model_family.model_family or self.model_family.model_name
         chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
@@ -591,6 +597,12 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         )
         chat_context_var.set(chat_template_kwargs)
         full_context_kwargs = chat_template_kwargs.copy()
+        if tools:
+            if (
+                model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in DEEPSEEK_TOOL_CALL_FAMILY
+            ):
+                full_context_kwargs["tools"] = tools
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
         )
@@ -599,17 +611,23 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         if stream:
             agen = await self.async_generate(full_prompt, generate_config=generate_config)  # type: ignore
             assert isinstance(agen, AsyncGenerator)
-            return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
+            return self._async_to_chat_completion_chunks(
+                agen, self.reasoning_parser, chat_template_kwargs
+            )
         else:
             c = await self.async_generate(full_prompt, generate_config=generate_config)  # type: ignore
             assert not isinstance(c, AsyncGenerator)
+            if tools:
+                return self._post_process_completion(
+                    self.model_family, self.model_uid, c, self.reasoning_parser
+                )
             return self._to_chat_completion(c, self.reasoning_parser)
 
 
 class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
             return False
@@ -620,7 +638,7 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if isinstance(llm_family, CustomLLMFamilyV1):
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in SGLANG_SUPPORTED_VISION_MODEL_LIST:
                 return False
         else:
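
The "fix: Object of type list_iterator is not JSON serializable" comment above addresses clients that pass tools as an iterator: generate_config is later serialized to JSON, and Python's json module cannot encode iterator objects, so the diff materializes the value with list(...) and pops it out of the config up front. A standalone reproduction of the failure mode and the fix:

    import json

    def make_tools():
        # simulates a client handing over tools as a lazy iterator
        return iter([{"type": "function", "function": {"name": "get_weather"}}])

    try:
        json.dumps({"tools": make_tools()})
    except TypeError as exc:
        print(exc)  # Object of type list_iterator is not JSON serializable

    # materializing the iterator before building the payload fixes it
    print(json.dumps({"tools": list(make_tools())}))
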

xinference/model/llm/transformers/chatglm.py

@@ -23,7 +23,7 @@ import torch
 from ....core.scheduler import InferenceRequest
 from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
 from ..core import chat_context_var
-from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..utils import (
     GLM4_TOOL_CALL_FAMILY,
     generate_chat_completion,
@@ -40,9 +40,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
     def __init__(
         self,
         model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
@@ -50,8 +48,6 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         super().__init__(
             model_uid,
             model_family,
-            model_spec,
-            quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
             peft_model=peft_model,
@@ -88,7 +84,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
 
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False

xinference/model/llm/transformers/core.py

@@ -38,7 +38,7 @@ from ....types import (
 )
 from ...utils import select_device
 from ..core import LLM, chat_context_var
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     LLAMA3_TOOL_CALL_FAMILY,
@@ -92,14 +92,12 @@ class PytorchModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
     ):
-        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+        super().__init__(model_uid, model_family, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
             pytorch_model_config
@@ -345,7 +343,7 @@
 
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
@@ -367,14 +365,26 @@
         data = []
         for r in reqs:
             real_len = seq_length - r.padding_len
-            x = torch.cat(
-                [
-                    torch.full((r.padding_len,), 0, dtype=torch.long),
-                    torch.ones((real_len,), dtype=torch.long),
-                ]
-            )
-            data.append(x)
             r.extra_kwargs["attention_mask_seq_len"] = real_len
+
+            if self._tokenizer.padding_side == "left":
+                # [PAD][PAD]...[TOKEN]
+                x = torch.cat(
+                    [
+                        torch.full((r.padding_len,), 0, dtype=torch.long),
+                        torch.ones((real_len,), dtype=torch.long),
+                    ]
+                )
+            else:  # right padding
+                # [TOKEN]...[PAD][PAD]
+                x = torch.cat(
+                    [
+                        torch.ones((real_len,), dtype=torch.long),
+                        torch.full((r.padding_len,), 0, dtype=torch.long),
+                    ]
+                )
+            data.append(x)
+
         return torch.stack(data).to(self._device)
 
     def build_decode_attention_mask(
@@ -388,14 +398,30 @@
         data = []
         for r in reqs:
             r.extra_kwargs["attention_mask_seq_len"] += 1
-            attention_mask_seq_len = r.extra_kwargs["attention_mask_seq_len"]
-            pad_len = seq_length - attention_mask_seq_len
-            x = torch.cat(
-                [
-                    torch.full((pad_len,), 0, dtype=torch.long),
-                    torch.ones((attention_mask_seq_len,), dtype=torch.long),
-                ]
-            )
+            if self._tokenizer.padding_side == "left":
+                attention_mask_seq_len = r.extra_kwargs["attention_mask_seq_len"]
+                pad_len = seq_length - attention_mask_seq_len
+                assert pad_len > 0, (
+                    f"pad_len must be greater than 0, got {pad_len} = "
+                    f"seq_length({seq_length}) - attention_mask_seq_len({attention_mask_seq_len})"
+                )
+                x = torch.cat(
+                    [
+                        torch.full((pad_len,), 0, dtype=torch.long),
+                        torch.ones((attention_mask_seq_len,), dtype=torch.long),
+                    ]
+                )
+            else:
+                max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs)
+                real_len = r.extra_kwargs["attention_mask_seq_len"]
+                pad_len = max_len - real_len
+
+                x = torch.cat(
+                    [
+                        torch.ones((real_len,), dtype=torch.long),
+                        torch.full((pad_len,), 0, dtype=torch.long),
+                    ]
+                )
             data.append(x)
         return torch.stack(data).to(self._device)
 
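Both mask builders now branch on the tokenizer's padding_side instead of assuming left padding: with left padding the zeros (pad positions) come before the ones (real tokens); with right padding they come after. A self-contained PyTorch sketch of the two layouts, using toy lengths rather than the scheduler's request objects:

    import torch

    def build_masks(real_lens, seq_length, padding_side="left"):
        # 1 marks a real token, 0 marks padding; layout depends on padding_side
        data = []
        for real_len in real_lens:
            pad = torch.zeros(seq_length - real_len, dtype=torch.long)
            tok = torch.ones(real_len, dtype=torch.long)
            # left: [PAD][PAD]...[TOKEN]; right: [TOKEN]...[PAD][PAD]
            data.append(torch.cat([pad, tok] if padding_side == "left" else [tok, pad]))
        return torch.stack(data)

    print(build_masks([2, 4], 4, "left"))   # tensor([[0, 0, 1, 1], [1, 1, 1, 1]])
    print(build_masks([2, 4], 4, "right"))  # tensor([[1, 1, 0, 0], [1, 1, 1, 1]])
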

@@ -668,9 +694,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def __init__(
         self,
         model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
@@ -678,8 +702,6 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         super().__init__(
             model_uid,
             model_family,
-            model_spec,
-            quantization,
             model_path,
             pytorch_model_config,
             peft_model,
@@ -702,7 +724,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
 
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False

xinference/model/llm/transformers/deepseek_v2.py

@@ -15,7 +15,7 @@ import logging
 
 import torch
 
-from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from .core import PytorchChatModel, register_non_default_model
 
 logger = logging.getLogger(__name__)
@@ -61,7 +61,7 @@ class DeepSeekV2PytorchChatModel(PytorchChatModel):
 
     @classmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False

xinference/model/llm/transformers/gemma3.py

@@ -15,7 +15,7 @@ import logging
 from typing import Dict, List, Set
 
 from ....core.scheduler import InferenceRequest
-from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from .core import PytorchChatModel, register_non_default_model
 
 logger = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 class Gemma3TextChatModel(PytorchChatModel):
     @classmethod
     def match_json(
-        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False

xinference/model/llm/transformers/multimodal/cogagent.py

@@ -21,7 +21,7 @@ import torch
 
 from .....model.utils import select_device
 from ...core import chat_context_var
-from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ...utils import _decode_image, parse_messages
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel
@@ -47,7 +47,7 @@ class CogAgentChatModel(PytorchMultiModalModel):
 
     @classmethod
     def match_json(
-        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
         if "cogagent" in family.lower():

xinference/model/llm/transformers/multimodal/deepseek_vl2.py

@@ -23,7 +23,7 @@ import requests
 import torch
 
 from .....model.utils import select_device
-from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel
 
@@ -39,7 +39,7 @@ class DeepSeekVL2ChatModel(PytorchMultiModalModel):
 
     @classmethod
     def match_json(
-        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         llm_family = model_family.model_family or model_family.model_name
         if "deepseek-vl2" == llm_family.lower():

xinference/model/llm/transformers/multimodal/gemma3.py

@@ -17,7 +17,7 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple
 
 from .....model.utils import select_device
 from .....types import PytorchModelConfig
-from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel
 
@@ -29,7 +29,7 @@ logger = logging.getLogger(__name__)
 class Gemma3ChatModel(PytorchMultiModalModel):
     @classmethod
     def match_json(
-        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False