sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/configs/deepseekvl2.py +11 -2
  4. sglang/srt/configs/internvl.py +3 -0
  5. sglang/srt/configs/janus_pro.py +3 -0
  6. sglang/srt/configs/model_config.py +9 -7
  7. sglang/srt/configs/update_config.py +3 -1
  8. sglang/srt/conversation.py +1 -0
  9. sglang/srt/custom_op.py +5 -2
  10. sglang/srt/disaggregation/decode.py +9 -1
  11. sglang/srt/disaggregation/mooncake/conn.py +44 -56
  12. sglang/srt/distributed/parallel_state.py +33 -0
  13. sglang/srt/entrypoints/engine.py +30 -26
  14. sglang/srt/entrypoints/openai/serving_chat.py +21 -2
  15. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  16. sglang/srt/function_call/function_call_parser.py +2 -0
  17. sglang/srt/function_call/qwen3_detector.py +150 -0
  18. sglang/srt/hf_transformers_utils.py +0 -1
  19. sglang/srt/layers/activation.py +13 -0
  20. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  21. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  22. sglang/srt/layers/linear.py +13 -102
  23. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  24. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  25. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  26. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  27. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +35 -45
  33. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  34. sglang/srt/layers/moe/topk.py +187 -12
  35. sglang/srt/layers/quantization/__init__.py +20 -134
  36. sglang/srt/layers/quantization/awq.py +578 -11
  37. sglang/srt/layers/quantization/awq_triton.py +339 -0
  38. sglang/srt/layers/quantization/base_config.py +85 -10
  39. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  40. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  41. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +24 -73
  42. sglang/srt/layers/quantization/fp8.py +273 -62
  43. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  44. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  45. sglang/srt/layers/quantization/gptq.py +501 -143
  46. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  47. sglang/srt/layers/quantization/modelopt_quant.py +26 -108
  48. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  49. sglang/srt/layers/quantization/petit.py +252 -0
  50. sglang/srt/layers/quantization/petit_utils.py +104 -0
  51. sglang/srt/layers/quantization/qoq.py +7 -6
  52. sglang/srt/layers/quantization/scalar_type.py +352 -0
  53. sglang/srt/layers/quantization/unquant.py +422 -0
  54. sglang/srt/layers/quantization/utils.py +343 -3
  55. sglang/srt/layers/quantization/w4afp8.py +8 -4
  56. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  57. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  58. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  59. sglang/srt/lora/lora.py +0 -4
  60. sglang/srt/lora/lora_manager.py +87 -53
  61. sglang/srt/lora/mem_pool.py +81 -33
  62. sglang/srt/lora/utils.py +12 -5
  63. sglang/srt/managers/cache_controller.py +241 -0
  64. sglang/srt/managers/io_struct.py +41 -29
  65. sglang/srt/managers/mm_utils.py +7 -8
  66. sglang/srt/managers/schedule_batch.py +150 -110
  67. sglang/srt/managers/schedule_policy.py +68 -27
  68. sglang/srt/managers/scheduler.py +243 -61
  69. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  70. sglang/srt/managers/tokenizer_manager.py +11 -3
  71. sglang/srt/managers/tp_worker.py +14 -0
  72. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  73. sglang/srt/mem_cache/allocator.py +7 -16
  74. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  75. sglang/srt/mem_cache/chunk_cache.py +5 -2
  76. sglang/srt/mem_cache/hicache_storage.py +152 -0
  77. sglang/srt/mem_cache/hiradix_cache.py +179 -4
  78. sglang/srt/mem_cache/memory_pool.py +16 -1
  79. sglang/srt/mem_cache/memory_pool_host.py +41 -2
  80. sglang/srt/mem_cache/radix_cache.py +26 -0
  81. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  82. sglang/srt/metrics/collector.py +9 -0
  83. sglang/srt/model_executor/cuda_graph_runner.py +5 -6
  84. sglang/srt/model_executor/forward_batch_info.py +14 -1
  85. sglang/srt/model_executor/model_runner.py +109 -22
  86. sglang/srt/model_loader/loader.py +7 -1
  87. sglang/srt/model_loader/utils.py +4 -4
  88. sglang/srt/models/clip.py +1 -1
  89. sglang/srt/models/deepseek.py +9 -6
  90. sglang/srt/models/deepseek_janus_pro.py +1 -1
  91. sglang/srt/models/deepseek_v2.py +191 -171
  92. sglang/srt/models/deepseek_vl2.py +5 -5
  93. sglang/srt/models/gemma.py +48 -0
  94. sglang/srt/models/gemma2.py +52 -0
  95. sglang/srt/models/gemma3_causal.py +63 -0
  96. sglang/srt/models/gemma3_mm.py +1 -1
  97. sglang/srt/models/gemma3n_mm.py +2 -4
  98. sglang/srt/models/granitemoe.py +385 -0
  99. sglang/srt/models/grok.py +9 -3
  100. sglang/srt/models/hunyuan.py +63 -16
  101. sglang/srt/models/internvl.py +1 -1
  102. sglang/srt/models/kimi_vl.py +1 -1
  103. sglang/srt/models/llama.py +41 -0
  104. sglang/srt/models/llama4.py +11 -11
  105. sglang/srt/models/llava.py +2 -2
  106. sglang/srt/models/llavavid.py +1 -1
  107. sglang/srt/models/minicpm.py +0 -2
  108. sglang/srt/models/minicpmo.py +3 -7
  109. sglang/srt/models/minicpmv.py +1 -1
  110. sglang/srt/models/mistral.py +1 -1
  111. sglang/srt/models/mixtral.py +9 -2
  112. sglang/srt/models/mllama.py +3 -5
  113. sglang/srt/models/mllama4.py +3 -3
  114. sglang/srt/models/olmoe.py +8 -5
  115. sglang/srt/models/persimmon.py +330 -0
  116. sglang/srt/models/phi.py +321 -0
  117. sglang/srt/models/phi4mm.py +44 -4
  118. sglang/srt/models/phi4mm_audio.py +1260 -0
  119. sglang/srt/models/phi4mm_utils.py +1917 -0
  120. sglang/srt/models/phimoe.py +9 -3
  121. sglang/srt/models/qwen.py +37 -0
  122. sglang/srt/models/qwen2.py +41 -0
  123. sglang/srt/models/qwen2_5_vl.py +4 -4
  124. sglang/srt/models/qwen2_audio.py +1 -1
  125. sglang/srt/models/qwen2_moe.py +53 -5
  126. sglang/srt/models/qwen2_vl.py +4 -4
  127. sglang/srt/models/qwen3.py +65 -1
  128. sglang/srt/models/qwen3_moe.py +56 -18
  129. sglang/srt/models/vila.py +1 -1
  130. sglang/srt/multimodal/processors/base_processor.py +91 -97
  131. sglang/srt/multimodal/processors/clip.py +21 -19
  132. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  133. sglang/srt/multimodal/processors/gemma3.py +13 -17
  134. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  135. sglang/srt/multimodal/processors/internvl.py +9 -10
  136. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  137. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  138. sglang/srt/multimodal/processors/llava.py +4 -2
  139. sglang/srt/multimodal/processors/minicpm.py +35 -44
  140. sglang/srt/multimodal/processors/mlama.py +21 -18
  141. sglang/srt/multimodal/processors/mllama4.py +4 -5
  142. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  143. sglang/srt/multimodal/processors/pixtral.py +14 -35
  144. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  145. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  146. sglang/srt/multimodal/processors/vila.py +14 -14
  147. sglang/srt/sampling/sampling_params.py +8 -1
  148. sglang/srt/server_args.py +393 -230
  149. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +9 -1
  150. sglang/srt/two_batch_overlap.py +1 -0
  151. sglang/srt/utils.py +27 -1
  152. sglang/test/runners.py +14 -3
  153. sglang/test/test_block_fp8.py +8 -3
  154. sglang/test/test_block_fp8_ep.py +1 -1
  155. sglang/test/test_custom_ops.py +12 -7
  156. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  157. sglang/test/test_fp4_moe.py +1 -3
  158. sglang/test/test_marlin_moe.py +286 -0
  159. sglang/test/test_marlin_utils.py +171 -0
  160. sglang/test/test_utils.py +35 -0
  161. sglang/version.py +1 -1
  162. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/METADATA +8 -8
  163. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/RECORD +166 -146
  164. sglang/srt/layers/quantization/quant_utils.py +0 -166
  165. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  166. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/WHEEL +0 -0
  167. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/licenses/LICENSE +0 -0
  168. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,65 @@
1
+ import re
2
+
3
+ from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
4
+ from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
5
+ from sglang.srt.multimodal.processors.base_processor import (
6
+ BaseMultimodalProcessor,
7
+ MultimodalSpecialTokens,
8
+ )
9
+
10
+
11
+ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
12
+ models = [Qwen2AudioForConditionalGeneration]
13
+
14
+ def __init__(self, hf_config, server_args, _processor):
15
+ super().__init__(hf_config, server_args, _processor)
16
+ self.AUDIO_TOKEN = "<|audio_bos|><|AUDIO|><|audio_eos|>"
17
+ self.AUDIO_TOKEN_REGEX = re.compile(
18
+ r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>"
19
+ )
20
+ # Collect special token ids
21
+ tokenizer = self._processor.tokenizer
22
+ self.audio_start_id = tokenizer.convert_tokens_to_ids("<|audio_bos|>")
23
+ self.audio_token_id = tokenizer.convert_tokens_to_ids("<|AUDIO|>")
24
+ self.audio_end_id = tokenizer.convert_tokens_to_ids("<|audio_eos|>")
25
+
26
+ self.mm_tokens = MultimodalSpecialTokens(
27
+ audio_token=self.AUDIO_TOKEN,
28
+ audio_token_regex=self.AUDIO_TOKEN_REGEX,
29
+ audio_token_id=self.audio_token_id,
30
+ ).build(_processor)
31
+
32
+ async def process_mm_data_async(
33
+ self,
34
+ audio_data,
35
+ input_text,
36
+ **kwargs,
37
+ ):
38
+ base_output = self.load_mm_data(
39
+ prompt=input_text,
40
+ audio_data=audio_data,
41
+ multimodal_tokens=self.mm_tokens,
42
+ )
43
+ if base_output is None:
44
+ return None
45
+
46
+ mm_items, input_ids, ret = self.process_and_combine_mm_data(
47
+ base_output, self.mm_tokens
48
+ )
49
+
50
+ assert (
51
+ "feature_attention_mask" in ret
52
+ ), "feature_attention_mask not found in processor output"
53
+ input_lengths = ret["feature_attention_mask"].sum(dim=-1)
54
+ input_lengths = (input_lengths - 1) // 2 + 1
55
+ output_lengths = (input_lengths - 2) // 2 + 1
56
+
57
+ mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths
58
+
59
+ return {
60
+ "mm_items": mm_items,
61
+ "input_ids": input_ids.tolist(),
62
+ "audio_start_id": self.audio_start_id,
63
+ "audio_token_id": self.audio_token_id,
64
+ "audio_end_id": self.audio_end_id,
65
+ }
@@ -203,16 +203,9 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
203
203
 
204
204
  def __init__(self, hf_config, server_args, _processor):
205
205
  super().__init__(hf_config, server_args, _processor)
206
- # The single, pre-expanded image token.
207
- self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
208
206
  # The regex that matches expanded image tokens.
209
- self.IMAGE_TOKEN_REGEX = re.compile(
210
- r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
211
- )
212
207
  self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
213
208
  self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
214
- self.IM_TOKEN_ID = hf_config.image_token_id
215
- self.VIDEO_TOKEN_ID = hf_config.video_token_id
216
209
  self.vision_start_token_id = hf_config.vision_start_token_id
217
210
  self.vision_end_token_id = hf_config.vision_end_token_id
218
211
  self.NUM_TOKEN_PER_FRAME = 770
@@ -220,19 +213,20 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
220
213
  self.MIN_PIXELS = 4 * 28 * 28
221
214
  self.MAX_PIXELS = 16384 * 28 * 28
222
215
  self.MAX_RATIO = 200
223
- # TODO(mick): move all MultimodalSpecialTokens initializations into processor init
224
- self.mm_special_tokens = MultimodalSpecialTokens(
225
- image_token=self.IMAGE_TOKEN,
226
- image_token_regex=self.IMAGE_TOKEN_REGEX,
227
- video_token=self.VIDEO_TOKEN_ID,
228
- )
216
+ self.mm_tokens = MultimodalSpecialTokens(
217
+ image_token="<|vision_start|><|image_pad|><|vision_end|>",
218
+ image_token_id=hf_config.image_token_id,
219
+ image_token_regex=re.compile(
220
+ r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
221
+ ),
222
+ video_token_id=hf_config.video_token_id,
223
+ ).build(_processor)
229
224
 
230
225
  async def process_mm_data_async(
231
226
  self,
232
227
  image_data: List[Union[str, bytes]],
233
228
  input_text,
234
229
  request_obj,
235
- max_req_input_len,
236
230
  *args,
237
231
  **kwargs,
238
232
  ):
@@ -241,8 +235,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
241
235
  prompt=input_text,
242
236
  image_data=image_data,
243
237
  video_data=request_obj.video_data,
244
- multimodal_tokens=self.mm_special_tokens,
245
- max_req_input_len=max_req_input_len,
238
+ multimodal_tokens=self.mm_tokens,
246
239
  )
247
240
 
248
241
  # Qwen-specific: resize images if they are raw Image objects
@@ -255,13 +248,15 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
255
248
  await preprocess_video(video) for video in base_output.videos
256
249
  ]
257
250
 
258
- mm_items, input_ids, ret = self.process_and_combine_mm_data(base_output)
251
+ mm_items, input_ids, ret = self.process_and_combine_mm_data(
252
+ base_output, self.mm_tokens
253
+ )
259
254
 
260
255
  input_ids = input_ids.flatten()
261
256
  mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
262
257
  spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
263
- image_token_id=self.IM_TOKEN_ID,
264
- video_token_id=self.VIDEO_TOKEN_ID,
258
+ image_token_id=self.mm_tokens.image_token_id,
259
+ video_token_id=self.mm_tokens.video_token_id,
265
260
  vision_start_token_id=self.vision_start_token_id,
266
261
  model_type=self.hf_config.model_type,
267
262
  tokens_per_second=getattr(
@@ -279,8 +274,8 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
279
274
  "mm_items": mm_items,
280
275
  "im_start_id": self.IM_START_TOKEN_ID,
281
276
  "im_end_id": self.IM_END_TOKEN_ID,
282
- "im_token_id": self.IM_TOKEN_ID,
283
- "video_token_id": self.VIDEO_TOKEN_ID,
277
+ "im_token_id": self.mm_tokens.image_token_id,
278
+ "video_token_id": self.mm_tokens.video_token_id,
284
279
  "mrope_positions": mrope_positions,
285
280
  "mrope_position_delta": mrope_position_delta,
286
281
  }
@@ -1,4 +1,4 @@
1
- from typing import Any, Dict, List, Optional, Type, cast
1
+ from typing import Any, Dict, List, Optional, Type
2
2
 
3
3
  import torch.nn as nn
4
4
  from transformers.configuration_utils import PretrainedConfig
@@ -8,9 +8,8 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
8
8
  from sglang.srt.managers.io_struct import (
9
9
  EmbeddingReqInput,
10
10
  GenerateReqInput,
11
- ImageDataItem,
11
+ ImageDataInputItem,
12
12
  )
13
- from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
14
13
  from sglang.srt.models.vila import VILAForConditionalGeneration
15
14
  from sglang.srt.multimodal.processors.base_processor import (
16
15
  BaseMultimodalProcessor,
@@ -37,31 +36,32 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
37
36
  _processor: VILAProcessor,
38
37
  ) -> None:
39
38
  super().__init__(hf_config, server_args, _processor)
40
- self.IM_TOKEN_ID = hf_config.image_token_id
41
- self.VIDEO_TOKEN_ID = hf_config.video_token_id
39
+ self.mm_tokens = MultimodalSpecialTokens(
40
+ image_token=self._processor.tokenizer.image_token,
41
+ image_token_id=hf_config.image_token_id,
42
+ video_token_id=hf_config.video_token_id,
43
+ ).build(_processor)
42
44
 
43
45
  async def process_mm_data_async(
44
46
  self,
45
- image_data: Optional[ImageDataItem | List[ImageDataItem]],
47
+ image_data: Optional[ImageDataInputItem | List[ImageDataInputItem]],
46
48
  input_text: str | List[int],
47
49
  request_obj: GenerateReqInput | EmbeddingReqInput,
48
- max_req_input_len: int,
49
50
  **kwargs,
50
51
  ) -> Optional[Dict[str, Any]]:
51
52
  base_output = self.load_mm_data(
52
53
  prompt=input_text,
53
- multimodal_tokens=MultimodalSpecialTokens(
54
- image_token=self._processor.tokenizer.image_token
55
- ),
56
- max_req_input_len=max_req_input_len,
54
+ multimodal_tokens=self.mm_tokens,
57
55
  image_data=image_data,
58
56
  )
59
57
 
60
- mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
58
+ mm_items, input_ids, _ = self.process_and_combine_mm_data(
59
+ base_output, self.mm_tokens
60
+ )
61
61
 
62
62
  return {
63
63
  "input_ids": input_ids.tolist(),
64
64
  "mm_items": mm_items,
65
- "im_token_id": self.IM_TOKEN_ID,
66
- "video_token_id": self.VIDEO_TOKEN_ID,
65
+ "im_token_id": self.mm_tokens.image_token_id,
66
+ "video_token_id": self.mm_tokens.video_token_id,
67
67
  }
@@ -89,7 +89,7 @@ class SamplingParams:
89
89
  if self.top_k == -1:
90
90
  self.top_k = TOP_K_ALL # whole vocabulary
91
91
 
92
- def verify(self):
92
+ def verify(self, vocab_size):
93
93
  if self.temperature < 0.0:
94
94
  raise ValueError(
95
95
  f"temperature must be non-negative, got {self.temperature}."
@@ -131,6 +131,13 @@ class SamplingParams:
131
131
  f"min_new_tokens must be in [0, max_new_tokens({self.max_new_tokens})], got "
132
132
  f"{self.min_new_tokens}."
133
133
  )
134
+ if self.logit_bias is not None:
135
+ for token_id in self.logit_bias:
136
+ if not 0 <= int(token_id) < vocab_size:
137
+ raise ValueError(
138
+ f"logit_bias must has keys in [0, {vocab_size - 1}], got "
139
+ f"{token_id}."
140
+ )
134
141
  grammars = [
135
142
  self.json_schema,
136
143
  self.regex,