sglang 0.4.6.post4__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (130)
  1. sglang/bench_offline_throughput.py +6 -6
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +23 -15
  4. sglang/bench_serving.py +133 -57
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/srt/configs/model_config.py +39 -28
  7. sglang/srt/conversation.py +1 -1
  8. sglang/srt/disaggregation/decode.py +122 -133
  9. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  10. sglang/srt/disaggregation/fake/conn.py +3 -13
  11. sglang/srt/disaggregation/kv_events.py +357 -0
  12. sglang/srt/disaggregation/mini_lb.py +57 -24
  13. sglang/srt/disaggregation/mooncake/conn.py +11 -2
  14. sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
  15. sglang/srt/disaggregation/nixl/conn.py +9 -19
  16. sglang/srt/disaggregation/prefill.py +126 -44
  17. sglang/srt/disaggregation/utils.py +116 -5
  18. sglang/srt/distributed/utils.py +3 -3
  19. sglang/srt/entrypoints/EngineBase.py +5 -0
  20. sglang/srt/entrypoints/engine.py +28 -8
  21. sglang/srt/entrypoints/http_server.py +6 -4
  22. sglang/srt/entrypoints/http_server_engine.py +5 -2
  23. sglang/srt/function_call/base_format_detector.py +250 -0
  24. sglang/srt/function_call/core_types.py +34 -0
  25. sglang/srt/function_call/deepseekv3_detector.py +157 -0
  26. sglang/srt/function_call/ebnf_composer.py +234 -0
  27. sglang/srt/function_call/function_call_parser.py +175 -0
  28. sglang/srt/function_call/llama32_detector.py +74 -0
  29. sglang/srt/function_call/mistral_detector.py +84 -0
  30. sglang/srt/function_call/pythonic_detector.py +163 -0
  31. sglang/srt/function_call/qwen25_detector.py +67 -0
  32. sglang/srt/function_call/utils.py +35 -0
  33. sglang/srt/hf_transformers_utils.py +46 -7
  34. sglang/srt/layers/attention/aiter_backend.py +513 -0
  35. sglang/srt/layers/attention/flashattention_backend.py +63 -17
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
  37. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  38. sglang/srt/layers/attention/triton_backend.py +3 -0
  39. sglang/srt/layers/attention/utils.py +2 -2
  40. sglang/srt/layers/attention/vision.py +1 -1
  41. sglang/srt/layers/communicator.py +451 -0
  42. sglang/srt/layers/dp_attention.py +0 -10
  43. sglang/srt/layers/moe/cutlass_moe.py +207 -0
  44. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  45. sglang/srt/layers/moe/ep_moe/layer.py +104 -50
  46. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
  47. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
  48. sglang/srt/layers/moe/topk.py +66 -9
  49. sglang/srt/layers/multimodal.py +70 -0
  50. sglang/srt/layers/quantization/__init__.py +7 -2
  51. sglang/srt/layers/quantization/deep_gemm.py +5 -3
  52. sglang/srt/layers/quantization/fp8.py +90 -0
  53. sglang/srt/layers/quantization/fp8_utils.py +6 -0
  54. sglang/srt/layers/quantization/gptq.py +298 -6
  55. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  56. sglang/srt/layers/quantization/qoq.py +244 -0
  57. sglang/srt/lora/lora_manager.py +1 -3
  58. sglang/srt/managers/deepseek_eplb.py +278 -0
  59. sglang/srt/managers/eplb_manager.py +55 -0
  60. sglang/srt/managers/expert_distribution.py +704 -56
  61. sglang/srt/managers/expert_location.py +394 -0
  62. sglang/srt/managers/expert_location_dispatch.py +91 -0
  63. sglang/srt/managers/io_struct.py +16 -3
  64. sglang/srt/managers/mm_utils.py +293 -139
  65. sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
  66. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  67. sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
  68. sglang/srt/managers/multimodal_processors/internvl.py +14 -5
  69. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  70. sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
  71. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  72. sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
  73. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  74. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  75. sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
  76. sglang/srt/managers/schedule_batch.py +49 -21
  77. sglang/srt/managers/schedule_policy.py +4 -5
  78. sglang/srt/managers/scheduler.py +92 -50
  79. sglang/srt/managers/session_controller.py +1 -1
  80. sglang/srt/managers/tokenizer_manager.py +99 -24
  81. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  82. sglang/srt/mem_cache/chunk_cache.py +3 -1
  83. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  84. sglang/srt/mem_cache/memory_pool.py +74 -52
  85. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  86. sglang/srt/mem_cache/radix_cache.py +58 -5
  87. sglang/srt/metrics/collector.py +2 -2
  88. sglang/srt/mm_utils.py +10 -0
  89. sglang/srt/model_executor/cuda_graph_runner.py +20 -9
  90. sglang/srt/model_executor/expert_location_updater.py +422 -0
  91. sglang/srt/model_executor/forward_batch_info.py +4 -0
  92. sglang/srt/model_executor/model_runner.py +144 -54
  93. sglang/srt/model_loader/loader.py +10 -6
  94. sglang/srt/models/clip.py +5 -1
  95. sglang/srt/models/deepseek_v2.py +297 -343
  96. sglang/srt/models/exaone.py +8 -3
  97. sglang/srt/models/gemma3_mm.py +70 -33
  98. sglang/srt/models/llama4.py +10 -2
  99. sglang/srt/models/llava.py +26 -18
  100. sglang/srt/models/mimo_mtp.py +220 -0
  101. sglang/srt/models/minicpmo.py +5 -12
  102. sglang/srt/models/mistral.py +71 -1
  103. sglang/srt/models/mllama.py +3 -3
  104. sglang/srt/models/qwen2.py +95 -26
  105. sglang/srt/models/qwen2_5_vl.py +8 -0
  106. sglang/srt/models/qwen2_moe.py +330 -60
  107. sglang/srt/models/qwen2_vl.py +6 -0
  108. sglang/srt/models/qwen3.py +52 -10
  109. sglang/srt/models/qwen3_moe.py +411 -48
  110. sglang/srt/models/siglip.py +294 -0
  111. sglang/srt/openai_api/adapter.py +28 -16
  112. sglang/srt/openai_api/protocol.py +6 -0
  113. sglang/srt/operations.py +154 -0
  114. sglang/srt/operations_strategy.py +31 -0
  115. sglang/srt/server_args.py +134 -24
  116. sglang/srt/speculative/eagle_utils.py +131 -0
  117. sglang/srt/speculative/eagle_worker.py +47 -2
  118. sglang/srt/utils.py +68 -12
  119. sglang/test/test_cutlass_moe.py +278 -0
  120. sglang/test/test_utils.py +2 -36
  121. sglang/utils.py +2 -2
  122. sglang/version.py +1 -1
  123. {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +20 -11
  124. {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +128 -102
  125. {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
  126. sglang/srt/function_call_parser.py +0 -858
  127. sglang/srt/platforms/interface.py +0 -371
  128. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  129. {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
  130. {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/qwen_vl.py

@@ -1,6 +1,7 @@
 import asyncio
 import math
-from typing import List, Union
+import re
+from typing import Dict, List, Union
 
 import torch
 from PIL import Image
@@ -23,7 +24,12 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        # The single, pre-expanded image token.
         self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
+        # The regex that matches expanded image tokens.
+        self.IMAGE_TOKEN_REGEX = re.compile(
+            r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
+        )
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
         self.image_token_id = hf_config.image_token_id
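The new IMAGE_TOKEN_REGEX covers prompts in which the image placeholder has already been expanded to a run of <|image_pad|> tokens, which an exact substring match on IMAGE_TOKEN would miss. A standalone sketch of the difference, using only the pattern from the diff (the example prompt is illustrative):

    import re

    IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
    IMAGE_TOKEN_REGEX = re.compile(
        r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
    )

    # A prompt whose image token was already expanded to three pads.
    prompt = "Describe: <|vision_start|><|image_pad|><|image_pad|><|image_pad|><|vision_end|>"

    print(IMAGE_TOKEN in prompt)                   # False: exact substring misses the run
    print(bool(IMAGE_TOKEN_REGEX.search(prompt)))  # True: the regex matches any pad run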
@@ -38,7 +44,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes]],
+        image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
         max_req_input_len,
@@ -48,11 +54,13 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         if isinstance(image_data, str):
             image_data = [image_data]
 
-        image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN,
+                image_token_regex=self.IMAGE_TOKEN_REGEX,
+            ),
             max_req_input_len=max_req_input_len,
         )
 
@@ -117,26 +125,60 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         async def resize_image_async(image):
             return resize_image(image)
 
-        if base_output.images:
+        images_are_preprocessed = self.mm_inputs_are_preprocessed(base_output.images)
+        if base_output.images and not images_are_preprocessed:
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
         ret = self.process_mm_data(
             input_text=base_output.input_text,
-            images=base_output.images,
+            images=None if images_are_preprocessed else base_output.images,
         )
-
+        input_ids = ret["input_ids"].flatten().tolist()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=ret["input_ids"].flatten(), mm_token_id=self.image_token_id
+        )
+        image_grid_thw = None
+        video_grid_thw = None  # TODO
         items = []
 
-        input_ids = ret["input_ids"].flatten().tolist()
-        if "pixel_values" in ret:
+        if base_output.images:
+            if images_are_preprocessed:
+                image_grid_thw = torch.concat(
+                    [
+                        torch.as_tensor(item.image_grid_thws)
+                        for item in base_output.images
+                    ]
+                )
+                all_pixel_values = [
+                    item.pixel_values
+                    for item in base_output.images
+                    if item.pixel_values is not None
+                ]
+                all_precomputed_features = [
+                    item.precomputed_features
+                    for item in base_output.images
+                    if item.precomputed_features is not None
+                ]
+                pixel_values = (
+                    torch.concat(all_pixel_values) if all_pixel_values else None
+                )
+                precomputed_features = (
+                    torch.concat(all_precomputed_features)
+                    if all_precomputed_features
+                    else None
+                )
+            else:
+                image_grid_thw = ret["image_grid_thw"]
+                pixel_values = ret["pixel_values"]
+                precomputed_features = None
             items += [
                 MultimodalDataItem(
-                    pixel_values=ret["pixel_values"],
-                    image_grid_thws=torch.concat([ret["image_grid_thw"]]),
-                    # TODO
-                    video_grid_thws=None,
-                    second_per_grid_ts=ret.get("second_per_grid_ts", None),
+                    pixel_values=pixel_values,
+                    image_grid_thws=image_grid_thw,
+                    video_grid_thws=video_grid_thw,
+                    precomputed_features=precomputed_features,
+                    image_offsets=image_offsets,
                     modality=Modality.IMAGE,
                 )
             ]
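For pre-processed inputs, the rewritten block concatenates whichever per-item tensors are present (pixel_values, precomputed_features) and leaves the absent kind as None. A minimal standalone sketch of that merge pattern, with a stand-in Item class and dummy tensors:

    from dataclasses import dataclass
    from typing import Optional

    import torch

    @dataclass
    class Item:  # stand-in for a pre-processed MultimodalDataItem
        pixel_values: Optional[torch.Tensor] = None
        precomputed_features: Optional[torch.Tensor] = None

    items = [
        Item(precomputed_features=torch.randn(4, 8)),
        Item(precomputed_features=torch.randn(2, 8)),
    ]

    all_pixel_values = [i.pixel_values for i in items if i.pixel_values is not None]
    all_precomputed = [i.precomputed_features for i in items if i.precomputed_features is not None]

    pixel_values = torch.concat(all_pixel_values) if all_pixel_values else None
    precomputed_features = torch.concat(all_precomputed) if all_precomputed else None

    print(pixel_values)                # None: no raw pixels were supplied
    print(precomputed_features.shape)  # torch.Size([6, 8])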
@@ -151,8 +193,8 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
             input_ids=torch.tensor(input_ids).unsqueeze(0),
-            image_grid_thw=ret.get("image_grid_thw", None),
-            video_grid_thw=ret.get("video_grid_thw", None),
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
             second_per_grid_ts=ret.get("second_per_grid_ts", None),
         )
         mrope_positions = mrope_positions.squeeze(1)
sglang/srt/managers/schedule_batch.py

@@ -48,7 +48,10 @@ from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
 from sglang.srt.disaggregation.base import BaseKVSender
-from sglang.srt.disaggregation.decode import ScheduleBatchDisaggregationDecodeMixin
+from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
+    ScheduleBatchDisaggregationDecodeMixin,
+)
+from sglang.srt.layers.multimodal import gpu_tensor_hash
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator
@@ -77,16 +80,19 @@ global_server_args_dict = {
     "enable_dp_attention": ServerArgs.enable_dp_attention,
     "enable_dp_lm_head": ServerArgs.enable_dp_lm_head,
     "enable_ep_moe": ServerArgs.enable_ep_moe,
+    "deepep_config": ServerArgs.deepep_config,
     "enable_nan_detection": ServerArgs.enable_nan_detection,
     "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
     "max_micro_batch_size": ServerArgs.max_micro_batch_size,
     "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
+    "ep_dispatch_algorithm": ServerArgs.ep_dispatch_algorithm,
     "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
     "sampling_backend": ServerArgs.sampling_backend,
     "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
     "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
     "torchao_config": ServerArgs.torchao_config,
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
+    "ep_num_redundant_experts": ServerArgs.ep_num_redundant_experts,
 }
 
 logger = logging.getLogger(__name__)
@@ -177,10 +183,10 @@ class MultimodalDataItem:
     image_offsets: Optional[list] = None
 
     # the real data, pixel_values or audio_features
-    # data: Union[List[torch.Tensor], List[np.array]]
-    pixel_values: Union[torch.Tensor, np.array] = None
-    image_grid_thws: Union[torch.Tensor, np.array] = None
-    video_grid_thws: Union[torch.Tensor, np.array] = None
+    # data: Union[List[torch.Tensor], List[np.ndarray]]
+    pixel_values: Union[torch.Tensor, np.ndarray] = None
+    image_grid_thws: Union[torch.Tensor, np.ndarray] = None
+    video_grid_thws: Union[torch.Tensor, np.ndarray] = None
 
     image_emb_mask: Optional[torch.Tensor] = None
     image_spatial_crop: Optional[torch.Tensor] = None
@@ -189,8 +195,11 @@ class MultimodalDataItem:
     # [num_images, (n, w, h)]
     tgt_size: Tuple[int, int] = None
 
-    audio_features: Union[torch.Tensor, np.array] = None
+    audio_features: Union[torch.Tensor, np.ndarray] = None
     audio_feature_lens: Optional[List[torch.Tensor]] = None
+    audio_offsets: Optional[List[Tuple[int, int]]] = None
+
+    precomputed_features: Optional[Union[torch.Tensor, np.ndarray]] = None
 
     @staticmethod
     def is_empty_list(l):
@@ -219,7 +228,8 @@ class MultimodalDataItem:
             for x in tensor_list
         ]
         tensor = torch.concat(tensor_list)
-
+        if tensor.is_cuda:
+            return gpu_tensor_hash(tensor)
        tensor = tensor.detach().contiguous()
 
         if tensor.dtype == torch.bfloat16:
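With this change, CUDA-resident features are hashed on-device via the new gpu_tensor_hash (added in sglang/srt/layers/multimodal.py) instead of first being copied to the host. A rough sketch of the dispatch, assuming sglang 0.4.6.post5 is installed for the GPU branch; the CPU fallback below is illustrative, not the library's exact code:

    import torch

    def cpu_bytes_hash(tensor: torch.Tensor) -> int:
        # Illustrative host-side path: hash the raw bytes.
        t = tensor.detach().contiguous()
        if t.dtype == torch.bfloat16:
            t = t.float()  # bfloat16 has no numpy dtype, so widen first
        return hash(t.cpu().numpy().tobytes())

    def feature_hash(tensor: torch.Tensor) -> int:
        if tensor.is_cuda:
            # Matches the diff: hash on-device, skipping the device-to-host copy.
            from sglang.srt.layers.multimodal import gpu_tensor_hash
            return gpu_tensor_hash(tensor)
        return cpu_bytes_hash(tensor)

    print(feature_hash(torch.arange(12, dtype=torch.float32).reshape(3, 4)))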
@@ -249,7 +259,9 @@ class MultimodalDataItem:
                 return tensor_hash([f])
             return data_hash(f)
 
-        if self.is_audio():
+        if self.precomputed_features is not None:
+            self.hash = hash_feature(self.precomputed_features)
+        elif self.is_audio():
             self.hash = hash_feature(self.audio_features)
         else:
             self.hash = hash_feature(self.pixel_values)
@@ -258,19 +270,24 @@ class MultimodalDataItem:
         self.pad_value = self.hash % (1 << 30)
 
     def is_audio(self):
-        return (
-            self.modality == Modality.AUDIO
-        ) and not MultimodalDataItem.is_empty_list(self.audio_features)
+        return (self.modality == Modality.AUDIO) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.audio_features)
+        )
 
     def is_image(self):
         return (
             self.modality == Modality.IMAGE or self.modality == Modality.MULTI_IMAGES
-        ) and not MultimodalDataItem.is_empty_list(self.pixel_values)
+        ) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+        )
 
     def is_video(self):
-        return (
-            self.modality == Modality.VIDEO
-        ) and not MultimodalDataItem.is_empty_list(self.pixel_values)
+        return (self.modality == Modality.VIDEO) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+        )
 
     def is_valid(self) -> bool:
         return self.is_image() or self.is_video() or self.is_audio()
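The common thread in these three predicates: an item that carries only precomputed_features, with no raw pixel_values or audio_features, now counts as valid for its modality. A standalone sketch of the image case (the Modality enum here is a stand-in for sglang's):

    from enum import Enum, auto
    from typing import Optional

    import torch

    class Modality(Enum):  # stand-in for sglang's Modality
        IMAGE = auto()
        AUDIO = auto()

    def is_image(
        modality: Modality,
        pixel_values: Optional[torch.Tensor],
        precomputed_features: Optional[torch.Tensor],
    ) -> bool:
        # Mirrors the new predicate: precomputed features count as image data.
        return modality == Modality.IMAGE and (
            precomputed_features is not None or pixel_values is not None
        )

    # Encoder output shipped without raw pixels: valid after this change.
    print(is_image(Modality.IMAGE, None, torch.randn(1, 16)))  # True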
@@ -279,6 +296,16 @@ class MultimodalDataItem:
         ...
         # TODO
 
+    @staticmethod
+    def from_dict(obj: dict):
+        kwargs = dict(obj)
+        modality = kwargs.pop("modality")
+        if isinstance(modality, str):
+            modality = Modality[modality]
+        ret = MultimodalDataItem(modality=modality, **kwargs)
+        ret.validate()
+        return ret
+
 
 @dataclasses.dataclass
 class MultimodalInputs:
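from_dict accepts the modality either as a Modality member or as its string name (Modality[modality] is enum name lookup); the remaining keys are passed through as dataclass fields. A hedged usage sketch, assuming sglang 0.4.6.post5 and this import path:

    import torch
    from sglang.srt.managers.schedule_batch import MultimodalDataItem

    item = MultimodalDataItem.from_dict(
        {
            "modality": "IMAGE",  # resolved via Modality["IMAGE"]
            "pixel_values": torch.randn(1, 3, 28, 28),
            "image_grid_thws": torch.tensor([[1, 2, 2]]),
        }
    )
    print(item.modality, item.is_image())  # Modality.IMAGE True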
@@ -304,8 +331,9 @@ class MultimodalInputs:
     video_token_id: Optional[int] = None
 
     # audio
-    audio_start_id: Optional[torch.Tensor] = None
-    audio_end_id: Optional[torch.Tensor] = None
+    audio_token_id: Optional[int] = None
+    audio_start_id: Optional[int] = None
+    audio_end_id: Optional[int] = None
 
     @staticmethod
     def from_dict(obj: dict):
@@ -329,6 +357,7 @@ class MultimodalInputs:
             "slice_end_id",
             "audio_start_id",
             "audio_end_id",
+            "audio_token_id",
         ]
         for arg in optional_args:
             if arg in obj:
@@ -578,9 +607,6 @@ class Req:
         self.tmp_end_idx: int = -1
         self.metadata_buffer_index: int = -1
 
-        # The first output_id transferred from prefill instance.
-        self.transferred_output_id: Optional[int] = None
-
     @property
     def seqlen(self):
         return len(self.origin_input_ids) + len(self.output_ids)
@@ -1069,7 +1095,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         else:
             self.encoder_out_cache_loc = torch.cat(encoder_out_cache_loc)
 
-        assert len(self.out_cache_loc) == self.extend_num_tokens
+        assert (
+            len(self.out_cache_loc) == self.extend_num_tokens
+        ), f"Expected {len(self.out_cache_loc)}, got {self.extend_num_tokens}"
 
     def prepare_for_extend(self):
         self.forward_mode = ForwardMode.EXTEND
sglang/srt/managers/schedule_policy.py

@@ -22,11 +22,7 @@ from typing import Dict, List, Optional, Set, Union
 
 import torch
 
-from sglang.srt.managers.schedule_batch import (
-    Req,
-    ScheduleBatch,
-    global_server_args_dict,
-)
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
@@ -468,6 +464,9 @@ class PrefillAdder:
             return AddReqResult.OTHER
 
         with self._lock_node(req.last_node):
+            if total_tokens > self.rem_total_tokens:
+                return AddReqResult.NO_TOKEN
+
             if (
                 enable_hierarchical_cache
                 and req.last_node_global is not None
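The added guard re-checks the remaining token budget after the node lock is taken, so a request that no longer fits is turned away with NO_TOKEN before any cache manipulation happens. A minimal sketch of the check-inside-lock pattern; apart from AddReqResult.NO_TOKEN and rem_total_tokens, the names are illustrative:

    from contextlib import contextmanager
    from enum import Enum, auto

    class AddReqResult(Enum):
        CONTINUE = auto()
        NO_TOKEN = auto()  # token budget exhausted

    class Adder:
        def __init__(self, budget: int):
            self.rem_total_tokens = budget

        @contextmanager
        def _lock_node(self, node):
            # Placeholder for the real lock that pins `node` in the cache.
            yield

        def add_one_req(self, node, total_tokens: int) -> AddReqResult:
            with self._lock_node(node):
                # Re-check under the lock: the budget may have shrunk
                # since the caller's earlier, unlocked estimate.
                if total_tokens > self.rem_total_tokens:
                    return AddReqResult.NO_TOKEN
                self.rem_total_tokens -= total_tokens
                return AddReqResult.CONTINUE

    adder = Adder(budget=10)
    print(adder.add_one_req(None, 8))  # AddReqResult.CONTINUE
    print(adder.add_one_req(None, 8))  # AddReqResult.NO_TOKEN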