sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
Files changed (158)
  1. sglang/bench_one_batch_server.py +17 -2
  2. sglang/bench_serving.py +170 -24
  3. sglang/srt/configs/internvl.py +4 -2
  4. sglang/srt/configs/janus_pro.py +1 -1
  5. sglang/srt/configs/model_config.py +60 -1
  6. sglang/srt/configs/update_config.py +119 -0
  7. sglang/srt/conversation.py +69 -1
  8. sglang/srt/disaggregation/decode.py +21 -5
  9. sglang/srt/disaggregation/mooncake/conn.py +35 -4
  10. sglang/srt/disaggregation/nixl/conn.py +6 -6
  11. sglang/srt/disaggregation/prefill.py +2 -2
  12. sglang/srt/disaggregation/utils.py +1 -1
  13. sglang/srt/distributed/parallel_state.py +44 -17
  14. sglang/srt/entrypoints/EngineBase.py +8 -0
  15. sglang/srt/entrypoints/engine.py +40 -6
  16. sglang/srt/entrypoints/http_server.py +111 -24
  17. sglang/srt/entrypoints/http_server_engine.py +1 -1
  18. sglang/srt/entrypoints/openai/protocol.py +4 -2
  19. sglang/srt/eplb/__init__.py +0 -0
  20. sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
  21. sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
  22. sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
  23. sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
  24. sglang/srt/{managers → eplb}/expert_location.py +1 -1
  25. sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
  26. sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
  27. sglang/srt/hf_transformers_utils.py +2 -1
  28. sglang/srt/layers/activation.py +2 -2
  29. sglang/srt/layers/amx_utils.py +86 -0
  30. sglang/srt/layers/attention/ascend_backend.py +219 -0
  31. sglang/srt/layers/attention/flashattention_backend.py +32 -9
  32. sglang/srt/layers/attention/tbo_backend.py +37 -9
  33. sglang/srt/layers/communicator.py +20 -2
  34. sglang/srt/layers/dp_attention.py +9 -3
  35. sglang/srt/layers/elementwise.py +76 -12
  36. sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  37. sglang/srt/layers/layernorm.py +26 -0
  38. sglang/srt/layers/linear.py +84 -14
  39. sglang/srt/layers/logits_processor.py +4 -4
  40. sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
  41. sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
  42. sglang/srt/layers/moe/ep_moe/layer.py +176 -15
  43. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
  44. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
  45. sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
  46. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
  47. sglang/srt/layers/moe/router.py +60 -22
  48. sglang/srt/layers/moe/topk.py +10 -28
  49. sglang/srt/layers/parameter.py +67 -7
  50. sglang/srt/layers/quantization/__init__.py +2 -0
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  52. sglang/srt/layers/quantization/fp8.py +72 -7
  53. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  54. sglang/srt/layers/quantization/fp8_utils.py +1 -2
  55. sglang/srt/layers/quantization/gptq.py +5 -1
  56. sglang/srt/layers/quantization/modelopt_quant.py +244 -1
  57. sglang/srt/layers/quantization/moe_wna16.py +1 -1
  58. sglang/srt/layers/quantization/quant_utils.py +166 -0
  59. sglang/srt/layers/quantization/w4afp8.py +264 -0
  60. sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  61. sglang/srt/layers/rotary_embedding.py +2 -2
  62. sglang/srt/layers/vocab_parallel_embedding.py +20 -10
  63. sglang/srt/lora/lora.py +4 -5
  64. sglang/srt/lora/lora_manager.py +73 -20
  65. sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
  66. sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
  67. sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
  68. sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
  69. sglang/srt/managers/cache_controller.py +41 -195
  70. sglang/srt/managers/configure_logging.py +1 -1
  71. sglang/srt/managers/io_struct.py +58 -14
  72. sglang/srt/managers/mm_utils.py +77 -61
  73. sglang/srt/managers/multimodal_processor.py +2 -6
  74. sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  75. sglang/srt/managers/schedule_batch.py +78 -85
  76. sglang/srt/managers/scheduler.py +130 -64
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  78. sglang/srt/managers/session_controller.py +12 -3
  79. sglang/srt/managers/tokenizer_manager.py +314 -103
  80. sglang/srt/managers/tp_worker.py +13 -1
  81. sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  82. sglang/srt/mem_cache/allocator.py +290 -0
  83. sglang/srt/mem_cache/chunk_cache.py +34 -2
  84. sglang/srt/mem_cache/hiradix_cache.py +2 -0
  85. sglang/srt/mem_cache/memory_pool.py +402 -66
  86. sglang/srt/mem_cache/memory_pool_host.py +6 -109
  87. sglang/srt/mem_cache/multimodal_cache.py +3 -0
  88. sglang/srt/mem_cache/radix_cache.py +8 -4
  89. sglang/srt/model_executor/cuda_graph_runner.py +2 -1
  90. sglang/srt/model_executor/forward_batch_info.py +17 -4
  91. sglang/srt/model_executor/model_runner.py +297 -56
  92. sglang/srt/model_loader/loader.py +41 -0
  93. sglang/srt/model_loader/weight_utils.py +72 -4
  94. sglang/srt/models/deepseek_nextn.py +1 -3
  95. sglang/srt/models/deepseek_v2.py +195 -45
  96. sglang/srt/models/deepseek_vl2.py +3 -5
  97. sglang/srt/models/gemma3_causal.py +1 -2
  98. sglang/srt/models/gemma3n_causal.py +4 -3
  99. sglang/srt/models/gemma3n_mm.py +4 -20
  100. sglang/srt/models/hunyuan.py +1 -1
  101. sglang/srt/models/kimi_vl.py +1 -2
  102. sglang/srt/models/llama.py +10 -4
  103. sglang/srt/models/llama4.py +32 -45
  104. sglang/srt/models/llama_eagle3.py +61 -11
  105. sglang/srt/models/llava.py +5 -5
  106. sglang/srt/models/minicpmo.py +2 -2
  107. sglang/srt/models/mistral.py +1 -1
  108. sglang/srt/models/mllama4.py +402 -89
  109. sglang/srt/models/phi4mm.py +1 -3
  110. sglang/srt/models/pixtral.py +3 -7
  111. sglang/srt/models/qwen2.py +31 -3
  112. sglang/srt/models/qwen2_5_vl.py +1 -3
  113. sglang/srt/models/qwen2_audio.py +200 -0
  114. sglang/srt/models/qwen2_moe.py +32 -6
  115. sglang/srt/models/qwen2_vl.py +1 -4
  116. sglang/srt/models/qwen3.py +94 -25
  117. sglang/srt/models/qwen3_moe.py +68 -21
  118. sglang/srt/models/vila.py +3 -8
  119. sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
  120. sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
  121. sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
  122. sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
  123. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
  124. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
  125. sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
  126. sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
  127. sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
  128. sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
  129. sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
  130. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
  131. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
  132. sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
  133. sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
  134. sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
  135. sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
  136. sglang/srt/operations_strategy.py +6 -2
  137. sglang/srt/reasoning_parser.py +26 -0
  138. sglang/srt/sampling/sampling_batch_info.py +39 -1
  139. sglang/srt/server_args.py +84 -22
  140. sglang/srt/speculative/build_eagle_tree.py +57 -18
  141. sglang/srt/speculative/eagle_worker.py +6 -4
  142. sglang/srt/two_batch_overlap.py +203 -27
  143. sglang/srt/utils.py +343 -163
  144. sglang/srt/warmup.py +12 -3
  145. sglang/test/runners.py +10 -1
  146. sglang/test/test_cutlass_w4a8_moe.py +281 -0
  147. sglang/test/test_utils.py +15 -3
  148. sglang/utils.py +5 -5
  149. sglang/version.py +1 -1
  150. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
  151. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
  152. sglang/math_utils.py +0 -8
  153. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
  154. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  155. /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
  156. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
  157. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
  158. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/warmup.py CHANGED
@@ -4,6 +4,7 @@ from typing import List
 import numpy as np
 import tqdm
 
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
 from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
 
@@ -20,17 +21,21 @@ def warmup(name: str) -> callable:
     return decorator
 
 
-async def execute_warmups(warmup_names: List[str], tokenizer_manager: TokenizerManager):
+async def execute_warmups(
+    disaggregation_mode: str,
+    warmup_names: List[str],
+    tokenizer_manager: TokenizerManager,
+):
     for warmup_name in warmup_names:
         if warmup_name not in _warmup_registry:
             logger.warning(f"Could not find custom warmup {warmup_name}")
             continue
         logger.info(f"Running warmup {warmup_name}")
-        await _warmup_registry[warmup_name](tokenizer_manager)
+        await _warmup_registry[warmup_name](disaggregation_mode, tokenizer_manager)
 
 
 @warmup("voice_chat")
-async def voice_chat(tokenizer_manager: TokenizerManager):
+async def voice_chat(disaggregation_mode: str, tokenizer_manager: TokenizerManager):
     # this warms up the fused_moe triton kernels and caches them
     # if we don't do this we break real time inference for voice chat
     for i in tqdm.trange(1, 512):
@@ -44,4 +49,8 @@ async def voice_chat(tokenizer_manager: TokenizerManager):
                 "min_p": 0.0,
             },
         )
+        if disaggregation_mode != "null":
+            generate_req_input.bootstrap_room = 0
+            generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
+
         await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
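The new leading parameter threads the server's disaggregation mode into every registered warmup so that prefill/decode workers can attach the fake bootstrap fields added above. A minimal sketch of the updated call shape, assuming a tokenizer manager is already constructed and that "null" denotes the non-disaggregated mode (as the check above implies):

import asyncio

from sglang.srt.warmup import execute_warmups

async def run_startup_warmups(tokenizer_manager, disaggregation_mode: str = "null"):
    # disaggregation_mode now comes first; "voice_chat" is the warmup
    # registered in this file.
    await execute_warmups(disaggregation_mode, ["voice_chat"], tokenizer_manager)

# asyncio.run(run_startup_warmups(tokenizer_manager))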
sglang/test/runners.py CHANGED
@@ -503,6 +503,8 @@ class SRTRunner:
         disable_overlap_schedule: bool = False,
         disable_custom_all_reduce: bool = False,
         torchao_config: Optional[str] = None,
+        cuda_graph_max_bs: int = 4,
+        sleep_on_idle=False,
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
@@ -538,8 +540,9 @@ class SRTRunner:
             tokenizer_path=tokenizer_path,
             enable_ep_moe=enable_ep_moe,
             disable_overlap_schedule=disable_overlap_schedule,
-            cuda_graph_max_bs=4,
+            cuda_graph_max_bs=cuda_graph_max_bs,
             disable_custom_all_reduce=disable_custom_all_reduce,
+            sleep_on_idle=sleep_on_idle,
             **spec_kwargs,
         )
 
@@ -550,6 +553,12 @@ class SRTRunner:
         else:
             self.tokenizer = None
 
+    def load_lora_adapter(self, lora_name: str, lora_path: str):
+        return self.engine.load_lora_adapter(lora_name, lora_path)
+
+    def unload_lora_adapter(self, lora_name: str):
+        return self.engine.unload_lora_adapter(lora_name)
+
     def forward(
         self,
         prompts: Union[
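The two new pass-through methods let test code exercise dynamic LoRA loading without reaching into `runner.engine`. A minimal sketch of the call pattern; the constructor arguments and the adapter name/path are hypothetical:

import torch

from sglang.test.runners import SRTRunner

runner = SRTRunner(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # hypothetical base model
    torch_dtype=torch.float16,
    model_type="generation",
)
runner.load_lora_adapter(lora_name="demo-adapter", lora_path="/tmp/demo-adapter")
# ... forward passes that reference "demo-adapter" ...
runner.unload_lora_adapter(lora_name="demo-adapter")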
sglang/test/test_cutlass_w4a8_moe.py ADDED
@@ -0,0 +1,281 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import pytest
+import torch
+
+from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
+from sglang.srt.layers.moe.topk import select_experts
+
+
+def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
+    if int4_values_interleaved.shape[-1] % 2 != 0:
+        raise ValueError(
+            "the last dim size of int4_values_interleaved tensor must be even."
+        )
+
+    input_tensor_int8 = int4_values_interleaved.to(torch.int8)
+
+    low_nibbles = input_tensor_int8[..., 0::2]
+    high_nibbles = input_tensor_int8[..., 1::2]
+
+    packed_tensor = (high_nibbles << 4) | (low_nibbles & 0x0F)
+
+    return packed_tensor.to(torch.int8)
+
+
+def pack_interleave(num_experts, ref_weight, ref_scale):
+    n, k = ref_weight.shape[1], ref_weight.shape[2]
+
+    weight = pack_int4_values_to_int8(ref_weight.cpu()).cuda()
+    w_q = weight.view((num_experts, n, k // 2)).view(torch.int8)
+    w_q = w_q.contiguous()
+
+    scale_interleaved = ref_scale.reshape(
+        ref_scale.shape[0], ref_scale.shape[1], (ref_scale.shape[2] // 4), 4
+    )  # [E, N, K/4, 4]
+    scale_interleaved = scale_interleaved.permute(0, 2, 1, 3)  # [E, K/4, N, 4]
+    scale_interleaved = scale_interleaved.reshape(
+        ref_scale.shape[0], ref_scale.shape[2] // 4, ref_scale.shape[1] * 4
+    )  # [E, K/4, N*4]
+    w_scale = scale_interleaved.contiguous()
+
+    return w_q, w_scale
+
+
+@pytest.mark.parametrize("M", [1, 2, 4, 8, 16])
+@pytest.mark.parametrize("N", [2048])
+@pytest.mark.parametrize("K", [7168])
+@pytest.mark.parametrize("E", [256])
+@pytest.mark.parametrize("ep_size", [8])
+@pytest.mark.parametrize("topk", [8])
+@pytest.mark.parametrize("group_size", [128])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
+    local_e = E // ep_size
+
+    debug = False
+    if debug:
+        a = torch.ones((M, K), dtype=dtype, device="cuda") * 0.001
+        ref_weight_1 = torch.ones((local_e, N * 2, K), dtype=torch.int8, device="cuda")
+        ref_weight_2 = torch.ones((local_e, K, N), dtype=torch.int8, device="cuda")
+        a1_scale = torch.ones(1, dtype=torch.float32, device="cuda")
+        a2_scale = torch.ones(1, dtype=torch.float32, device="cuda")
+        scale_1 = torch.ones(
+            (local_e, N * 2, K // group_size), dtype=dtype, device="cuda"
+        )
+        scale_2 = torch.ones((local_e, K, N // group_size), dtype=dtype, device="cuda")
+    else:
+        a = torch.randn(M, K, dtype=dtype, device="cuda")
+        ref_weight_1 = torch.randint(
+            -8, 8, (local_e, N * 2, K), dtype=torch.int8, device="cuda"
+        )
+        ref_weight_2 = torch.randint(
+            -8, 8, (local_e, K, N), dtype=torch.int8, device="cuda"
+        )
+        affine_coeff = 0.005
+        a1_scale = torch.randn(1, dtype=torch.float32, device="cuda")
+        a2_scale = torch.randn(1, dtype=torch.float32, device="cuda")
+        scale_1 = (
+            torch.randn(local_e, N * 2, K // group_size, dtype=dtype, device="cuda")
+            * affine_coeff
+        )
+        scale_2 = (
+            torch.randn(local_e, K, N // group_size, dtype=dtype, device="cuda")
+            * affine_coeff
+        )
+
+    w1_q, w1_scale = pack_interleave(local_e, ref_weight_1, scale_1)
+    w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+
+    device = "cuda"
+    a_strides1 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
+    c_strides1 = torch.full((local_e, 3), 2 * N, device=device, dtype=torch.int64)
+    a_strides2 = torch.full((local_e, 3), N, device=device, dtype=torch.int64)
+    c_strides2 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
+    b_strides1 = a_strides1
+    s_strides13 = c_strides1
+    b_strides2 = a_strides2
+    s_strides2 = c_strides2
+
+    score = torch.randn((M, E), dtype=dtype, device=device)
+    topk_weights, topk_ids = select_experts(
+        hidden_states=a,
+        router_logits=score,
+        top_k=topk,
+        use_grouped_topk=False,
+        renormalize=False,
+    )
+    expert_map = torch.arange(E, dtype=torch.int32, device=device)
+    expert_map[local_e:] = E
+
+    output = cutlass_moe(
+        a,
+        w1_q,
+        w2_q,
+        w1_scale,
+        w2_scale,
+        topk_weights,
+        topk_ids,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides13,
+        s_strides2,
+        0,
+        local_e - 1,
+        E,
+        a1_scale,
+        a2_scale,
+        expert_map,
+    )
+
+    ref_output = ref(
+        a,
+        local_e,
+        topk_weights,
+        topk_ids,
+        ref_weight_1,
+        ref_weight_2,
+        scale_1,
+        scale_2,
+        has_pre_quant=True,
+        has_alpha=True,
+        pre_quant_scale_1=a1_scale,
+        pre_quant_scale_2=a2_scale,
+        alpha_1=a1_scale,
+        alpha_2=a2_scale,
+    )
+
+    # compare
+    torch.cuda.synchronize()
+
+    # compare final output
+    torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.1)
+    print("SUCCESS: Final output tensors are close.")
+
+
+def cutlass_moe(
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids_: torch.Tensor,
+    a_strides1: torch.Tensor,
+    b_strides1: torch.Tensor,
+    c_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides13: torch.Tensor,
+    s_strides2: torch.Tensor,
+    start_expert_id: int,
+    end_expert_id: int,
+    E: int,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    expert_map: Optional[torch.Tensor] = None,
+    apply_router_weight_on_input: bool = False,
+):
+    local_topk_ids = topk_ids_
+    local_topk_ids = torch.where(expert_map[topk_ids_] != E, expert_map[topk_ids_], E)
+    device = a.device
+
+    local_num_experts = end_expert_id - start_expert_id + 1
+    expert_offsets = torch.empty(
+        (local_num_experts + 1), dtype=torch.int32, device=device
+    )
+    problem_sizes1 = torch.empty(
+        (local_num_experts, 3), dtype=torch.int32, device=device
+    )
+    problem_sizes2 = torch.empty(
+        (local_num_experts, 3), dtype=torch.int32, device=device
+    )
+    return cutlass_w4a8_moe(
+        start_expert_id,
+        end_expert_id,
+        E,
+        a,
+        w1_q,
+        w2_q,
+        w1_scale,
+        w2_scale,
+        topk_weights,
+        topk_ids_,
+        local_topk_ids,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides13,
+        s_strides2,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a1_scale,
+        a2_scale,
+        apply_router_weight_on_input,
+    )
+
+
+def ref(
+    x: torch.Tensor,
+    num_experts: int,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    ref_weight_1: torch.Tensor,
+    ref_weight_2: torch.Tensor,
+    ref_weight_scale_1: torch.Tensor,
+    ref_weight_scale_2: torch.Tensor,
+    has_pre_quant: bool = False,
+    has_alpha: bool = False,
+    pre_quant_scale_1: Optional[torch.Tensor] = None,
+    pre_quant_scale_2: Optional[torch.Tensor] = None,
+    alpha_1: Optional[torch.Tensor] = None,
+    alpha_2: Optional[torch.Tensor] = None,
+):
+    results = torch.zeros_like(x)
+    dtype = x.dtype
+    for e_idx in range(num_experts):
+        mask = topk_ids == e_idx
+        activated_tokens = mask.sum(1).bool()
+        act = x[activated_tokens, :]
+        if act.shape[0] == 0:
+            continue
+        final_scale = (topk_weights * mask).sum(1)[activated_tokens].unsqueeze(1)
+
+        act = (
+            torch.clamp((act / pre_quant_scale_1.float()), -448.0, 448.0)
+            .to(torch.float8_e4m3fn)
+            .to(dtype)
+        )
+        w3_w1 = ref_weight_1[e_idx]
+        ref_w_scale_repeat = (
+            ref_weight_scale_1[e_idx].repeat_interleave(128, dim=1).to(float)
+        )
+        w3_w1 = (w3_w1.to(float) * ref_w_scale_repeat).to(dtype)
+        fc1 = ((torch.matmul(act, w3_w1.T)) * alpha_1).to(torch.float16)
+
+        gate, fc1 = fc1.chunk(2, dim=-1)
+        fc1 = fc1 * torch.nn.functional.silu(gate)
+        act = (fc1 / pre_quant_scale_2.float()).to(torch.float8_e4m3fn)
+        act = act.to(dtype)
+
+        w2 = ref_weight_2[e_idx]
+        ref_w_scale_repeat = (
+            ref_weight_scale_2[e_idx].repeat_interleave(128, dim=1).to(float)
+        )
+        w2 = (w2.to(float) * ref_w_scale_repeat).to(dtype)
+        fc2 = (torch.matmul(act, w2.T) * alpha_2).to(torch.float16)
+
+        results[activated_tokens, :] += (fc2 * final_scale).to(results.dtype)
+
+    return results
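For intuition about the packed layout exercised above: pack_int4_values_to_int8 stores two signed int4 values per int8 byte, with the even-indexed element in the low nibble. A standalone, CPU-only check of that nibble arithmetic (mirroring the function rather than importing the CUDA-dependent test module):

import torch

vals = torch.tensor([1, 2, -1, 3], dtype=torch.int8)  # interleaved int4 values
low, high = vals[0::2], vals[1::2]
packed = (high << 4) | (low & 0x0F)
# (2 << 4) | (1 & 0x0F) = 0x21 = 33; (3 << 4) | (-1 & 0x0F) = 0x3F = 63
assert packed.tolist() == [33, 63]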
sglang/test/test_utils.py CHANGED
@@ -5,6 +5,7 @@ import copy
 import logging
 import os
 import random
+import re
 import subprocess
 import threading
 import time
@@ -840,12 +841,23 @@ def run_bench_one_batch(model, other_args):
         print(f"Output: {output}", flush=True)
         print(f"Error: {error}", flush=True)
 
-        lastline = output.split("\n")[-3]
-        output_throughput = float(lastline.split(" ")[-2])
+        # Return prefill_latency, decode_throughput, decode_latency
+        prefill_line = output.split("\n")[-9]
+        decode_line = output.split("\n")[-3]
+        pattern = (
+            r"latency: (?P<latency>\d+\.\d+).*?throughput:\s*(?P<throughput>\d+\.\d+)"
+        )
+        match = re.search(pattern, prefill_line)
+        if match:
+            prefill_latency = float(match.group("latency"))
+        match = re.search(pattern, decode_line)
+        if match:
+            decode_latency = float(match.group("latency"))
+            decode_throughput = float(match.group("throughput"))
     finally:
         kill_process_tree(process.pid)
 
-    return output_throughput
+    return prefill_latency, decode_throughput, decode_latency
 
 
 def run_bench_offline_throughput(model, other_args):
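The named-group regex makes the benchmark parsing tolerant of extra tokens between the latency and throughput fields, instead of relying on hard-coded word positions. A quick check against sample log lines (the exact wording of bench_one_batch output is an assumption here):

import re

pattern = r"latency: (?P<latency>\d+\.\d+).*?throughput:\s*(?P<throughput>\d+\.\d+)"

# Hypothetical lines in the shape run_bench_one_batch now expects.
prefill_line = "Prefill. latency: 0.01221 s, throughput:  10476.08 token/s"
decode_line = "Decode.  median latency: 0.00277 s, median throughput:   361.89 token/s"

m = re.search(pattern, prefill_line)
assert m and float(m.group("latency")) == 0.01221
m = re.search(pattern, decode_line)
assert m and float(m.group("throughput")) == 361.89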
sglang/utils.py CHANGED
@@ -1,6 +1,5 @@
 """Common utilities"""
 
-import base64
 import importlib
 import json
 import logging
@@ -20,6 +19,7 @@ from json import dumps
 from typing import Any, Callable, List, Optional, Tuple, Type, Union
 
 import numpy as np
+import pybase64
 import requests
 from IPython.display import HTML, display
 from pydantic import BaseModel
@@ -148,15 +148,15 @@ def encode_image_base64(image_path: Union[str, bytes]):
     if isinstance(image_path, str):
         with open(image_path, "rb") as image_file:
             data = image_file.read()
-        return base64.b64encode(data).decode("utf-8")
+        return pybase64.b64encode(data).decode("utf-8")
     elif isinstance(image_path, bytes):
-        return base64.b64encode(image_path).decode("utf-8")
+        return pybase64.b64encode(image_path).decode("utf-8")
     else:
         # image_path is PIL.WebPImagePlugin.WebPImageFile
         image = image_path
        buffered = BytesIO()
         image.save(buffered, format="PNG")
-        return base64.b64encode(buffered.getvalue()).decode("utf-8")
+        return pybase64.b64encode(buffered.getvalue()).decode("utf-8")
 
 
 def encode_frame(frame):
@@ -223,7 +223,7 @@ def encode_video_base64(video_path: str, num_frames: int = 16):
     video_bytes = b"".join(encoded_frames)
 
     # Encode the concatenated bytes to base64
-    video_base64 = "video:" + base64.b64encode(video_bytes).decode("utf-8")
+    video_base64 = "video:" + pybase64.b64encode(video_bytes).decode("utf-8")
 
     return video_base64
 
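pybase64 is a SIMD-accelerated drop-in for the standard library's base64 module, so the swap changes nothing at the call sites beyond the module name. A quick equivalence check:

import base64

import pybase64

data = b"sglang"
# Same inputs, same outputs; pybase64 is just faster on large payloads.
assert pybase64.b64encode(data) == base64.b64encode(data) == b"c2dsYW5n"
assert pybase64.b64decode(b"c2dsYW5n") == data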
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.8.post1"
+__version__ = "0.4.9.post1"
{sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.8.post1
+Version: 0.4.9.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -219,6 +219,7 @@ Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
 Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
+Requires-Dist: build; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
@@ -238,24 +239,26 @@ Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: pynvml; extra == "runtime-common"
+Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.52.3; extra == "runtime-common"
+Requires-Dist: transformers==4.53.0; extra == "runtime-common"
+Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.20; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.9; extra == "srt"
+Requires-Dist: sgl-kernel==0.2.4; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: einops; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.7.post1; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -264,7 +267,7 @@ Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
 Requires-Dist: torchvision==0.22.1; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.6.post1; extra == "blackwell"
+Requires-Dist: flashinfer_python==0.2.7.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -295,7 +298,6 @@ Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
 Requires-Dist: peft; extra == "test"
-Requires-Dist: timm; extra == "test"
 Requires-Dist: sentence_transformers; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
@@ -373,6 +375,8 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
+- [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
@@ -416,7 +420,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 
 ## Adoption and Sponsorship
-SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
+SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
 
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
 