sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +6 -1
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +8 -7
  6. sglang/srt/disaggregation/decode.py +8 -4
  7. sglang/srt/disaggregation/mooncake/conn.py +43 -25
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  9. sglang/srt/distributed/parallel_state.py +4 -2
  10. sglang/srt/entrypoints/context.py +3 -20
  11. sglang/srt/entrypoints/engine.py +13 -8
  12. sglang/srt/entrypoints/harmony_utils.py +2 -0
  13. sglang/srt/entrypoints/http_server.py +68 -5
  14. sglang/srt/entrypoints/openai/protocol.py +2 -9
  15. sglang/srt/entrypoints/openai/serving_chat.py +60 -265
  16. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +4 -3
  18. sglang/srt/function_call/ebnf_composer.py +1 -0
  19. sglang/srt/function_call/function_call_parser.py +2 -0
  20. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  21. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  22. sglang/srt/function_call/kimik2_detector.py +3 -3
  23. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  24. sglang/srt/jinja_template_utils.py +6 -0
  25. sglang/srt/layers/attention/aiter_backend.py +370 -107
  26. sglang/srt/layers/attention/ascend_backend.py +3 -0
  27. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  28. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  29. sglang/srt/layers/attention/flashinfer_backend.py +55 -13
  30. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
  31. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  32. sglang/srt/layers/attention/triton_backend.py +24 -27
  33. sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
  34. sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
  35. sglang/srt/layers/attention/vision.py +9 -1
  36. sglang/srt/layers/attention/wave_backend.py +627 -0
  37. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  38. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  39. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  40. sglang/srt/layers/communicator.py +11 -13
  41. sglang/srt/layers/dp_attention.py +118 -27
  42. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  43. sglang/srt/layers/linear.py +1 -0
  44. sglang/srt/layers/logits_processor.py +12 -18
  45. sglang/srt/layers/moe/cutlass_moe.py +11 -16
  46. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  47. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  48. sglang/srt/layers/moe/ep_moe/layer.py +60 -2
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
  63. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  64. sglang/srt/layers/moe/topk.py +4 -1
  65. sglang/srt/layers/multimodal.py +156 -40
  66. sglang/srt/layers/quantization/__init__.py +10 -35
  67. sglang/srt/layers/quantization/awq.py +15 -16
  68. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
  69. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  70. sglang/srt/layers/quantization/fp8_utils.py +22 -10
  71. sglang/srt/layers/quantization/gptq.py +12 -17
  72. sglang/srt/layers/quantization/marlin_utils.py +15 -5
  73. sglang/srt/layers/quantization/modelopt_quant.py +58 -41
  74. sglang/srt/layers/quantization/mxfp4.py +20 -3
  75. sglang/srt/layers/quantization/utils.py +52 -2
  76. sglang/srt/layers/quantization/w4afp8.py +20 -11
  77. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  78. sglang/srt/layers/rotary_embedding.py +281 -2
  79. sglang/srt/layers/sampler.py +5 -2
  80. sglang/srt/lora/backend/base_backend.py +3 -23
  81. sglang/srt/lora/layers.py +66 -116
  82. sglang/srt/lora/lora.py +17 -62
  83. sglang/srt/lora/lora_manager.py +12 -48
  84. sglang/srt/lora/lora_registry.py +20 -9
  85. sglang/srt/lora/mem_pool.py +20 -63
  86. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  87. sglang/srt/lora/utils.py +25 -58
  88. sglang/srt/managers/cache_controller.py +24 -29
  89. sglang/srt/managers/detokenizer_manager.py +1 -1
  90. sglang/srt/managers/io_struct.py +20 -6
  91. sglang/srt/managers/mm_utils.py +1 -2
  92. sglang/srt/managers/multimodal_processor.py +1 -1
  93. sglang/srt/managers/schedule_batch.py +43 -49
  94. sglang/srt/managers/schedule_policy.py +6 -6
  95. sglang/srt/managers/scheduler.py +18 -11
  96. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  97. sglang/srt/managers/tokenizer_manager.py +53 -44
  98. sglang/srt/mem_cache/allocator.py +39 -214
  99. sglang/srt/mem_cache/allocator_ascend.py +158 -0
  100. sglang/srt/mem_cache/chunk_cache.py +1 -1
  101. sglang/srt/mem_cache/hicache_storage.py +1 -1
  102. sglang/srt/mem_cache/hiradix_cache.py +34 -24
  103. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  104. sglang/srt/mem_cache/memory_pool_host.py +33 -35
  105. sglang/srt/mem_cache/radix_cache.py +2 -5
  106. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  107. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  108. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  109. sglang/srt/model_executor/cuda_graph_runner.py +29 -23
  110. sglang/srt/model_executor/forward_batch_info.py +33 -14
  111. sglang/srt/model_executor/model_runner.py +179 -81
  112. sglang/srt/model_loader/loader.py +18 -6
  113. sglang/srt/models/deepseek_nextn.py +2 -1
  114. sglang/srt/models/deepseek_v2.py +79 -38
  115. sglang/srt/models/gemma2.py +0 -34
  116. sglang/srt/models/gemma3n_mm.py +8 -9
  117. sglang/srt/models/glm4.py +6 -0
  118. sglang/srt/models/glm4_moe.py +11 -11
  119. sglang/srt/models/glm4_moe_nextn.py +2 -1
  120. sglang/srt/models/glm4v.py +589 -0
  121. sglang/srt/models/glm4v_moe.py +400 -0
  122. sglang/srt/models/gpt_oss.py +142 -20
  123. sglang/srt/models/granite.py +0 -25
  124. sglang/srt/models/llama.py +10 -27
  125. sglang/srt/models/llama4.py +19 -6
  126. sglang/srt/models/qwen2.py +2 -2
  127. sglang/srt/models/qwen2_5_vl.py +7 -3
  128. sglang/srt/models/qwen2_audio.py +10 -9
  129. sglang/srt/models/qwen2_moe.py +20 -5
  130. sglang/srt/models/qwen3.py +0 -24
  131. sglang/srt/models/qwen3_classification.py +78 -0
  132. sglang/srt/models/qwen3_moe.py +18 -5
  133. sglang/srt/models/registry.py +1 -1
  134. sglang/srt/models/step3_vl.py +6 -2
  135. sglang/srt/models/torch_native_llama.py +0 -24
  136. sglang/srt/multimodal/processors/base_processor.py +23 -13
  137. sglang/srt/multimodal/processors/glm4v.py +132 -0
  138. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  139. sglang/srt/operations.py +17 -2
  140. sglang/srt/reasoning_parser.py +316 -0
  141. sglang/srt/sampling/sampling_batch_info.py +7 -4
  142. sglang/srt/server_args.py +142 -140
  143. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
  144. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  145. sglang/srt/speculative/eagle_worker.py +16 -0
  146. sglang/srt/two_batch_overlap.py +16 -12
  147. sglang/srt/utils.py +3 -3
  148. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  149. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  150. sglang/test/doc_patch.py +59 -0
  151. sglang/test/few_shot_gsm8k.py +1 -1
  152. sglang/test/few_shot_gsm8k_engine.py +1 -1
  153. sglang/test/run_eval.py +4 -1
  154. sglang/test/simple_eval_common.py +6 -0
  155. sglang/test/simple_eval_gpqa.py +2 -0
  156. sglang/test/test_fp4_moe.py +118 -36
  157. sglang/test/test_marlin_moe.py +1 -1
  158. sglang/test/test_marlin_utils.py +1 -1
  159. sglang/utils.py +1 -1
  160. sglang/version.py +1 -1
  161. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
  162. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
  163. sglang/lang/backend/__init__.py +0 -0
  164. sglang/srt/function_call/harmony_tool_parser.py +0 -130
  165. sglang/srt/layers/quantization/scalar_type.py +0 -352
  166. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  167. /sglang/{api.py → lang/api.py} +0 -0
  168. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
  169. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
  170. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0
sglang/test/test_fp4_moe.py CHANGED
@@ -1,6 +1,9 @@
  # SPDX-License-Identifier: Apache-2.0
+ from typing import Callable
+
  import pytest
  import torch
+ from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
  from sgl_kernel import scaled_fp4_quant

  from sglang.srt.layers.activation import SiluAndMul
@@ -111,15 +114,16 @@ def torch_moe(a, w1, w2, score, topk, expert_map):
      ).sum(dim=1)


- @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
- @pytest.mark.parametrize("e", [40, 64, 256])
- @pytest.mark.parametrize("topk", [1, 6, 8])
- @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
- @torch.inference_mode()
- def test_cutlass_fp4_moe_no_graph(
-     m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+ def check_moe(
+     m: int,
+     n: int,
+     k: int,
+     e: int,
+     topk: int,
+     dtype: torch.dtype,
+     moe_impl: Callable,
+     flip_w13: bool,
  ):
-
      torch.manual_seed(7)
      a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
      w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
@@ -167,38 +171,18 @@ def test_cutlass_fp4_moe_no_graph(

      a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
      a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
-     # strides for the cutlass moe_fp4 kernel
-     ab_strides_13 = torch.full(
-         (e,), w1_q.shape[2] * 2, dtype=torch.int64, device=w1_q.device
-     )
-     c_strides_13 = torch.full(
-         (e,), w1_q.shape[1], dtype=torch.int64, device=w1_q.device
-     )
-     ab_strides_2 = torch.full(
-         (e,), w2_q.shape[2] * 2, dtype=torch.int64, device=w2_q.device
-     )
-     c_strides_2 = torch.full((e,), w2_q.shape[1], dtype=torch.int64, device=w2_q.device)
-     params = CutlassMoEParams(
-         CutlassMoEType.BlockscaledFP4,
-         device=a.device,
-         num_experts=e,
-         intermediate_size_per_partition=n,  # n
-         hidden_size=k,
-     )  # k
-     cutlass_output = cutlass_moe_fp4(
+     test_output = moe_impl(
          a=a,
-         a1_gscale=a1_gs,
-         w1_fp4=w1_q,
+         topk_weights=topk_weights,
+         topk_ids=topk_ids,
+         w1_q=w1_q,
+         w2_q=w2_q,
+         a1_gs=a1_gs,
          w1_blockscale=w1_blockscale,
          w1_alphas=(1 / w1_gs),
-         a2_gscale=a2_gs,
-         w2_fp4=w2_q,
+         a2_gs=a2_gs,
          w2_blockscale=w2_blockscale,
          w2_alphas=(1 / w2_gs),
-         topk_weights=topk_weights,
-         topk_ids=topk_ids,
-         params=params,
-         apply_router_weight_on_input=False,
      )

      # Reference check:
@@ -237,10 +221,108 @@ def test_cutlass_fp4_moe_no_graph(
          block_size=quant_blocksize,
      )

+     if flip_w13:
+         dim = -2
+         size = w1_d.size(dim)
+         assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
+         half = size // 2
+         # Reorder weight
+         w1, w3 = w1_d.split(half, dim=dim)
+         w1_d = torch.cat([w3, w1], dim=dim).contiguous()
+
      torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk, None)

-     torch.testing.assert_close(torch_output, cutlass_output, atol=1e-1, rtol=1e-1)
+     torch.testing.assert_close(torch_output, test_output, atol=1e-1, rtol=1e-1)
+
+
+ @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+ @pytest.mark.parametrize("e", [40, 64, 256])
+ @pytest.mark.parametrize("topk", [1, 6, 8])
+ @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+ @torch.inference_mode()
+ def test_cutlass_fp4_moe_no_graph(
+     m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+ ):
+     def cutlass_moe_impl(
+         a,
+         topk_weights,
+         topk_ids,
+         w1_q,
+         w2_q,
+         a1_gs,
+         w1_blockscale,
+         w1_alphas,
+         a2_gs,
+         w2_blockscale,
+         w2_alphas,
+     ):
+         params = CutlassMoEParams(
+             CutlassMoEType.BlockscaledFP4,
+             device=a.device,
+             num_experts=e,
+             intermediate_size_per_partition=n,  # n
+             hidden_size=k,
+         )  # k
+         return cutlass_moe_fp4(
+             a=a,
+             a1_gscale=a1_gs,
+             w1_fp4=w1_q,
+             w1_blockscale=w1_blockscale,
+             w1_alphas=w1_alphas,
+             a2_gscale=a2_gs,
+             w2_fp4=w2_q,
+             w2_blockscale=w2_blockscale,
+             w2_alphas=w2_alphas,
+             topk_weights=topk_weights,
+             topk_ids=topk_ids,
+             params=params,
+             apply_router_weight_on_input=False,
+         )
+
+     check_moe(m, n, k, e, topk, dtype, cutlass_moe_impl, flip_w13=False)
+
+
+ @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+ @pytest.mark.parametrize("e", [40, 64, 256])
+ @pytest.mark.parametrize("topk", [1, 6, 8])
+ @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+ @torch.inference_mode()
+ def test_flashinfer_fp4_moe_no_graph(
+     m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+ ):
+     def flashinfer_moe_impl(
+         a,
+         topk_weights,
+         topk_ids,
+         w1_q,
+         w2_q,
+         a1_gs,
+         w1_blockscale,
+         w1_alphas,
+         a2_gs,
+         w2_blockscale,
+         w2_alphas,
+     ):
+         return flashinfer_cutlass_fused_moe(
+             a,
+             topk_ids.to(torch.int),
+             topk_weights,
+             w1_q.view(torch.long),
+             w2_q.view(torch.long),
+             a.dtype,
+             quant_scales=[
+                 a1_gs,
+                 w1_blockscale.view(torch.int32),
+                 w1_alphas,
+                 a2_gs,
+                 w2_blockscale.view(torch.int32),
+                 w2_alphas,
+             ],
+         )[0]
+
+     check_moe(m, n, k, e, topk, dtype, flashinfer_moe_impl, flip_w13=True)


  if __name__ == "__main__":
      test_cutlass_fp4_moe_no_graph(224, 1024, 1024, 256, 8, torch.half)
+     test_flashinfer_fp4_moe_no_graph(224, 1024, 1024, 256, 8, torch.half)
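The net effect of this refactor: the shared reference check moves into check_moe, parameterized by a moe_impl callable, so the CUTLASS and FlashInfer FP4 paths are validated against the same torch_moe baseline (flip_w13 reorders the two halves of the reference w1 for the FlashInfer layout). A minimal smoke run, as a sketch rather than part of the diff — it assumes a CUDA GPU with the sgl-kernel and flashinfer packages installed, and picks e=40 / topk=1 from the parametrize lists above:

```python
# Drive both refactored tests directly, the same way the __main__ block does.
import torch

from sglang.test.test_fp4_moe import (
    test_cutlass_fp4_moe_no_graph,
    test_flashinfer_fp4_moe_no_graph,
)

# Small expert count and topk keep the run fast; both are valid parametrize values.
test_cutlass_fp4_moe_no_graph(224, 1024, 1024, 40, 1, torch.bfloat16)
test_flashinfer_fp4_moe_no_graph(224, 1024, 1024, 40, 1, torch.bfloat16)
```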
sglang/test/test_marlin_moe.py CHANGED
@@ -4,9 +4,9 @@ from typing import Optional
  import pytest
  import torch
  from sgl_kernel import fused_marlin_moe
+ from sgl_kernel.scalar_type import ScalarType, scalar_types

  from sglang.srt.layers.activation import SiluAndMul
- from sglang.srt.layers.quantization.scalar_type import ScalarType, scalar_types
  from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize

sglang/test/test_marlin_utils.py CHANGED
@@ -10,13 +10,13 @@ from typing import Optional

  import numpy as np
  import torch
+ from sgl_kernel.scalar_type import ScalarType

  from sglang.srt.layers.quantization.marlin_utils import (
      GPTQ_MARLIN_TILE,
      marlin_permute_scales,
      marlin_zero_points,
  )
- from sglang.srt.layers.quantization.scalar_type import ScalarType
  from sglang.srt.layers.quantization.utils import (
      get_pack_factor,
      gptq_quantize_weights,
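Both marlin test hunks make the same one-line move, and the deleted sglang/srt/layers/quantization/scalar_type.py (file 165 in the list above, -352 lines) explains why: ScalarType now ships with the sgl-kernel package instead of being vendored inside sglang. The swap in isolation:

```python
# Old import path (0.5.0rc0), removed along with the vendored module:
# from sglang.srt.layers.quantization.scalar_type import ScalarType, scalar_types

# New import path (0.5.0rc2): the type is provided by sgl-kernel.
from sgl_kernel.scalar_type import ScalarType, scalar_types
```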
sglang/utils.py CHANGED
@@ -458,7 +458,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
              NOTE: Typically, the server runs in a separate terminal.
              In this notebook, we run the server and notebook code together, so their outputs are combined.
              To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
-             We are running those notebooks in a CI parallel environment, so the throughput is not representative of the actual performance.
+             We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
              """
          )
          break
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.5.0rc0"
+ __version__ = "0.5.0rc2"
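After upgrading, a quick way to confirm which release candidate is actually installed (a sketch using stdlib metadata, not code from this diff):

```python
# Verify the installed distribution matches the bumped __version__ above.
from importlib.metadata import version

assert version("sglang") == "0.5.0rc2", version("sglang")
```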
{sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sglang
- Version: 0.5.0rc0
+ Version: 0.5.0rc2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -208,7 +208,7 @@ Project-URL: Homepage, https://github.com/sgl-project/sglang
  Project-URL: Bug Tracker, https://github.com/sgl-project/sglang/issues
  Classifier: Programming Language :: Python :: 3
  Classifier: License :: OSI Approved :: Apache Software License
- Requires-Python: >=3.9
+ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: aiohttp
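The interpreter floor moves from 3.9 to 3.10. pip already enforces Requires-Python at install time; the hypothetical sketch below shows the equivalent fail-fast guard a launcher script could add:

```python
import sys

# sglang 0.5.0rc2 declares Requires-Python >=3.10; fail fast on older interpreters.
if sys.version_info < (3, 10):
    raise RuntimeError(
        f"sglang 0.5.0rc2 requires Python >= 3.10, found {sys.version.split()[0]}"
    )
```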
@@ -222,6 +222,7 @@ Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
  Requires-Dist: build; extra == "runtime-common"
  Requires-Dist: compressed-tensors; extra == "runtime-common"
  Requires-Dist: datasets; extra == "runtime-common"
+ Requires-Dist: einops; extra == "runtime-common"
  Requires-Dist: fastapi; extra == "runtime-common"
  Requires-Dist: hf_transfer; extra == "runtime-common"
  Requires-Dist: huggingface_hub; extra == "runtime-common"
@@ -230,6 +231,7 @@ Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
  Requires-Dist: modelscope; extra == "runtime-common"
  Requires-Dist: msgspec; extra == "runtime-common"
  Requires-Dist: ninja; extra == "runtime-common"
+ Requires-Dist: openai==1.99.1; extra == "runtime-common"
  Requires-Dist: openai-harmony==0.0.3; extra == "runtime-common"
  Requires-Dist: orjson; extra == "runtime-common"
  Requires-Dist: outlines==0.1.11; extra == "runtime-common"
@@ -246,21 +248,21 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: sentencepiece; extra == "runtime-common"
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
  Requires-Dist: scipy; extra == "runtime-common"
- Requires-Dist: torchao==0.9.0; extra == "runtime-common"
- Requires-Dist: transformers==4.55.0; extra == "runtime-common"
  Requires-Dist: timm==1.0.16; extra == "runtime-common"
+ Requires-Dist: tiktoken; extra == "runtime-common"
+ Requires-Dist: torchao==0.9.0; extra == "runtime-common"
+ Requires-Dist: transformers==4.55.2; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: xgrammar==0.1.22; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
- Requires-Dist: sgl-kernel==0.3.2; extra == "srt"
+ Requires-Dist: sgl-kernel==0.3.5; extra == "srt"
  Requires-Dist: torch==2.8.0; extra == "srt"
  Requires-Dist: torchaudio==2.8.0; extra == "srt"
  Requires-Dist: torchvision; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
- Requires-Dist: einops; extra == "srt"
- Requires-Dist: flashinfer_python==0.2.10; extra == "srt"
+ Requires-Dist: flashinfer_python==0.2.11.post3; extra == "srt"
  Provides-Extra: blackwell
  Requires-Dist: sglang[runtime_common]; extra == "blackwell"
  Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -268,21 +270,19 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
  Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
  Requires-Dist: torchvision; extra == "blackwell"
  Requires-Dist: cuda-python; extra == "blackwell"
- Requires-Dist: einops; extra == "blackwell"
- Requires-Dist: flashinfer_python==0.2.10; extra == "blackwell"
- Requires-Dist: tiktoken; extra == "blackwell"
- Requires-Dist: openai==1.99.1; extra == "blackwell"
+ Requires-Dist: flashinfer_python==0.2.11.post3; extra == "blackwell"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
  Requires-Dist: petit_kernel==0.0.2; extra == "srt-hip"
+ Requires-Dist: wave-lang==1.0.1; extra == "srt-hip"
+ Provides-Extra: srt-cpu
+ Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
+ Requires-Dist: einops; extra == "srt-cpu"
  Provides-Extra: srt-xpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
  Provides-Extra: srt-hpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
- Provides-Extra: srt-cpu
- Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
- Requires-Dist: einops; extra == "srt-cpu"
  Provides-Extra: srt-npu
  Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
  Provides-Extra: openai
@@ -293,11 +293,12 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
  Provides-Extra: litellm
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
  Provides-Extra: torch-memory-saver
- Requires-Dist: torch_memory_saver>=0.0.8; extra == "torch-memory-saver"
+ Requires-Dist: torch_memory_saver==0.0.8; extra == "torch-memory-saver"
  Provides-Extra: decord
  Requires-Dist: decord; extra == "decord"
  Provides-Extra: test
  Requires-Dist: accelerate; extra == "test"
+ Requires-Dist: expecttest; extra == "test"
  Requires-Dist: jsonlines; extra == "test"
  Requires-Dist: matplotlib; extra == "test"
  Requires-Dist: pandas; extra == "test"
@@ -308,38 +309,32 @@ Provides-Extra: all
  Requires-Dist: sglang[srt]; extra == "all"
  Requires-Dist: sglang[openai]; extra == "all"
  Requires-Dist: sglang[anthropic]; extra == "all"
- Requires-Dist: sglang[litellm]; extra == "all"
  Requires-Dist: sglang[torch_memory_saver]; extra == "all"
  Requires-Dist: sglang[decord]; extra == "all"
  Provides-Extra: all-hip
  Requires-Dist: sglang[srt_hip]; extra == "all-hip"
  Requires-Dist: sglang[openai]; extra == "all-hip"
  Requires-Dist: sglang[anthropic]; extra == "all-hip"
- Requires-Dist: sglang[litellm]; extra == "all-hip"
  Requires-Dist: sglang[decord]; extra == "all-hip"
  Provides-Extra: all-xpu
  Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
  Requires-Dist: sglang[openai]; extra == "all-xpu"
  Requires-Dist: sglang[anthropic]; extra == "all-xpu"
- Requires-Dist: sglang[litellm]; extra == "all-xpu"
  Requires-Dist: sglang[decord]; extra == "all-xpu"
  Provides-Extra: all-hpu
  Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
  Requires-Dist: sglang[openai]; extra == "all-hpu"
  Requires-Dist: sglang[anthropic]; extra == "all-hpu"
- Requires-Dist: sglang[litellm]; extra == "all-hpu"
  Requires-Dist: sglang[decord]; extra == "all-hpu"
  Provides-Extra: all-cpu
  Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
  Requires-Dist: sglang[openai]; extra == "all-cpu"
  Requires-Dist: sglang[anthropic]; extra == "all-cpu"
- Requires-Dist: sglang[litellm]; extra == "all-cpu"
  Requires-Dist: sglang[decord]; extra == "all-cpu"
  Provides-Extra: all-npu
  Requires-Dist: sglang[srt_npu]; extra == "all-npu"
  Requires-Dist: sglang[openai]; extra == "all-npu"
  Requires-Dist: sglang[anthropic]; extra == "all-npu"
- Requires-Dist: sglang[litellm]; extra == "all-npu"
  Requires-Dist: sglang[decord]; extra == "all-npu"
  Provides-Extra: dev
  Requires-Dist: sglang[all]; extra == "dev"
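Note that the litellm extra itself survives (see the @@ -293 hunk) but is dropped from every aggregate all/all-* extra, so `sglang[all]` no longer installs litellm. A hedged runtime probe for code that used to rely on that transitive dependency:

```python
# litellm is no longer pulled in by `sglang[all]` as of 0.5.0rc2; detect it
# instead of assuming it is present, and point users at the dedicated extra.
from importlib.metadata import PackageNotFoundError, version

try:
    print("litellm", version("litellm"))
except PackageNotFoundError:
    print('litellm not installed; use: pip install "sglang[litellm]"')
```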
@@ -376,17 +371,17 @@ Dynamic: license-file
  | [**Documentation**](https://docs.sglang.ai/)
  | [**Join Slack**](https://slack.sglang.ai/)
  | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
- | [**Roadmap**](https://github.com/sgl-project/sglang/issues/4042)
+ | [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

  ## News
+ - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
  - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
  - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
  - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
  - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
  - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
  - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
- - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).

  <details>
  <summary>More</summary>
@@ -395,6 +390,7 @@ Dynamic: license-file
  - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
  - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
  - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+ - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -406,17 +402,17 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
  The core features include:

- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
- - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+ - **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.

  ## Getting Started
- - [Install SGLang](https://docs.sglang.ai/start/install.html)
- - [Quick Start](https://docs.sglang.ai/backend/send_request.html)
- - [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html)
- - [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html)
- - [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)
+ - [Install SGLang](https://docs.sglang.ai/get_started/install.html)
+ - [Quick Start](https://docs.sglang.ai/basic_usage/send_request.html)
+ - [Backend Tutorial](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
+ - [Frontend Tutorial](https://docs.sglang.ai/references/frontend/frontend_tutorial.html)
+ - [Contribution Guide](https://docs.sglang.ai/developer_guide/contribution_guide.html)

  ## Benchmark and Performance
  Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).