sglang 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. sglang/bench_one_batch.py +0 -7
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +25 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -2
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +29 -4
  24. sglang/srt/entrypoints/http_server.py +76 -0
  25. sglang/srt/entrypoints/openai/protocol.py +4 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +23 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +10 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +14 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
  37. sglang/srt/layers/attention/triton_backend.py +109 -73
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +58 -10
  46. sglang/srt/layers/dp_attention.py +137 -27
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +16 -18
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  71. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  72. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  73. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  75. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  76. sglang/srt/layers/moe/router.py +15 -9
  77. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  78. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  79. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  80. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  81. sglang/srt/layers/moe/topk.py +167 -83
  82. sglang/srt/layers/moe/utils.py +159 -18
  83. sglang/srt/layers/multimodal.py +156 -40
  84. sglang/srt/layers/quantization/__init__.py +18 -46
  85. sglang/srt/layers/quantization/awq.py +22 -23
  86. sglang/srt/layers/quantization/base_config.py +2 -6
  87. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  88. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
  89. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
  90. sglang/srt/layers/quantization/fp8.py +127 -119
  91. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  92. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  93. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  94. sglang/srt/layers/quantization/gptq.py +17 -21
  95. sglang/srt/layers/quantization/marlin_utils.py +26 -8
  96. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  97. sglang/srt/layers/quantization/modelopt_quant.py +217 -98
  98. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  99. sglang/srt/layers/quantization/mxfp4.py +222 -39
  100. sglang/srt/layers/quantization/quark/quark.py +390 -0
  101. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  102. sglang/srt/layers/quantization/unquant.py +34 -70
  103. sglang/srt/layers/quantization/utils.py +77 -2
  104. sglang/srt/layers/quantization/w4afp8.py +7 -8
  105. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  106. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  107. sglang/srt/layers/radix_attention.py +6 -0
  108. sglang/srt/layers/rotary_embedding.py +1 -0
  109. sglang/srt/layers/sampler.py +5 -2
  110. sglang/srt/lora/layers.py +6 -2
  111. sglang/srt/lora/lora_manager.py +21 -22
  112. sglang/srt/lora/lora_registry.py +3 -3
  113. sglang/srt/lora/mem_pool.py +26 -24
  114. sglang/srt/lora/utils.py +10 -12
  115. sglang/srt/managers/cache_controller.py +80 -19
  116. sglang/srt/managers/detokenizer_manager.py +10 -2
  117. sglang/srt/managers/io_struct.py +23 -0
  118. sglang/srt/managers/mm_utils.py +1 -1
  119. sglang/srt/managers/schedule_batch.py +22 -48
  120. sglang/srt/managers/scheduler.py +28 -20
  121. sglang/srt/managers/session_controller.py +1 -1
  122. sglang/srt/managers/template_manager.py +7 -5
  123. sglang/srt/managers/tokenizer_manager.py +88 -39
  124. sglang/srt/managers/tp_worker.py +1 -0
  125. sglang/srt/managers/utils.py +59 -1
  126. sglang/srt/mem_cache/allocator.py +10 -157
  127. sglang/srt/mem_cache/allocator_ascend.py +147 -0
  128. sglang/srt/mem_cache/chunk_cache.py +1 -1
  129. sglang/srt/mem_cache/hicache_storage.py +14 -4
  130. sglang/srt/mem_cache/memory_pool.py +3 -3
  131. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  132. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  133. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  134. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  135. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  136. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  137. sglang/srt/model_executor/cuda_graph_runner.py +33 -33
  138. sglang/srt/model_executor/forward_batch_info.py +11 -10
  139. sglang/srt/model_executor/model_runner.py +93 -78
  140. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  141. sglang/srt/model_loader/loader.py +24 -6
  142. sglang/srt/models/dbrx.py +12 -6
  143. sglang/srt/models/deepseek.py +2 -1
  144. sglang/srt/models/deepseek_nextn.py +5 -2
  145. sglang/srt/models/deepseek_v2.py +226 -223
  146. sglang/srt/models/ernie4.py +2 -2
  147. sglang/srt/models/glm4_moe.py +27 -65
  148. sglang/srt/models/glm4_moe_nextn.py +2 -1
  149. sglang/srt/models/glm4v.py +52 -1
  150. sglang/srt/models/glm4v_moe.py +8 -11
  151. sglang/srt/models/gpt_oss.py +41 -76
  152. sglang/srt/models/granitemoe.py +0 -1
  153. sglang/srt/models/grok.py +376 -48
  154. sglang/srt/models/interns1.py +12 -47
  155. sglang/srt/models/internvl.py +6 -51
  156. sglang/srt/models/llama.py +10 -2
  157. sglang/srt/models/llama4.py +18 -7
  158. sglang/srt/models/minicpm3.py +0 -1
  159. sglang/srt/models/mixtral.py +0 -2
  160. sglang/srt/models/nemotron_nas.py +435 -0
  161. sglang/srt/models/olmoe.py +0 -1
  162. sglang/srt/models/phi4mm.py +3 -21
  163. sglang/srt/models/qwen2.py +2 -2
  164. sglang/srt/models/qwen2_5_vl.py +2 -0
  165. sglang/srt/models/qwen2_moe.py +23 -23
  166. sglang/srt/models/qwen3.py +2 -2
  167. sglang/srt/models/qwen3_classification.py +84 -0
  168. sglang/srt/models/qwen3_moe.py +27 -43
  169. sglang/srt/models/step3_vl.py +8 -3
  170. sglang/srt/models/xverse_moe.py +11 -5
  171. sglang/srt/multimodal/processors/base_processor.py +3 -3
  172. sglang/srt/multimodal/processors/internvl.py +7 -2
  173. sglang/srt/multimodal/processors/llava.py +11 -7
  174. sglang/srt/offloader.py +433 -0
  175. sglang/srt/operations.py +22 -2
  176. sglang/srt/reasoning_parser.py +4 -3
  177. sglang/srt/sampling/sampling_batch_info.py +7 -4
  178. sglang/srt/server_args.py +264 -105
  179. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
  180. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  181. sglang/srt/speculative/eagle_utils.py +36 -13
  182. sglang/srt/speculative/eagle_worker.py +56 -3
  183. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  184. sglang/srt/two_batch_overlap.py +20 -19
  185. sglang/srt/utils.py +68 -70
  186. sglang/test/runners.py +8 -5
  187. sglang/test/test_block_fp8.py +5 -6
  188. sglang/test/test_block_fp8_ep.py +13 -19
  189. sglang/test/test_cutlass_moe.py +4 -6
  190. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  191. sglang/test/test_fp4_moe.py +4 -3
  192. sglang/test/test_marlin_moe.py +1 -1
  193. sglang/test/test_marlin_utils.py +1 -1
  194. sglang/test/test_utils.py +7 -0
  195. sglang/utils.py +0 -1
  196. sglang/version.py +1 -1
  197. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
  198. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
  199. sglang/srt/layers/quantization/fp4.py +0 -557
  200. sglang/srt/layers/quantization/scalar_type.py +0 -352
  201. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
  202. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
  203. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -438,70 +438,6 @@ def is_pin_memory_available() -> bool:
     return torch.cuda.is_available()
 
 
-_CPU_OFFLOAD_BYTES = 0
-_CPU_OFFLOAD_MAX_BYTES = 0
-
-
-def set_cpu_offload_max_bytes(max_bytes: int) -> None:
-    global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
-    _CPU_OFFLOAD_BYTES = 0
-    _CPU_OFFLOAD_MAX_BYTES = max_bytes
-
-
-def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
-    device = next(module.parameters()).device
-
-    if device == torch.device("cpu"):
-        return module
-
-    global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
-    if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
-        return module
-
-    pin_memory = is_pin_memory_available()
-    # offload parameters to CPU
-    # use pin_memory if possible, which helps cudagraph capture speed
-    offloaded_parameters = False
-    for p in module.parameters():
-        if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
-            # we use per-parameter offloading
-            # one module might have some parameters offloaded and some not
-            break
-
-        # `torch.empty_like` does not support `pin_memory` argument
-        cpu_data = torch.empty_strided(
-            size=p.data.size(),
-            stride=p.data.stride(),
-            dtype=p.data.dtype,
-            layout=p.data.layout,
-            device="cpu",
-            pin_memory=pin_memory,
-        )
-        cpu_data.copy_(p.data)
-        p.data = cpu_data
-        _CPU_OFFLOAD_BYTES += p.data.numel() * p.data.element_size()
-        offloaded_parameters = True
-
-    if offloaded_parameters:
-        original_forward = module.forward
-
-        def forward(*args, **kwargs):
-            module.forward = original_forward
-            device_state = {
-                # here we blindly call `to(device)`
-                # if the parameter is already on the device, it will be a no-op
-                k: v.to(device, non_blocking=True)
-                for k, v in module.state_dict().items()
-            }
-            output = functional_call(module, device_state, args=args, kwargs=kwargs)
-            module.forward = forward
-            return output
-
-        module.forward = forward
-
-    return module
-
-
 class LayerFn(Protocol):
 
     def __call__(self, layer_id: int, prefix: str) -> torch.nn.Module: ...
@@ -514,11 +450,13 @@ def make_layers(
     pp_size: Optional[int] = None,
     prefix: str = "",
     return_tuple: bool = False,
+    offloader_kwargs: Dict[str, Any] = {},
 ) -> Tuple[int, int, torch.nn.ModuleList]:
     """Make a list of layers with the given layer function"""
     # circula imports
     from sglang.srt.distributed import get_pp_indices
     from sglang.srt.layers.utils import PPMissingLayer
+    from sglang.srt.offloader import get_offloader
 
     assert not pp_size or num_hidden_layers >= pp_size
     start_layer, end_layer = (
@@ -532,10 +470,13 @@ def make_layers(
     )
     modules = torch.nn.ModuleList(
         [PPMissingLayer(return_tuple=return_tuple) for _ in range(start_layer)]
-        + [
-            maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
-            for idx in range(start_layer, end_layer)
-        ]
+        + get_offloader().wrap_modules(
+            (
+                layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
+                for idx in range(start_layer, end_layer)
+            ),
+            **offloader_kwargs,
+        )
         + [
             PPMissingLayer(return_tuple=return_tuple)
             for _ in range(end_layer, num_hidden_layers)
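
Editor's note on the hunks above: the fixed global CPU-offload budget is replaced by the pluggable offloader added in `sglang/srt/offloader.py`. For context, here is a minimal, self-contained sketch of the budgeted offload pattern the removed helpers implemented (the toy `Linear` layer, budget size, and function name are illustrative, not taken from the diff):

```python
import torch
from torch.func import functional_call

_budget = {"used": 0, "max": 1 * 1024**2}  # illustrative 1 MiB budget

def offload_to_cpu(module: torch.nn.Module, device: torch.device) -> torch.nn.Module:
    # Copy parameters to CPU tensors until the global byte budget is spent;
    # a module may end up only partially offloaded.
    for p in module.parameters():
        if _budget["used"] >= _budget["max"]:
            break
        cpu_data = torch.empty_like(p.data, device="cpu")
        cpu_data.copy_(p.data)
        p.data = cpu_data
        _budget["used"] += p.data.numel() * p.data.element_size()

    original_forward = module.forward

    def forward(*args, **kwargs):
        module.forward = original_forward  # avoid recursing into this wrapper
        # Re-materialize the state dict on the target device for this call only;
        # .to() is a no-op for tensors already resident on `device`.
        device_state = {
            k: v.to(device, non_blocking=True) for k, v in module.state_dict().items()
        }
        output = functional_call(module, device_state, args=args, kwargs=kwargs)
        module.forward = forward
        return output

    module.forward = forward
    return module

layer = offload_to_cpu(torch.nn.Linear(64, 64), device=torch.device("cpu"))
print(layer(torch.randn(2, 64)).shape)  # torch.Size([2, 64])
```

The new `get_offloader().wrap_modules(...)` call consumes a generator of freshly built layers instead, with behavior configured through `offloader_kwargs` (its accepted keys live in `offloader.py` and are not shown in this diff).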
@@ -2343,6 +2284,7 @@ def is_fa3_default_architecture(hf_config):
         "Qwen3ForCausalLM",
         "Qwen3MoeForCausalLM",
         "Glm4MoeForCausalLM",
+        "Glm4vMoeForConditionalGeneration",
         "Step3VLForConditionalGeneration",
     }
     return architectures[0] in default_archs
@@ -2413,7 +2355,7 @@ def require_mlp_tp_gather(server_args):
         return True
     elif not server_args.enable_dp_lm_head:
         return True
-    elif server_args.moe_a2a_backend is None:
+    elif server_args.moe_a2a_backend == "none":
         return True
     else:
         return (
@@ -2429,7 +2371,7 @@ def require_attn_tp_gather(server_args):
     Check if the input of attention is scattered.
     """
     assert server_args.moe_dense_tp_size in [1, None]
-    if server_args.moe_a2a_backend is not None or server_args.moe_dense_tp_size == 1:
+    if server_args.moe_a2a_backend != "none" or server_args.moe_dense_tp_size == 1:
         if server_args.enable_dp_attention:
             return server_args.dp_size < server_args.tp_size
         else:
@@ -2599,6 +2541,50 @@ def dynamic_import(func_path: str):
     return func
 
 
+def gc_object_counts():
+    import gc
+
+    g0 = len(gc.get_objects(0))
+    g1 = len(gc.get_objects(1))
+    g2 = len(gc.get_objects(2))
+    return g0, g1, g2
+
+
+def configure_gc_warning(warn_threshold_secs):
+    import gc
+
+    gc_start_time = {}
+
+    def gc_callback(phase, info):
+        gen = info.get("generation", "?")
+        if phase == "start":
+            gc_start_time[gen] = time.time()
+        elif phase == "stop":
+            duration = time.time() - gc_start_time.get(gen, time.time())
+            if duration > warn_threshold_secs:
+                g0, g1, g2 = gc_object_counts()
+                logger.warn(
+                    f"LONG GARBAGE COLLECTION DETECTED | Generation {gen} | Duration: {duration:.4f}s | # Objects: gen0={g0}, gen1={g1}, gen2={g2} | "
+                    f"This may cause latency jitter. Consider calling the freeze_gc API after sending a few warmup requests."
+                )
+
+    gc.callbacks.append(gc_callback)
+
+
+def freeze_gc(context: str):
+    import gc
+
+    g0_before, g1_before, g2_before = gc_object_counts()
+    gc.freeze()
+    g0_after, g1_after, g2_after = gc_object_counts()
+    logger.info(
+        f"Freezing GC in {context} process. "
+        f"gen0: {g0_before}->{g0_after}, "
+        f"gen1: {g1_before}->{g1_after}, "
+        f"gen2: {g2_before}->{g2_after}"
+    )
+
+
 def configure_gc_logger():
     logger.info("Enable GC Logger")
 
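A brief usage sketch for the GC helpers added above (the 0.1 s threshold and the "scheduler" context string are illustrative; both functions are defined in `sglang/srt/utils.py` by this diff):

```python
import gc

from sglang.srt.utils import configure_gc_warning, freeze_gc

# Log a warning whenever a single GC pass exceeds 100 ms.
configure_gc_warning(0.1)

# After a few warmup requests, move surviving long-lived objects into the
# permanent generation (wraps gc.freeze()), so later collections skip them.
freeze_gc("scheduler")

gc.collect()  # subsequent collections scan far fewer objects
```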
@@ -2872,6 +2858,8 @@ SUPPORTED_LORA_TARGET_MODULES = [
     "gate_proj",
     "up_proj",
     "down_proj",
+    "qkv_proj",
+    "gate_up_proj",
 ]
 
 LORA_TARGET_ALL_MODULES = "all"
@@ -2966,3 +2954,13 @@ class ConcurrentCounter:
 @lru_cache(maxsize=1)
 def is_triton_kernels_available() -> bool:
     return importlib.util.find_spec("triton_kernels") is not None
+
+
+def check_cuda_result(raw_output):
+    import cuda.bindings.runtime as cuda_rt
+
+    err, *results = raw_output
+    if err != cuda_rt.cudaError_t.cudaSuccess:
+        raise Exception(f"CUDA error: {err}")
+
+    return results
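
A hedged usage sketch for the new `check_cuda_result` helper: cuda-python runtime calls return `(err, *values)` tuples, and the helper raises on any non-success status and returns the remaining values. `cudaMemGetInfo` is used below only as an example call and requires the `cuda-python` bindings:

```python
import cuda.bindings.runtime as cuda_rt

from sglang.srt.utils import check_cuda_result

# cudaMemGetInfo() -> (cudaError_t, free_bytes, total_bytes)
free_bytes, total_bytes = check_cuda_result(cuda_rt.cudaMemGetInfo())
print(f"free: {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")
```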
sglang/test/runners.py CHANGED
@@ -231,11 +231,14 @@ class HFRunner:
 
         # Load the model and tokenizer
         if self.model_type == "generation":
-            config = AutoConfig.from_pretrained(model_path)
-            if model_archs := getattr(config, "architectures"):
-                model_cls = getattr(transformers, model_archs[0])
-            else:
+            config = AutoConfig.from_pretrained(
+                model_path, trust_remote_code=self.trust_remote_code
+            )
+            if self.trust_remote_code:
                 model_cls = AutoModelForCausalLM
+            else:
+                model_arch = getattr(config, "architectures")[0]
+                model_cls = getattr(transformers, model_arch)
             self.base_model = model_cls.from_pretrained(
                 model_path,
                 torch_dtype=torch_dtype,
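
The HFRunner hunk above flips the resolution order: with `trust_remote_code`, loading defers to `AutoModelForCausalLM` (which can import custom modeling code); otherwise the architecture class is looked up on `transformers` directly. A self-contained sketch of that logic (the helper name is ours, not the diff's):

```python
import transformers
from transformers import AutoConfig, AutoModelForCausalLM

def resolve_model_cls(model_path: str, trust_remote_code: bool):
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)
    if trust_remote_code:
        # Custom modeling code is only reachable through the Auto* classes.
        return AutoModelForCausalLM
    # e.g. architectures == ["LlamaForCausalLM"] -> transformers.LlamaForCausalLM
    return getattr(transformers, config.architectures[0])
```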
@@ -488,7 +491,7 @@ class SRTRunner:
         tp_size: int = 1,
         model_impl: str = "auto",
         port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
-        lora_paths: List[str] = None,
+        lora_paths: Optional[Union[List[str], List[dict[str, str]]]] = None,
         max_loras_per_batch: int = 4,
         attention_backend: Optional[str] = None,
         prefill_attention_backend: Optional[str] = None,
sglang/test/test_block_fp8.py CHANGED
@@ -6,7 +6,7 @@ import torch
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
-from sglang.srt.layers.moe.topk import select_experts
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
 from sglang.srt.layers.quantization.fp8_kernel import (
     per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
@@ -498,11 +498,13 @@ class TestW8A8BlockFP8FusedMoE(CustomTestCase):
         score = torch.randn((M, E), dtype=dtype)
 
         with torch.inference_mode():
+            ref_out = torch_w8a8_block_fp8_moe(
+                a, w1, w2, w1_s, w2_s, score, topk, block_size
+            )
             topk_output = select_experts(
                 hidden_states=a,
                 router_logits=score,
-                top_k=topk,
-                renormalize=False,
+                topk_config=TopKConfig(top_k=topk, renormalize=False),
             )
             out = fused_moe(
                 a,
@@ -514,9 +516,6 @@ class TestW8A8BlockFP8FusedMoE(CustomTestCase):
                 w2_scale=w2_s,
                 block_shape=block_size,
             )
-            ref_out = torch_w8a8_block_fp8_moe(
-                a, w1, w2, w1_s, w2_s, score, topk, block_size
-            )
 
         self.assertTrue(
             torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
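
The same `select_experts` migration recurs in the test files below. Its shape, reusing the tensors from the hunk above (only the `top_k` and `renormalize` fields of `TopKConfig` appear in this diff; others may exist):

```python
from sglang.srt.layers.moe.topk import TopKConfig, select_experts

# 0.5.0rc1: routing options were individual keyword arguments
# topk_weights, topk_ids, _ = select_experts(
#     hidden_states=a, router_logits=score, top_k=topk, renormalize=False
# )

# 0.5.1: options are bundled into a TopKConfig; the returned topk_output
# still unpacks into (topk_weights, topk_ids, ...) where tests need it
topk_output = select_experts(
    hidden_states=a,
    router_logits=score,
    topk_config=TopKConfig(top_k=topk, renormalize=False),
)
topk_weights, topk_ids, _ = topk_output
```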
sglang/test/test_block_fp8_ep.py CHANGED
@@ -12,7 +12,7 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
     run_moe_ep_preproess,
     silu_and_mul_triton_kernel,
 )
-from sglang.srt.layers.moe.topk import select_experts
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
 from sglang.test.test_utils import CustomTestCase
 
 
@@ -22,35 +22,26 @@ def ep_moe(
     w1: torch.Tensor,
     w2: torch.Tensor,
     router_logits: torch.Tensor,
-    top_k: int,
-    renormalize: bool,
+    topk_config: TopKConfig,
     # ep config
     num_experts: int = 256,
     fp8_dtype: torch.types = torch.float8_e4m3fn,
     num_experts_per_partition: int = 128,
     start_expert_id: int = 0,
     end_expert_id: int = 127,
-    use_grouped_topk: bool = False,
-    num_expert_group: Optional[int] = None,
-    topk_group: Optional[int] = None,
-    custom_routing_function: Optional[Callable] = None,
     use_fp8_w8a8: bool = False,
     w1_scale_inv: Optional[torch.Tensor] = None,
     w2_scale_inv: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
 ):
     use_blockwise_fp8 = block_shape is not None
-    topk_weights, topk_ids, _ = select_experts(
+    top_k = topk_config.top_k
+    topk_output = select_experts(
         hidden_states=hidden_states,
         router_logits=router_logits,
-        top_k=top_k,
-        use_grouped_topk=use_grouped_topk,
-        renormalize=renormalize,
-        topk_group=topk_group,
-        num_expert_group=num_expert_group,
-        # correction_bias=correction_bias, #skip this in test
-        custom_routing_function=custom_routing_function,
+        topk_config=topk_config,
     )
+    topk_weights, topk_ids, _ = topk_output
 
     reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(topk_ids, num_experts)
 
@@ -294,14 +285,18 @@ class TestW8A8BlockFP8EPMoE(CustomTestCase):
         start_id = cur_rank * num_experts_per_partition
         end_id = start_id + num_experts_per_partition - 1
 
+        topk_config = TopKConfig(
+            top_k=topk,
+            renormalize=False,
+        )
+
         with torch.inference_mode():
             out = ep_moe(
                 hidden_states=a,
                 w1=w1,
                 w2=w2,
                 router_logits=score,
-                top_k=topk,
-                renormalize=False,
+                topk_config=topk_config,
                 use_fp8_w8a8=True,
                 w1_scale_inv=w1_s,
                 w2_scale_inv=w2_s,
@@ -316,8 +311,7 @@
                 w1=w1_ref,
                 w2=w2_ref,
                 router_logits=score,
-                top_k=topk,
-                renormalize=False,
+                topk_config=topk_config,
                 use_fp8_w8a8=False,
                 w1_scale_inv=None,
                 w2_scale_inv=None,
sglang/test/test_cutlass_moe.py CHANGED
@@ -153,9 +153,8 @@ def run_test(tp_size, batch_size, model_config, check=False):
         x,
         w1,
         w2,
-        topk_weights,
-        topk_ids,
-        inplace=False,  # Use False for benchmarking to avoid side effects if run multiple times
+        (topk_weights, topk_ids, "dummy"),
+        inplace=False,
         activation="silu",  # Assuming SiLU activation common in MoEs
         use_fp8_w8a8=True,
         w1_scale=w1_scale,
@@ -221,8 +220,7 @@ def run_test(tp_size, batch_size, model_config, check=False):
         x,
         w1,  # Original shape
         w2,  # Original shape
-        topk_weights,
-        topk_ids,
+        (topk_weights, topk_ids, "dummy"),
         inplace=False,  # Important: Use False to get output tensor
         activation="silu",
         use_fp8_w8a8=True,
@@ -266,7 +264,7 @@ if __name__ == "__main__":
         "--batch-sizes",
         type=int,
         nargs="+",
-        default=[1, 4, 8, 16, 32, 64, 128, 256, 512],  # Adjusted default
+        default=[1, 4, 8, 16, 32, 64, 128, 256, 512, 1024],  # Adjusted default
         help="List of batch sizes to test",
     )
     parser.add_argument("--check", action="store_true", help="Enable check mode")
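
Related call-shape change in this benchmark: `fused_moe` now takes the routing result as a single tuple argument instead of separate tensors. A sketch using the script's own variables (the `"dummy"` element fills a slot of the topk output that this script never reads):

```python
# 0.5.0rc1: fused_moe(x, w1, w2, topk_weights, topk_ids, inplace=False, ...)
# 0.5.1: the topk output travels as one bundled argument
out = fused_moe(
    x,
    w1,
    w2,
    (topk_weights, topk_ids, "dummy"),
    inplace=False,
    activation="silu",
    use_fp8_w8a8=True,
    w1_scale=w1_scale,
    # ... remaining quantization kwargs as in the hunk above
)
```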
sglang/test/test_cutlass_w4a8_moe.py CHANGED
@@ -6,7 +6,7 @@ import pytest
 import torch
 
 from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
-from sglang.srt.layers.moe.topk import select_experts
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
 
 
 def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
@@ -100,11 +100,12 @@ def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
     s_strides2 = c_strides2
 
     score = torch.randn((M, E), dtype=dtype, device=device)
-    topk_weights, topk_ids, _ = select_experts(
+    topk_output = select_experts(
         hidden_states=a,
         router_logits=score,
-        top_k=topk,
+        topk_config=TopKConfig(top_k=topk, renormalize=False),
     )
+    topk_weights, topk_ids, _ = topk_output
     expert_map = torch.arange(E, dtype=torch.int32, device=device)
     expert_map[local_e:] = E
 
sglang/test/test_fp4_moe.py CHANGED
@@ -9,7 +9,7 @@ from sgl_kernel import scaled_fp4_quant
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
 from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
-from sglang.srt.layers.moe.topk import select_experts
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
 
 if torch.cuda.get_device_capability() < (10, 0):
     pytest.skip(
@@ -163,11 +163,12 @@ def check_moe(
 
     score = torch.randn((m, e), device="cuda", dtype=dtype)
 
-    topk_weights, topk_ids, _ = select_experts(
+    topk_output = select_experts(
         hidden_states=a,
         router_logits=score,
-        top_k=topk,
+        topk_config=TopKConfig(top_k=topk, renormalize=False),
     )
+    topk_weights, topk_ids, _ = topk_output
 
     a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
     a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
sglang/test/test_marlin_moe.py CHANGED
@@ -4,9 +4,9 @@ from typing import Optional
 import pytest
 import torch
 from sgl_kernel import fused_marlin_moe
+from sgl_kernel.scalar_type import ScalarType, scalar_types
 
 from sglang.srt.layers.activation import SiluAndMul
-from sglang.srt.layers.quantization.scalar_type import ScalarType, scalar_types
 from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize
 
 
sglang/test/test_marlin_utils.py CHANGED
@@ -10,13 +10,13 @@ from typing import Optional
 
 import numpy as np
 import torch
+from sgl_kernel.scalar_type import ScalarType
 
 from sglang.srt.layers.quantization.marlin_utils import (
     GPTQ_MARLIN_TILE,
     marlin_permute_scales,
     marlin_zero_points,
 )
-from sglang.srt.layers.quantization.scalar_type import ScalarType
 from sglang.srt.layers.quantization.utils import (
     get_pack_factor,
     gptq_quantize_weights,
sglang/test/test_utils.py CHANGED
@@ -61,6 +61,12 @@ DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = (
 DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = (
     "nvidia/Llama-3.1-8B-Instruct-FP8"
 )
+DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8 = "Qwen/Qwen3-1.7B-FP8"
+DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
+
+# W8A8 models
+DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
+DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
 
 # EAGLE
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
@@ -78,6 +84,7 @@ DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
 DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
+DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST = "Barrrrry/DeepSeek-R1-W4AFP8"
 
 # Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
sglang/utils.py CHANGED
@@ -5,7 +5,6 @@ import json
 import logging
 import os
 import random
-import signal
 import socket
 import subprocess
 import sys
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.5.0rc1"
+__version__ = "0.5.1"
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.0rc1
+Version: 0.5.1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -232,7 +232,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: msgspec; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: openai==1.99.1; extra == "runtime-common"
-Requires-Dist: openai-harmony==0.0.3; extra == "runtime-common"
+Requires-Dist: openai-harmony==0.0.4; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: outlines==0.1.11; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
@@ -240,9 +240,9 @@ Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
+Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: pynvml; extra == "runtime-common"
-Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: sentencepiece; extra == "runtime-common"
@@ -251,18 +251,18 @@ Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: tiktoken; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.55.0; extra == "runtime-common"
+Requires-Dist: transformers==4.55.2; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.22; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.4.post1; extra == "srt"
+Requires-Dist: sgl-kernel==0.3.5; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.11.post1; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.11.post3; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -270,7 +270,7 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.11.post1; extra == "blackwell"
+Requires-Dist: flashinfer_python==0.2.11.post3; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -278,13 +278,12 @@ Requires-Dist: petit_kernel==0.0.2; extra == "srt-hip"
 Requires-Dist: wave-lang==1.0.1; extra == "srt-hip"
 Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
-Requires-Dist: einops; extra == "srt-cpu"
+Provides-Extra: srt-npu
+Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Provides-Extra: srt-npu
-Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai==1.99.1; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -375,6 +374,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking. [Register here](https://lu.ma/gbfhjvuo).
 - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).