sglang 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -7
- sglang/bench_one_batch_server.py +7 -2
- sglang/bench_serving.py +3 -3
- sglang/eval/llama3_eval.py +0 -1
- sglang/srt/configs/model_config.py +25 -9
- sglang/srt/configs/update_config.py +40 -5
- sglang/srt/constrained/xgrammar_backend.py +23 -11
- sglang/srt/conversation.py +2 -15
- sglang/srt/disaggregation/ascend/conn.py +1 -3
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +1 -2
- sglang/srt/disaggregation/launch_lb.py +7 -1
- sglang/srt/disaggregation/mini_lb.py +11 -5
- sglang/srt/disaggregation/mooncake/conn.py +141 -47
- sglang/srt/disaggregation/prefill.py +261 -5
- sglang/srt/disaggregation/utils.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/device_communicators/pynccl.py +68 -18
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
- sglang/srt/distributed/naive_distributed.py +112 -0
- sglang/srt/distributed/parallel_state.py +90 -4
- sglang/srt/entrypoints/context.py +20 -1
- sglang/srt/entrypoints/engine.py +29 -4
- sglang/srt/entrypoints/http_server.py +76 -0
- sglang/srt/entrypoints/openai/protocol.py +4 -2
- sglang/srt/entrypoints/openai/serving_chat.py +23 -6
- sglang/srt/entrypoints/openai/serving_completions.py +10 -1
- sglang/srt/entrypoints/openai/serving_responses.py +2 -2
- sglang/srt/eplb/expert_distribution.py +2 -3
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +24 -0
- sglang/srt/host_shared_memory.py +83 -0
- sglang/srt/layers/attention/ascend_backend.py +132 -22
- sglang/srt/layers/attention/flashattention_backend.py +24 -17
- sglang/srt/layers/attention/flashinfer_backend.py +14 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
- sglang/srt/layers/attention/triton_backend.py +109 -73
- sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
- sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
- sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
- sglang/srt/layers/attention/utils.py +94 -15
- sglang/srt/layers/attention/vision.py +40 -13
- sglang/srt/layers/attention/vision_utils.py +65 -0
- sglang/srt/layers/communicator.py +58 -10
- sglang/srt/layers/dp_attention.py +137 -27
- sglang/srt/layers/elementwise.py +94 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
- sglang/srt/layers/layernorm.py +8 -1
- sglang/srt/layers/linear.py +24 -0
- sglang/srt/layers/logits_processor.py +16 -18
- sglang/srt/layers/moe/__init__.py +31 -0
- sglang/srt/layers/moe/ep_moe/layer.py +37 -33
- sglang/srt/layers/moe/fused_moe_native.py +14 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
- sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
- sglang/srt/layers/moe/moe_runner/base.py +13 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
- sglang/srt/layers/moe/router.py +15 -9
- sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
- sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +167 -83
- sglang/srt/layers/moe/utils.py +159 -18
- sglang/srt/layers/multimodal.py +156 -40
- sglang/srt/layers/quantization/__init__.py +18 -46
- sglang/srt/layers/quantization/awq.py +22 -23
- sglang/srt/layers/quantization/base_config.py +2 -6
- sglang/srt/layers/quantization/blockwise_int8.py +4 -12
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
- sglang/srt/layers/quantization/fp8.py +127 -119
- sglang/srt/layers/quantization/fp8_kernel.py +195 -24
- sglang/srt/layers/quantization/fp8_utils.py +34 -9
- sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
- sglang/srt/layers/quantization/gptq.py +17 -21
- sglang/srt/layers/quantization/marlin_utils.py +26 -8
- sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
- sglang/srt/layers/quantization/modelopt_quant.py +217 -98
- sglang/srt/layers/quantization/moe_wna16.py +10 -15
- sglang/srt/layers/quantization/mxfp4.py +222 -39
- sglang/srt/layers/quantization/quark/quark.py +390 -0
- sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
- sglang/srt/layers/quantization/unquant.py +34 -70
- sglang/srt/layers/quantization/utils.py +77 -2
- sglang/srt/layers/quantization/w4afp8.py +7 -8
- sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
- sglang/srt/layers/quantization/w8a8_int8.py +5 -13
- sglang/srt/layers/radix_attention.py +6 -0
- sglang/srt/layers/rotary_embedding.py +1 -0
- sglang/srt/layers/sampler.py +5 -2
- sglang/srt/lora/layers.py +6 -2
- sglang/srt/lora/lora_manager.py +21 -22
- sglang/srt/lora/lora_registry.py +3 -3
- sglang/srt/lora/mem_pool.py +26 -24
- sglang/srt/lora/utils.py +10 -12
- sglang/srt/managers/cache_controller.py +80 -19
- sglang/srt/managers/detokenizer_manager.py +10 -2
- sglang/srt/managers/io_struct.py +23 -0
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/schedule_batch.py +22 -48
- sglang/srt/managers/scheduler.py +28 -20
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/template_manager.py +7 -5
- sglang/srt/managers/tokenizer_manager.py +88 -39
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/managers/utils.py +59 -1
- sglang/srt/mem_cache/allocator.py +10 -157
- sglang/srt/mem_cache/allocator_ascend.py +147 -0
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +14 -4
- sglang/srt/mem_cache/memory_pool.py +3 -3
- sglang/srt/mem_cache/memory_pool_host.py +35 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
- sglang/srt/model_executor/cuda_graph_runner.py +33 -33
- sglang/srt/model_executor/forward_batch_info.py +11 -10
- sglang/srt/model_executor/model_runner.py +93 -78
- sglang/srt/model_executor/npu_graph_runner.py +94 -0
- sglang/srt/model_loader/loader.py +24 -6
- sglang/srt/models/dbrx.py +12 -6
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +5 -2
- sglang/srt/models/deepseek_v2.py +226 -223
- sglang/srt/models/ernie4.py +2 -2
- sglang/srt/models/glm4_moe.py +27 -65
- sglang/srt/models/glm4_moe_nextn.py +2 -1
- sglang/srt/models/glm4v.py +52 -1
- sglang/srt/models/glm4v_moe.py +8 -11
- sglang/srt/models/gpt_oss.py +41 -76
- sglang/srt/models/granitemoe.py +0 -1
- sglang/srt/models/grok.py +376 -48
- sglang/srt/models/interns1.py +12 -47
- sglang/srt/models/internvl.py +6 -51
- sglang/srt/models/llama.py +10 -2
- sglang/srt/models/llama4.py +18 -7
- sglang/srt/models/minicpm3.py +0 -1
- sglang/srt/models/mixtral.py +0 -2
- sglang/srt/models/nemotron_nas.py +435 -0
- sglang/srt/models/olmoe.py +0 -1
- sglang/srt/models/phi4mm.py +3 -21
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +2 -0
- sglang/srt/models/qwen2_moe.py +23 -23
- sglang/srt/models/qwen3.py +2 -2
- sglang/srt/models/qwen3_classification.py +84 -0
- sglang/srt/models/qwen3_moe.py +27 -43
- sglang/srt/models/step3_vl.py +8 -3
- sglang/srt/models/xverse_moe.py +11 -5
- sglang/srt/multimodal/processors/base_processor.py +3 -3
- sglang/srt/multimodal/processors/internvl.py +7 -2
- sglang/srt/multimodal/processors/llava.py +11 -7
- sglang/srt/offloader.py +433 -0
- sglang/srt/operations.py +22 -2
- sglang/srt/reasoning_parser.py +4 -3
- sglang/srt/sampling/sampling_batch_info.py +7 -4
- sglang/srt/server_args.py +264 -105
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_utils.py +36 -13
- sglang/srt/speculative/eagle_worker.py +56 -3
- sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
- sglang/srt/two_batch_overlap.py +20 -19
- sglang/srt/utils.py +68 -70
- sglang/test/runners.py +8 -5
- sglang/test/test_block_fp8.py +5 -6
- sglang/test/test_block_fp8_ep.py +13 -19
- sglang/test/test_cutlass_moe.py +4 -6
- sglang/test/test_cutlass_w4a8_moe.py +4 -3
- sglang/test/test_fp4_moe.py +4 -3
- sglang/test/test_marlin_moe.py +1 -1
- sglang/test/test_marlin_utils.py +1 -1
- sglang/test/test_utils.py +7 -0
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
- sglang/srt/layers/quantization/fp4.py +0 -557
- sglang/srt/layers/quantization/scalar_type.py +0 -352
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -438,70 +438,6 @@ def is_pin_memory_available() -> bool:
     return torch.cuda.is_available()
 
 
-_CPU_OFFLOAD_BYTES = 0
-_CPU_OFFLOAD_MAX_BYTES = 0
-
-
-def set_cpu_offload_max_bytes(max_bytes: int) -> None:
-    global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
-    _CPU_OFFLOAD_BYTES = 0
-    _CPU_OFFLOAD_MAX_BYTES = max_bytes
-
-
-def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
-    device = next(module.parameters()).device
-
-    if device == torch.device("cpu"):
-        return module
-
-    global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
-    if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
-        return module
-
-    pin_memory = is_pin_memory_available()
-    # offload parameters to CPU
-    # use pin_memory if possible, which helps cudagraph capture speed
-    offloaded_parameters = False
-    for p in module.parameters():
-        if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
-            # we use per-parameter offloading
-            # one module might have some parameters offloaded and some not
-            break
-
-        # `torch.empty_like` does not support `pin_memory` argument
-        cpu_data = torch.empty_strided(
-            size=p.data.size(),
-            stride=p.data.stride(),
-            dtype=p.data.dtype,
-            layout=p.data.layout,
-            device="cpu",
-            pin_memory=pin_memory,
-        )
-        cpu_data.copy_(p.data)
-        p.data = cpu_data
-        _CPU_OFFLOAD_BYTES += p.data.numel() * p.data.element_size()
-        offloaded_parameters = True
-
-    if offloaded_parameters:
-        original_forward = module.forward
-
-        def forward(*args, **kwargs):
-            module.forward = original_forward
-            device_state = {
-                # here we blindly call `to(device)`
-                # if the parameter is already on the device, it will be a no-op
-                k: v.to(device, non_blocking=True)
-                for k, v in module.state_dict().items()
-            }
-            output = functional_call(module, device_state, args=args, kwargs=kwargs)
-            module.forward = forward
-            return output
-
-        module.forward = forward
-
-    return module
-
-
 class LayerFn(Protocol):
 
     def __call__(self, layer_id: int, prefix: str) -> torch.nn.Module: ...
@@ -514,11 +450,13 @@ def make_layers(
     pp_size: Optional[int] = None,
     prefix: str = "",
     return_tuple: bool = False,
+    offloader_kwargs: Dict[str, Any] = {},
 ) -> Tuple[int, int, torch.nn.ModuleList]:
     """Make a list of layers with the given layer function"""
     # circula imports
     from sglang.srt.distributed import get_pp_indices
     from sglang.srt.layers.utils import PPMissingLayer
+    from sglang.srt.offloader import get_offloader
 
     assert not pp_size or num_hidden_layers >= pp_size
     start_layer, end_layer = (
@@ -532,10 +470,13 @@ def make_layers(
     )
     modules = torch.nn.ModuleList(
         [PPMissingLayer(return_tuple=return_tuple) for _ in range(start_layer)]
-        + [
-            maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
-            for idx in range(start_layer, end_layer)
-        ]
+        + get_offloader().wrap_modules(
+            (
+                layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
+                for idx in range(start_layer, end_layer)
+            ),
+            **offloader_kwargs,
+        )
         + [
             PPMissingLayer(return_tuple=return_tuple)
             for _ in range(end_layer, num_hidden_layers)
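Context for this hunk: the per-parameter `maybe_offload_to_cpu` path removed above is replaced by a pluggable offloader (new module `sglang/srt/offloader.py`, +433 lines in the file list). A minimal sketch of the calling convention visible in this hunk; the toy layer factory and the assumption that `wrap_modules` returns a list of (possibly wrapped) modules are ours, not from the diff:

```python
import torch

from sglang.srt.offloader import get_offloader  # new in 0.5.1


# Hypothetical layer factory, standing in for a model's real layer_fn.
def layer_fn(idx: int, prefix: str) -> torch.nn.Module:
    return torch.nn.Linear(16, 16)


# wrap_modules consumes a generator of freshly built layers; extra behavior
# is driven by the new make_layers(..., offloader_kwargs=...) passthrough.
layers = get_offloader().wrap_modules(
    layer_fn(idx=i, prefix=f"layers.{i}") for i in range(4)
)
modules = torch.nn.ModuleList(layers)
```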
@@ -2343,6 +2284,7 @@ def is_fa3_default_architecture(hf_config):
         "Qwen3ForCausalLM",
         "Qwen3MoeForCausalLM",
         "Glm4MoeForCausalLM",
+        "Glm4vMoeForConditionalGeneration",
         "Step3VLForConditionalGeneration",
     }
     return architectures[0] in default_archs
@@ -2413,7 +2355,7 @@ def require_mlp_tp_gather(server_args):
         return True
     elif not server_args.enable_dp_lm_head:
         return True
-    elif server_args.moe_a2a_backend
+    elif server_args.moe_a2a_backend == "none":
         return True
     else:
         return (
@@ -2429,7 +2371,7 @@ def require_attn_tp_gather(server_args):
     Check if the input of attention is scattered.
     """
     assert server_args.moe_dense_tp_size in [1, None]
-    if server_args.moe_a2a_backend
+    if server_args.moe_a2a_backend != "none" or server_args.moe_dense_tp_size == 1:
         if server_args.enable_dp_attention:
             return server_args.dp_size < server_args.tp_size
         else:
@@ -2599,6 +2541,50 @@ def dynamic_import(func_path: str):
     return func
 
 
+def gc_object_counts():
+    import gc
+
+    g0 = len(gc.get_objects(0))
+    g1 = len(gc.get_objects(1))
+    g2 = len(gc.get_objects(2))
+    return g0, g1, g2
+
+
+def configure_gc_warning(warn_threshold_secs):
+    import gc
+
+    gc_start_time = {}
+
+    def gc_callback(phase, info):
+        gen = info.get("generation", "?")
+        if phase == "start":
+            gc_start_time[gen] = time.time()
+        elif phase == "stop":
+            duration = time.time() - gc_start_time.get(gen, time.time())
+            if duration > warn_threshold_secs:
+                g0, g1, g2 = gc_object_counts()
+                logger.warn(
+                    f"LONG GARBAGE COLLECTION DETECTED | Generation {gen} | Duration: {duration:.4f}s | # Objects: gen0={g0}, gen1={g1}, gen2={g2} | "
+                    f"This may cause latency jitter. Consider calling the freeze_gc API after sending a few warmup requests."
+                )
+
+    gc.callbacks.append(gc_callback)
+
+
+def freeze_gc(context: str):
+    import gc
+
+    g0_before, g1_before, g2_before = gc_object_counts()
+    gc.freeze()
+    g0_after, g1_after, g2_after = gc_object_counts()
+    logger.info(
+        f"Freezing GC in {context} process. "
+        f"gen0: {g0_before}->{g0_after}, "
+        f"gen1: {g1_before}->{g1_after}, "
+        f"gen2: {g2_before}->{g2_after}"
+    )
+
+
 def configure_gc_logger():
     logger.info("Enable GC Logger")
 
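A usage sketch for the new GC helpers added in this hunk; the 50 ms threshold and the logging setup are illustrative:

```python
import logging

logging.basicConfig(level=logging.INFO)

from sglang.srt.utils import configure_gc_warning, freeze_gc

# Warn whenever a single GC pass takes longer than 50 ms.
configure_gc_warning(warn_threshold_secs=0.05)

# After warmup, move surviving objects to the permanent generation so
# later collections skip them (this is what the freeze_gc API logs about).
freeze_gc("demo")
```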
@@ -2872,6 +2858,8 @@ SUPPORTED_LORA_TARGET_MODULES = [
     "gate_proj",
     "up_proj",
     "down_proj",
+    "qkv_proj",
+    "gate_up_proj",
 ]
 
 LORA_TARGET_ALL_MODULES = "all"
@@ -2966,3 +2954,13 @@ class ConcurrentCounter:
 @lru_cache(maxsize=1)
 def is_triton_kernels_available() -> bool:
     return importlib.util.find_spec("triton_kernels") is not None
+
+
+def check_cuda_result(raw_output):
+    import cuda.bindings.runtime as cuda_rt
+
+    err, *results = raw_output
+    if err != cuda_rt.cudaError_t.cudaSuccess:
+        raise Exception(f"CUDA error: {err}")
+
+    return results
sglang/test/runners.py
CHANGED
@@ -231,11 +231,14 @@ class HFRunner:
 
         # Load the model and tokenizer
         if self.model_type == "generation":
-            config = AutoConfig.from_pretrained(
-
-
-
+            config = AutoConfig.from_pretrained(
+                model_path, trust_remote_code=self.trust_remote_code
+            )
+            if self.trust_remote_code:
                 model_cls = AutoModelForCausalLM
+            else:
+                model_arch = getattr(config, "architectures")[0]
+                model_cls = getattr(transformers, model_arch)
             self.base_model = model_cls.from_pretrained(
                 model_path,
                 torch_dtype=torch_dtype,
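The new branch resolves the concrete `transformers` class from `config.architectures` instead of always falling back to `AutoModelForCausalLM` when `trust_remote_code` is off. A standalone sketch of that lookup; the model id is illustrative:

```python
import transformers
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model_arch = getattr(config, "architectures")[0]   # e.g. "Qwen2ForCausalLM"
model_cls = getattr(transformers, model_arch)      # the concrete class, not Auto*
print(model_cls.__name__)
```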
@@ -488,7 +491,7 @@ class SRTRunner:
         tp_size: int = 1,
         model_impl: str = "auto",
         port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
-        lora_paths: List[str] = None,
+        lora_paths: Optional[Union[List[str], List[dict[str, str]]]] = None,
         max_loras_per_batch: int = 4,
         attention_backend: Optional[str] = None,
         prefill_attention_backend: Optional[str] = None,
sglang/test/test_block_fp8.py
CHANGED
@@ -6,7 +6,7 @@ import torch
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
-from sglang.srt.layers.moe.topk import select_experts
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
 from sglang.srt.layers.quantization.fp8_kernel import (
     per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
@@ -498,11 +498,13 @@ class TestW8A8BlockFP8FusedMoE(CustomTestCase):
         score = torch.randn((M, E), dtype=dtype)
 
         with torch.inference_mode():
+            ref_out = torch_w8a8_block_fp8_moe(
+                a, w1, w2, w1_s, w2_s, score, topk, block_size
+            )
             topk_output = select_experts(
                 hidden_states=a,
                 router_logits=score,
-                top_k=topk,
-                renormalize=False,
+                topk_config=TopKConfig(top_k=topk, renormalize=False),
             )
             out = fused_moe(
                 a,
@@ -514,9 +516,6 @@ class TestW8A8BlockFP8FusedMoE(CustomTestCase):
                 w2_scale=w2_s,
                 block_shape=block_size,
             )
-            ref_out = torch_w8a8_block_fp8_moe(
-                a, w1, w2, w1_s, w2_s, score, topk, block_size
-            )
 
         self.assertTrue(
             torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
sglang/test/test_block_fp8_ep.py
CHANGED
@@ -12,7 +12,7 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
     run_moe_ep_preproess,
     silu_and_mul_triton_kernel,
 )
-from sglang.srt.layers.moe.topk import select_experts
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
 from sglang.test.test_utils import CustomTestCase
 
 
@@ -22,35 +22,26 @@ def ep_moe(
     w1: torch.Tensor,
     w2: torch.Tensor,
     router_logits: torch.Tensor,
-    top_k: int,
-    renormalize: bool,
+    topk_config: TopKConfig,
     # ep config
     num_experts: int = 256,
     fp8_dtype: torch.types = torch.float8_e4m3fn,
     num_experts_per_partition: int = 128,
     start_expert_id: int = 0,
     end_expert_id: int = 127,
-    use_grouped_topk: bool = False,
-    num_expert_group: Optional[int] = None,
-    topk_group: Optional[int] = None,
-    custom_routing_function: Optional[Callable] = None,
     use_fp8_w8a8: bool = False,
     w1_scale_inv: Optional[torch.Tensor] = None,
     w2_scale_inv: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
 ):
     use_blockwise_fp8 = block_shape is not None
-    topk_weights, topk_ids = select_experts(
+    top_k = topk_config.top_k
+    topk_output = select_experts(
         hidden_states=hidden_states,
         router_logits=router_logits,
-        top_k=top_k,
-        use_grouped_topk=use_grouped_topk,
-        renormalize=renormalize,
-        topk_group=topk_group,
-        num_expert_group=num_expert_group,
-        # correction_bias=correction_bias, #skip this in test
-        custom_routing_function=custom_routing_function,
+        topk_config=topk_config,
     )
+    topk_weights, topk_ids, _ = topk_output
 
     reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(topk_ids, num_experts)
 
@@ -294,14 +285,18 @@ class TestW8A8BlockFP8EPMoE(CustomTestCase):
         start_id = cur_rank * num_experts_per_partition
         end_id = start_id + num_experts_per_partition - 1
 
+        topk_config = TopKConfig(
+            top_k=topk,
+            renormalize=False,
+        )
+
         with torch.inference_mode():
             out = ep_moe(
                 hidden_states=a,
                 w1=w1,
                 w2=w2,
                 router_logits=score,
-                top_k=topk,
-                renormalize=False,
+                topk_config=topk_config,
                 use_fp8_w8a8=True,
                 w1_scale_inv=w1_s,
                 w2_scale_inv=w2_s,
@@ -316,8 +311,7 @@ class TestW8A8BlockFP8EPMoE(CustomTestCase):
                 w1=w1_ref,
                 w2=w2_ref,
                 router_logits=score,
-                top_k=topk,
-                renormalize=False,
+                topk_config=topk_config,
                 use_fp8_w8a8=False,
                 w1_scale_inv=None,
                 w2_scale_inv=None,
sglang/test/test_cutlass_moe.py
CHANGED
@@ -153,9 +153,8 @@ def run_test(tp_size, batch_size, model_config, check=False):
         x,
         w1,
         w2,
-        topk_weights,
-        topk_ids,
-        inplace=False,  # Use False for benchmarking to avoid side effects if run multiple times
+        (topk_weights, topk_ids, "dummy"),
+        inplace=False,
         activation="silu",  # Assuming SiLU activation common in MoEs
         use_fp8_w8a8=True,
         w1_scale=w1_scale,
@@ -221,8 +220,7 @@ def run_test(tp_size, batch_size, model_config, check=False):
         x,
         w1,  # Original shape
         w2,  # Original shape
-        topk_weights,
-        topk_ids,
+        (topk_weights, topk_ids, "dummy"),
         inplace=False,  # Important: Use False to get output tensor
         activation="silu",
         use_fp8_w8a8=True,
@@ -266,7 +264,7 @@ if __name__ == "__main__":
         "--batch-sizes",
         type=int,
         nargs="+",
-        default=[1, 4, 8, 16, 32, 64, 128, 256, 512],  # Adjusted default
+        default=[1, 4, 8, 16, 32, 64, 128, 256, 512, 1024],  # Adjusted default
         help="List of batch sizes to test",
     )
     parser.add_argument("--check", action="store_true", help="Enable check mode")
sglang/test/test_cutlass_w4a8_moe.py
CHANGED
@@ -6,7 +6,7 @@ import pytest
 import torch
 
 from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
-from sglang.srt.layers.moe.topk import select_experts
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
 
 
 def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
@@ -100,11 +100,12 @@ def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
     s_strides2 = c_strides2
 
     score = torch.randn((M, E), dtype=dtype, device=device)
-    topk_weights, topk_ids = select_experts(
+    topk_output = select_experts(
         hidden_states=a,
         router_logits=score,
-        top_k=topk,
+        topk_config=TopKConfig(top_k=topk, renormalize=False),
     )
+    topk_weights, topk_ids, _ = topk_output
     expert_map = torch.arange(E, dtype=torch.int32, device=device)
     expert_map[local_e:] = E
 
sglang/test/test_fp4_moe.py
CHANGED
@@ -9,7 +9,7 @@ from sgl_kernel import scaled_fp4_quant
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
 from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
-from sglang.srt.layers.moe.topk import select_experts
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
 
 if torch.cuda.get_device_capability() < (10, 0):
     pytest.skip(
@@ -163,11 +163,12 @@ def check_moe(
 
     score = torch.randn((m, e), device="cuda", dtype=dtype)
 
-    topk_weights, topk_ids = select_experts(
+    topk_output = select_experts(
         hidden_states=a,
         router_logits=score,
-        top_k=topk,
+        topk_config=TopKConfig(top_k=topk, renormalize=False),
     )
+    topk_weights, topk_ids, _ = topk_output
 
     a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
     a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
sglang/test/test_marlin_moe.py
CHANGED
@@ -4,9 +4,9 @@ from typing import Optional
 import pytest
 import torch
 from sgl_kernel import fused_marlin_moe
+from sgl_kernel.scalar_type import ScalarType, scalar_types
 
 from sglang.srt.layers.activation import SiluAndMul
-from sglang.srt.layers.quantization.scalar_type import ScalarType, scalar_types
 from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize
 
 
sglang/test/test_marlin_utils.py
CHANGED
@@ -10,13 +10,13 @@ from typing import Optional
 
 import numpy as np
 import torch
+from sgl_kernel.scalar_type import ScalarType
 
 from sglang.srt.layers.quantization.marlin_utils import (
     GPTQ_MARLIN_TILE,
     marlin_permute_scales,
     marlin_zero_points,
 )
-from sglang.srt.layers.quantization.scalar_type import ScalarType
 from sglang.srt.layers.quantization.utils import (
     get_pack_factor,
     gptq_quantize_weights,
sglang/test/test_utils.py
CHANGED
@@ -61,6 +61,12 @@ DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = (
 DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = (
     "nvidia/Llama-3.1-8B-Instruct-FP8"
 )
+DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8 = "Qwen/Qwen3-1.7B-FP8"
+DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
+
+# W8A8 models
+DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
+DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
 
 # EAGLE
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
@@ -78,6 +84,7 @@ DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
 DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
+DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST = "Barrrrry/DeepSeek-R1-W4AFP8"
 
 # Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
sglang/utils.py
CHANGED
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.5.0rc1"
+__version__ = "0.5.1"
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.0rc1
+Version: 0.5.1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -232,7 +232,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: msgspec; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: openai==1.99.1; extra == "runtime-common"
-Requires-Dist: openai-harmony==0.0.
+Requires-Dist: openai-harmony==0.0.4; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: outlines==0.1.11; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
@@ -240,9 +240,9 @@ Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
+Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: pynvml; extra == "runtime-common"
-Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: sentencepiece; extra == "runtime-common"
@@ -251,18 +251,18 @@ Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: tiktoken; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.55.
+Requires-Dist: transformers==4.55.2; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.
+Requires-Dist: sgl-kernel==0.3.5; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.11.
+Requires-Dist: flashinfer_python==0.2.11.post3; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -270,7 +270,7 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.11.
+Requires-Dist: flashinfer_python==0.2.11.post3; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -278,13 +278,12 @@ Requires-Dist: petit_kernel==0.0.2; extra == "srt-hip"
 Requires-Dist: wave-lang==1.0.1; extra == "srt-hip"
 Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
-
+Provides-Extra: srt-npu
+Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Provides-Extra: srt-npu
-Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai==1.99.1; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -375,6 +374,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking. [Register here](https://lu.ma/gbfhjvuo).
 - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).