sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py

@@ -11,6 +11,7 @@ from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
     ENABLE_JIT_DEEPGEMM,
 )
 from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import get_bool_env_var
 
 logger = logging.getLogger(__name__)
 
@@ -18,6 +19,8 @@ if ENABLE_JIT_DEEPGEMM:
     import deep_gemm
     from deep_gemm.utils.layout import get_mn_major_tma_aligned_tensor
 
+_SANITY_CHECK = get_bool_env_var("SGLANG_DEEPGEMM_SANITY_CHECK")
+
 
 # TODO maybe rename these functions
 def grouped_gemm_nt_f8f8bf16_masked(
@@ -31,6 +34,9 @@ def grouped_gemm_nt_f8f8bf16_masked(
     _, n, _ = rhs[0].shape
     kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
 
+    _sanity_check_input(lhs)
+    _sanity_check_input(rhs)
+
     with compile_utils.deep_gemm_execution_hook(
         expected_m, n, k, num_groups, kernel_type
     ):
@@ -53,6 +59,9 @@ def grouped_gemm_nt_f8f8bf16_contig(
     num_groups, n, _ = rhs[0].shape
     kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
 
+    _sanity_check_input(lhs)
+    _sanity_check_input(rhs)
+
     with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
         deep_gemm.m_grouped_fp8_gemm_nt_contiguous(lhs, rhs, out, m_indices)
 
@@ -67,6 +76,9 @@ def gemm_nt_f8f8bf16(
     num_groups = 1
     kernel_type = compile_utils.DeepGemmKernelType.GEMM_NT_F8F8BF16
 
+    _sanity_check_input(lhs)
+    _sanity_check_input(rhs)
+
     with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
         deep_gemm.fp8_gemm_nt(
             lhs,
@@ -90,3 +102,18 @@ def configure_deep_gemm_num_sms(num_sms):
         yield
     finally:
         deep_gemm.set_num_sms(original_num_sms)
+
+
+def _sanity_check_input(x_fp8: Tuple[torch.Tensor, torch.Tensor]):
+    if not _SANITY_CHECK:
+        return
+
+    x, x_scale = x_fp8
+
+    if x_scale.dtype == torch.int:
+        return
+
+    from sglang.srt.layers.quantization.fp8_utils import ceil_to_ue8m0
+
+    x_scale_ceil = ceil_to_ue8m0(x_scale)
+    assert torch.all(x_scale == x_scale_ceil), f"{x_scale=} {x_scale_ceil=}"
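The new `_sanity_check_input` helper, gated by the `SGLANG_DEEPGEMM_SANITY_CHECK` environment variable, asserts that every FP8 scale factor handed to a DeepGEMM kernel is already a power of two. Below is a minimal standalone sketch of that invariant; the `ceil_to_ue8m0` here is a local stand-in that rounds each scale up to the nearest power of two (the exponent-only UE8M0 format), not the sglang implementation.

```python
import torch


def ceil_to_ue8m0(x: torch.Tensor) -> torch.Tensor:
    # Round every scale up to the nearest power of two (UE8M0 keeps only an 8-bit exponent).
    return torch.exp2(torch.ceil(torch.log2(x.abs())))


def sanity_check_scales(x_scale: torch.Tensor) -> None:
    # A scale tensor is UE8M0-compatible iff rounding up to a power of two is a no-op.
    assert torch.all(x_scale == ceil_to_ue8m0(x_scale)), "scales are not pure powers of two"


# Powers of two pass; arbitrary floats trip the assertion.
sanity_check_scales(torch.tensor([0.5, 1.0, 2.0, 4.0]))
try:
    sanity_check_scales(torch.tensor([0.3, 1.5]))
except AssertionError as e:
    print("caught:", e)
```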
sglang/srt/layers/quantization/fp8.py

@@ -30,6 +30,9 @@ except ImportError:
 
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.moe.token_dispatcher.base import DispatchOutputChecker
 from sglang.srt.layers.parameter import (
     BlockQuantScaleParameter,
     ModelWeightParameter,
@@ -81,7 +84,11 @@ from sglang.srt.utils import (
 )
 
 if TYPE_CHECKING:
-    from sglang.srt.layers.moe.
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        DispatchOutput,
+        StandardDispatchOutput,
+    )
     from sglang.srt.layers.moe.topk import TopKOutput
     from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
 
@@ -345,11 +352,14 @@ class Fp8LinearMethod(LinearMethodBase):
                     _is_cpu_amx_available
                 ), "Fp8LinearMethod on CPU requires that CPU has AMX support"
                 _amx_process_weight_after_loading(layer, ["weight"])
+                layer.weight_scale_inv = torch.nn.Parameter(
+                    layer.weight_scale_inv.data, requires_grad=False
+                )
                 return
             else:
                 weight, weight_scale = layer.weight.data, layer.weight_scale_inv.data
-                layer.weight =
-                layer.weight_scale_inv =
+                layer.weight.data = weight.data
+                layer.weight_scale_inv.data = weight_scale.data
         else:
             layer.weight = Parameter(layer.weight.data, requires_grad=False)
 
@@ -527,7 +537,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         layer: Module,
         num_experts: int,
         hidden_size: int,
-
+        intermediate_size_per_partition: int,
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
@@ -543,18 +553,18 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            )
            # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
            # Required by column parallel or enabling merged weights
-           if
+           if intermediate_size_per_partition % block_n != 0:
                raise ValueError(
                    f"The output_size of gate's and up's weight = "
-                   f"{
+                   f"{intermediate_size_per_partition} is not divisible by "
                    f"weight quantization block_n = {block_n}."
                )
            if tp_size > 1:
                # Required by row parallel
-               if
+               if intermediate_size_per_partition % block_k != 0:
                    raise ValueError(
                        f"The input_size of down's weight = "
-                       f"{
+                       f"{intermediate_size_per_partition} is not divisible by "
                        f"weight quantization block_k = {block_k}."
                    )
 
@@ -564,7 +574,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            w13_weight = torch.nn.Parameter(
                torch.empty(
                    num_experts,
-                   2 *
+                   2 * intermediate_size_per_partition,
                    hidden_size // 8,
                    dtype=params_dtype,
                ),
@@ -572,20 +582,29 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            )
            w2_weight = torch.nn.Parameter(
                torch.empty(
-                   num_experts,
+                   num_experts,
+                   hidden_size,
+                   intermediate_size_per_partition // 8,
+                   dtype=params_dtype,
                ),
                requires_grad=False,
            )
        else:
            w13_weight = torch.nn.Parameter(
                torch.empty(
-                   num_experts,
+                   num_experts,
+                   2 * intermediate_size_per_partition,
+                   hidden_size,
+                   dtype=params_dtype,
                ),
                requires_grad=False,
            )
            w2_weight = torch.nn.Parameter(
                torch.empty(
-                   num_experts,
+                   num_experts,
+                   hidden_size,
+                   intermediate_size_per_partition,
+                   dtype=params_dtype,
                ),
                requires_grad=False,
            )
@@ -601,7 +620,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            w13_weight_scale = torch.nn.Parameter(
                torch.ones(
                    num_experts,
-                   2 * ((
+                   2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
                    (hidden_size + block_k - 1) // block_k,
                    dtype=torch.float32,
                ),
@@ -611,7 +630,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                torch.ones(
                    num_experts,
                    (hidden_size + block_n - 1) // block_n,
-                   (
+                   (intermediate_size_per_partition + block_k - 1) // block_k,
                    dtype=torch.float32,
                ),
                requires_grad=False,
@@ -619,11 +638,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
            layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
            assert self.quant_config.activation_scheme == "dynamic"
-           if
-               get_bool_env_var("SGLANG_CUTLASS_MOE")
-               and self.cutlass_fp8_supported
-               and (is_sm100_supported() or is_sm90_supported())
-           ):
+           if self.use_cutlass_fused_experts_fp8:
                self.ab_strides1 = torch.full(
                    (num_experts,),
                    hidden_size,
@@ -632,13 +647,13 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                )
                self.c_strides1 = torch.full(
                    (num_experts,),
-                   2 *
+                   2 * intermediate_size_per_partition,
                    device=w13_weight.device,
                    dtype=torch.int64,
                )
                self.ab_strides2 = torch.full(
                    (num_experts,),
-
+                   intermediate_size_per_partition,
                    device=w2_weight.device,
                    dtype=torch.int64,
                )
@@ -691,7 +706,11 @@ class Fp8MoEMethod(FusedMoEMethodBase):
        if _is_hip:  # _use_aiter: TODO: add check back after triton kernel
            # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
            w13_weight_scale1 = torch.nn.Parameter(
-               torch.ones(
+               torch.ones(
+                   num_experts,
+                   2 * intermediate_size_per_partition,
+                   dtype=torch.float32,
+               ),
                requires_grad=False,
            )
            w2_weight_scale1 = torch.nn.Parameter(
@@ -984,14 +1003,23 @@ class Fp8MoEMethod(FusedMoEMethodBase):
        )
        torch.cuda.empty_cache()
 
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
     def apply(
         self,
         layer: torch.nn.Module,
-
-
-
-
-
+        dispatch_output: DispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+        moe_runner_config = self.moe_runner_config
 
         if use_intel_amx_backend(layer):
             from sglang.srt.layers.moe.topk import apply_topk_weights_cpu
@@ -1001,7 +1029,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 moe_runner_config.apply_router_weight_on_input, topk_weights, x
             )
 
-
+            output = torch.ops.sgl_kernel.fused_experts_cpu(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
@@ -1017,6 +1045,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 None,  # a2_scale
                 True,  # is_vnni
             )
+            return StandardCombineInput(hidden_states=output)
 
         if _is_hip:
             ret = self.maybe_apply_hip_fused_experts(
@@ -1027,7 +1056,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 moe_runner_config.no_combine,
             )
             if ret is not None:
-                return ret
+                return StandardCombineInput(hidden_states=ret)
 
         if self.use_cutlass_fused_experts_fp8:
             from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
@@ -1056,17 +1085,13 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 self.problem_sizes2,
                 use_fp8_blockscale=True,
             )
-
-
-
-
-
-            layer.w13_weight,
-            layer.w2_weight,
-            topk_output=topk_output,
-            moe_runner_config=moe_runner_config,
+            return StandardCombineInput(hidden_states=output)
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_weight,
+            w2_weight=layer.w2_weight,
             use_fp8_w8a8=True,
-
+            w13_scale=(
                 layer.w13_weight_scale_inv
                 if self.block_quant
                 else layer.w13_weight_scale
@@ -1074,20 +1099,22 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             w2_scale=(
                 layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale
             ),
-
+            a13_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
         )
+        return self.runner.run(dispatch_output, quant_info)
 
     def apply_with_router_logits(
         self,
         layer: torch.nn.Module,
-
-        topk_output: TopKOutput,
-        moe_runner_config: MoeRunnerConfig,
+        dispatch_output: StandardDispatchOutput,
     ) -> torch.Tensor:
-
-
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        activation = self.moe_runner_config.activation
+        routed_scaling_factor = self.moe_runner_config.routed_scaling_factor
 
         from flashinfer.fused_moe import trtllm_fp8_block_scale_moe
 
@@ -1108,10 +1135,12 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             and topk_config.topk_group is not None
         ), "Current trtllm_fp8_block_scale_moe kernel does not support these two arguments as None"
 
-
-
-
-        correction_bias
+        correction_bias = (
+            None
+            if topk_config.correction_bias is None
+            else topk_config.correction_bias.to(x.dtype)
+        )
+
         return trtllm_fp8_block_scale_moe(
             routing_logits=router_logits.to(torch.float32),
             routing_bias=correction_bias,
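As the hunks above show, the updated `Fp8MoEMethod.apply` takes a `DispatchOutput` and returns a `CombineInput`, with the backend runner created once in `create_moe_runner` instead of kernel arguments being threaded through every call. The sketch below is a simplified, self-contained illustration of that control flow only; `DispatchOutput`, `CombineInput`, and `ToyRunner` here are hypothetical stand-ins, not the sglang classes.

```python
from dataclasses import dataclass
import torch


@dataclass
class DispatchOutput:            # stand-in for the dispatcher's output bundle
    hidden_states: torch.Tensor
    topk_output: tuple           # (topk_weights, topk_ids, router_logits)


@dataclass
class CombineInput:              # stand-in for what apply() now hands back for combining
    hidden_states: torch.Tensor


class ToyRunner:
    """Stand-in for a MoeRunner: owns the backend-specific fused-MoE call."""

    def run(self, dispatch_output: DispatchOutput, quant_info: dict) -> CombineInput:
        # A real runner would launch the Triton fused-MoE kernel using quant_info.
        return CombineInput(hidden_states=dispatch_output.hidden_states)


class ToyQuantMethod:
    def create_moe_runner(self, moe_runner_config: dict) -> None:
        # Runner and config are bound once, not passed on every apply() call.
        self.moe_runner_config = moe_runner_config
        self.runner = ToyRunner()

    def apply(self, dispatch_output: DispatchOutput) -> CombineInput:
        quant_info = {"use_fp8_w8a8": True}   # weights/scales would be packed here
        return self.runner.run(dispatch_output, quant_info)


method = ToyQuantMethod()
method.create_moe_runner({"activation": "silu"})
out = method.apply(DispatchOutput(torch.randn(4, 8), topk_output=(None, None, None)))
print(out.hidden_states.shape)
```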
sglang/srt/layers/quantization/fp8_utils.py

@@ -2,6 +2,7 @@ from typing import Callable, List, Optional, Tuple
 
 import torch
 
+from sglang.srt import offloader
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
 from sglang.srt.layers.quantization.mxfp4_tensor import MXFP4QuantizeUtil
@@ -45,7 +46,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _use_aiter:
     import aiter
-    from aiter import gemm_a8w8_blockscale, get_hip_quant
+    from aiter import gemm_a8w8_blockscale, gemm_a8w8_bpreshuffle, get_hip_quant
 
     aiter_per1x128_quant = get_hip_quant(aiter.QuantType.per_1x128)
 
@@ -248,11 +249,6 @@ def deepgemm_w8a8_block_fp8_linear_with_fallback(
         scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
     )
 
-    # NOTE(alcanderian): Useless when scale is packed to int32
-    # if get_bool_env_var("SGLANG_W8A8_DEEPGEMM_SANITY_CHECK_UE8M0"):
-    #     _check_ue8m0("x_scale", x_scale)
-    #     _check_ue8m0("weight_scale", ws)
-
     output = w8a8_block_fp8_matmul_deepgemm(
         q_input, weight, x_scale, weight_scale, block_size, output_dtype=output_dtype
     )
@@ -261,11 +257,6 @@ def deepgemm_w8a8_block_fp8_linear_with_fallback(
     return output.to(dtype=output_dtype).view(*output_shape)
 
 
-def _check_ue8m0(name, x):
-    x_ceil = ceil_to_ue8m0(x)
-    assert torch.all(x == x_ceil), f"{name=} {x=} {x_ceil=}"
-
-
 def aiter_w8a8_block_fp8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
@@ -427,10 +418,14 @@ def block_quant_dequant(
 def requant_weight_ue8m0_inplace(weight, weight_scale_inv, weight_block_size):
     assert isinstance(weight, torch.nn.Parameter)
     assert isinstance(weight_scale_inv, torch.nn.Parameter)
-
-
+
+    new_weight, new_weight_scale_inv = _requant_weight_ue8m0(
+        weight.to(weight_scale_inv.device), weight_scale_inv, weight_block_size
     )
 
+    offloader.update_param(weight, new_weight)
+    weight_scale_inv.data = new_weight_scale_inv
+
 
 def _requant_weight_ue8m0(
     weight: torch.Tensor,
@@ -652,25 +647,49 @@ def apply_fp8_linear(
             use_per_token_if_dynamic
             and not per_tensor_weights
             and not per_tensor_activations
-            and USE_ROWWISE_TORCH_SCALED_MM
+            and (USE_ROWWISE_TORCH_SCALED_MM or _use_aiter)
         ):
-            #
-            #
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # into this sector means use dynamic per-token-per-channel quant
+            # per-token scale quant for input matrix, every row(one token) have one scale factor
+            # per-channel scale quant for weight matrix, every col(one channel) have one scale factor
+            if _use_aiter:
+                # gemm_a8w8_bpreshuffle(XQ, WQ, x_scale, w_scale, dtype)
+                # XQ -> input tensor, shape = (m, k)
+                # WQ -> weight tensor, shape = (n, k), with preshuffe get better perf
+                # x_scale -> input scale tensor, shape = (m, 1)
+                # w_scale -> weight scale tensor, shape = (n ,1)
+                # dtype -> output dtype
+                output = gemm_a8w8_bpreshuffle(
+                    XQ=qinput,
+                    WQ=weight,
+                    x_scale=x_scale,
+                    w_scale=weight_scale,
+                    dtype=input.dtype,
+                )
+                if bias is not None:
+                    output += bias
+                return _process_scaled_mm_output(
+                    output, input_2d.shape, [*input.shape[:-1], weight.shape[0]]
+                )
+            else:
+                # For now validated on ROCm platform
+                # fp8 rowwise scaling in torch._scaled_mm is introduced in
+                # https://github.com/pytorch/pytorch/pull/144432 using hipBLASLt
+                # and ROCm 6.3, which only exists in torch 2.7 and above.
+                # For CUDA platform please validate if the
+                # torch._scaled_mm support rowwise scaled GEMM
+                # Fused GEMM_DQ Rowwise GEMM
+                output = torch._scaled_mm(
+                    qinput,
+                    weight,
+                    out_dtype=input.dtype,
+                    scale_a=x_scale,
+                    scale_b=weight_scale.t(),
+                    bias=bias,
+                )
+                return _process_scaled_mm_output(
+                    output, input_2d.shape, output_shape
+                )
         else:
             # Fallback for channelwise case, where we use unfused DQ
             # due to limitations with scaled_mm
@@ -713,7 +732,7 @@ def apply_fp8_linear(
     # final solution should be: 1. add support to per-tensor activation scaling.
     # 2. solve the torch.compile error from weight_scale.numel() == 1 and x_scale.numel() > 1 (below line#308)
     if _is_hip and weight_scale.numel() == 1:
-        qinput, x_scale =
+        qinput, x_scale = scaled_fp8_quant(
             input_2d,
             input_scale,
             use_per_token_if_dynamic=use_per_token_if_dynamic,
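The new branch in `apply_fp8_linear` routes dynamic per-token (activation) by per-channel (weight) scaling either to aiter's `gemm_a8w8_bpreshuffle` or to rowwise `torch._scaled_mm`. As a reference for what both fused kernels compute, here is a plain-PyTorch dequantize-then-matmul sketch under the shapes stated in the diff comments (x_scale is (m, 1), w_scale is (n, 1)); this is illustrative only, not the fused kernel, and the tensors are ordinary floats standing in for FP8 payloads.

```python
import torch


def rowwise_fp8_gemm_reference(qx, qw, x_scale, w_scale, bias=None, out_dtype=torch.bfloat16):
    """Reference for per-token x per-channel scaled GEMM.

    qx:      (m, k) quantized activations, one scale per row (token)
    qw:      (n, k) quantized weights, one scale per row (output channel)
    x_scale: (m, 1), w_scale: (n, 1)
    """
    # Dequantize each operand with its own scale, then do a normal matmul.
    out = (qx.float() * x_scale) @ (qw.float() * w_scale).t()
    if bias is not None:
        out = out + bias
    return out.to(out_dtype)


m, k, n = 4, 16, 8
qx, qw = torch.randn(m, k), torch.randn(n, k)              # stand-ins for FP8 payloads
x_scale, w_scale = torch.rand(m, 1) + 0.5, torch.rand(n, 1) + 0.5
print(rowwise_fp8_gemm_reference(qx, qw, x_scale, w_scale).shape)  # torch.Size([4, 8])
```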
sglang/srt/layers/quantization/gptq.py

@@ -45,7 +45,10 @@ from sglang.srt.layers.quantization.utils import (
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
-    from sglang.srt.layers.moe.
+    from sglang.srt.layers.moe.token_dispatcher import (
+        StandardDispatchOutput,
+        CombineInput,
+    )
 
 from sglang.srt.utils import is_cuda
 
@@ -838,19 +841,14 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         from sglang.srt.layers.linear import set_weight_attrs
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
 
-
-
-        self.is_k_full = (not self.quant_config.desc_act) or (
-            intermediate_size_per_partition == intermediate_size
-        )
+        self.is_k_full = (not self.quant_config.desc_act) or layer.moe_tp_size == 1
 
         if self.quant_config.group_size != -1:
             scales_size13 = hidden_size // self.quant_config.group_size
-
-
-
-
-            )
+            if self.quant_config.desc_act:
+                w2_scales_size = intermediate_size_per_partition
+            else:
+                w2_scales_size = intermediate_size_per_partition * layer.moe_tp_size
             scales_size2 = w2_scales_size // self.quant_config.group_size
             strategy = FusedMoeWeightScaleSupported.GROUP.value
         else:
@@ -1052,17 +1050,26 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         )
         replace_parameter(layer, "w2_scales", marlin_w2_scales)
 
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
     def apply(
         self,
         layer: torch.nn.Module,
-
-
-
-
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
         # Delay the import to avoid circular dependency
 
         assert (
-            moe_runner_config.activation == "silu"
+            self.moe_runner_config.activation == "silu"
         ), "Only SiLU activation is supported."
 
         # The input must currently be float16
@@ -1071,7 +1078,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
 
         topk_weights, topk_ids, router_logits = topk_output
 
-
+        output = fused_marlin_moe(
             x,
             layer.w13_qweight,
             layer.w2_qweight,
@@ -1087,3 +1094,4 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             num_bits=self.quant_config.weight_bits,
             is_k_full=self.is_k_full,
         ).to(orig_dtype)
+        return StandardCombineInput(hidden_states=output)