PyPI - sglang - Versions diffs - 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl - Mend

sglang 0.5.3rc2py3-none-any.whl → 0.5.4.post1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (419) hide show

sglang/bench_one_batch.py +47 -28
sglang/bench_one_batch_server.py +41 -25
sglang/bench_serving.py +378 -160
sglang/check_env.py +1 -1
sglang/compile_deep_gemm.py +6 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +10 -15
sglang/profiler.py +18 -1
sglang/srt/_custom_ops.py +1 -1
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
sglang/srt/compilation/backend.py +437 -0
sglang/srt/compilation/compilation_config.py +20 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +503 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/__init__.py +4 -0
sglang/srt/configs/deepseek_ocr.py +262 -0
sglang/srt/configs/deepseekvl2.py +194 -96
sglang/srt/configs/dots_vlm.py +2 -7
sglang/srt/configs/falcon_h1.py +13 -64
sglang/srt/configs/load_config.py +25 -2
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +136 -25
sglang/srt/configs/modelopt_config.py +30 -0
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/olmo3.py +105 -0
sglang/srt/configs/points_v15_chat.py +29 -0
sglang/srt/configs/qwen3_next.py +11 -47
sglang/srt/configs/qwen3_omni.py +613 -0
sglang/srt/configs/qwen3_vl.py +0 -10
sglang/srt/connector/remote_instance.py +1 -1
sglang/srt/constrained/base_grammar_backend.py +5 -1
sglang/srt/constrained/llguidance_backend.py +5 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
sglang/srt/constrained/utils.py +12 -0
sglang/srt/constrained/xgrammar_backend.py +20 -11
sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
sglang/srt/disaggregation/base/conn.py +17 -4
sglang/srt/disaggregation/common/conn.py +4 -2
sglang/srt/disaggregation/decode.py +123 -31
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
sglang/srt/disaggregation/fake/conn.py +11 -3
sglang/srt/disaggregation/mooncake/conn.py +157 -19
sglang/srt/disaggregation/nixl/conn.py +69 -24
sglang/srt/disaggregation/prefill.py +96 -270
sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
sglang/srt/distributed/device_communicators/pynccl.py +24 -12
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
sglang/srt/distributed/naive_distributed.py +5 -4
sglang/srt/distributed/parallel_state.py +63 -19
sglang/srt/elastic_ep/elastic_ep.py +74 -0
sglang/srt/entrypoints/context.py +3 -2
sglang/srt/entrypoints/engine.py +83 -80
sglang/srt/entrypoints/grpc_server.py +430 -234
sglang/srt/entrypoints/harmony_utils.py +2 -2
sglang/srt/entrypoints/http_server.py +195 -102
sglang/srt/entrypoints/http_server_engine.py +1 -7
sglang/srt/entrypoints/openai/protocol.py +225 -37
sglang/srt/entrypoints/openai/serving_base.py +49 -2
sglang/srt/entrypoints/openai/serving_chat.py +29 -74
sglang/srt/entrypoints/openai/serving_classify.py +204 -0
sglang/srt/entrypoints/openai/serving_completions.py +15 -1
sglang/srt/entrypoints/openai/serving_responses.py +5 -2
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +58 -6
sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
sglang/srt/eplb/expert_distribution.py +33 -4
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/function_call_parser.py +20 -14
sglang/srt/function_call/glm4_moe_detector.py +1 -5
sglang/srt/function_call/gpt_oss_detector.py +1 -1
sglang/srt/function_call/json_array_parser.py +0 -2
sglang/srt/function_call/minimax_m2.py +367 -0
sglang/srt/function_call/utils.py +2 -2
sglang/srt/grpc/compile_proto.py +3 -3
sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
sglang/srt/grpc/health_servicer.py +189 -0
sglang/srt/grpc/scheduler_launcher.py +181 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
sglang/srt/layers/activation.py +10 -1
sglang/srt/layers/attention/aiter_backend.py +3 -3
sglang/srt/layers/attention/ascend_backend.py +17 -1
sglang/srt/layers/attention/attention_registry.py +43 -23
sglang/srt/layers/attention/base_attn_backend.py +20 -1
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/fla/chunk.py +0 -1
sglang/srt/layers/attention/fla/chunk_o.py +1 -1
sglang/srt/layers/attention/fla/index.py +0 -2
sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/fla/wy_fast.py +0 -2
sglang/srt/layers/attention/flashattention_backend.py +24 -10
sglang/srt/layers/attention/flashinfer_backend.py +258 -22
sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
sglang/srt/layers/attention/flashmla_backend.py +2 -2
sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
sglang/srt/layers/attention/intel_amx_backend.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
sglang/srt/layers/attention/mamba/mamba.py +189 -241
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/nsa/utils.py +0 -1
sglang/srt/layers/attention/nsa_backend.py +404 -90
sglang/srt/layers/attention/triton_backend.py +208 -34
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
sglang/srt/layers/attention/utils.py +89 -7
sglang/srt/layers/attention/vision.py +3 -3
sglang/srt/layers/attention/xpu_backend.py +1028 -0
sglang/srt/layers/communicator.py +12 -7
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
sglang/srt/layers/dp_attention.py +17 -0
sglang/srt/layers/layernorm.py +64 -19
sglang/srt/layers/linear.py +9 -1
sglang/srt/layers/logits_processor.py +152 -17
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/cutlass_moe.py +0 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
sglang/srt/layers/moe/ep_moe/layer.py +154 -625
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
sglang/srt/layers/moe/moe_runner/runner.py +6 -0
sglang/srt/layers/moe/moe_runner/triton.py +3 -1
sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
sglang/srt/layers/moe/topk.py +7 -6
sglang/srt/layers/moe/utils.py +20 -5
sglang/srt/layers/quantization/__init__.py +5 -58
sglang/srt/layers/quantization/awq.py +183 -9
sglang/srt/layers/quantization/awq_triton.py +29 -0
sglang/srt/layers/quantization/base_config.py +27 -1
sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
sglang/srt/layers/quantization/fp8.py +152 -81
sglang/srt/layers/quantization/fp8_kernel.py +55 -10
sglang/srt/layers/quantization/fp8_utils.py +42 -14
sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
sglang/srt/layers/quantization/gguf.py +566 -0
sglang/srt/layers/quantization/gptq.py +0 -1
sglang/srt/layers/quantization/int8_kernel.py +18 -2
sglang/srt/layers/quantization/marlin_utils.py +12 -0
sglang/srt/layers/quantization/modelopt_quant.py +125 -100
sglang/srt/layers/quantization/mxfp4.py +35 -68
sglang/srt/layers/quantization/petit.py +1 -1
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
sglang/srt/layers/quantization/unquant.py +23 -48
sglang/srt/layers/quantization/utils.py +0 -1
sglang/srt/layers/quantization/w4afp8.py +87 -20
sglang/srt/layers/quantization/w8a8_int8.py +30 -24
sglang/srt/layers/radix_attention.py +62 -9
sglang/srt/layers/rotary_embedding.py +686 -17
sglang/srt/layers/sampler.py +47 -16
sglang/srt/layers/sparse_pooler.py +98 -0
sglang/srt/layers/utils.py +0 -1
sglang/srt/layers/vocab_parallel_embedding.py +4 -1
sglang/srt/lora/backend/triton_backend.py +0 -1
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora_manager.py +24 -9
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +40 -16
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
sglang/srt/managers/cache_controller.py +48 -17
sglang/srt/managers/data_parallel_controller.py +146 -42
sglang/srt/managers/detokenizer_manager.py +40 -13
sglang/srt/managers/io_struct.py +69 -16
sglang/srt/managers/mm_utils.py +20 -18
sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
sglang/srt/managers/overlap_utils.py +96 -19
sglang/srt/managers/schedule_batch.py +241 -511
sglang/srt/managers/schedule_policy.py +15 -2
sglang/srt/managers/scheduler.py +420 -514
sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
sglang/srt/managers/scheduler_pp_mixin.py +341 -0
sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
sglang/srt/managers/tokenizer_manager.py +375 -95
sglang/srt/managers/tp_worker.py +212 -161
sglang/srt/managers/utils.py +78 -2
sglang/srt/mem_cache/allocator.py +7 -2
sglang/srt/mem_cache/allocator_ascend.py +2 -2
sglang/srt/mem_cache/base_prefix_cache.py +2 -2
sglang/srt/mem_cache/chunk_cache.py +13 -2
sglang/srt/mem_cache/common.py +480 -0
sglang/srt/mem_cache/evict_policy.py +16 -1
sglang/srt/mem_cache/hicache_storage.py +11 -2
sglang/srt/mem_cache/hiradix_cache.py +16 -3
sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
sglang/srt/mem_cache/memory_pool.py +517 -219
sglang/srt/mem_cache/memory_pool_host.py +0 -1
sglang/srt/mem_cache/multimodal_cache.py +0 -1
sglang/srt/mem_cache/radix_cache.py +53 -19
sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
sglang/srt/mem_cache/storage/backend_factory.py +2 -2
sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
sglang/srt/mem_cache/swa_radix_cache.py +92 -26
sglang/srt/metrics/collector.py +31 -0
sglang/srt/metrics/func_timer.py +1 -1
sglang/srt/model_executor/cuda_graph_runner.py +43 -5
sglang/srt/model_executor/forward_batch_info.py +71 -25
sglang/srt/model_executor/model_runner.py +362 -270
sglang/srt/model_executor/npu_graph_runner.py +2 -3
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
sglang/srt/model_loader/__init__.py +1 -1
sglang/srt/model_loader/loader.py +424 -27
sglang/srt/model_loader/utils.py +0 -1
sglang/srt/model_loader/weight_utils.py +47 -28
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +13 -52
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/bert.py +1 -1
sglang/srt/models/deepseek_nextn.py +19 -3
sglang/srt/models/deepseek_ocr.py +1516 -0
sglang/srt/models/deepseek_v2.py +418 -140
sglang/srt/models/dots_ocr.py +0 -2
sglang/srt/models/dots_vlm.py +0 -1
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +13 -19
sglang/srt/models/gemma3_mm.py +16 -0
sglang/srt/models/gemma3n_mm.py +1 -2
sglang/srt/models/glm4_moe.py +327 -382
sglang/srt/models/glm4_moe_nextn.py +6 -16
sglang/srt/models/glm4v.py +2 -1
sglang/srt/models/glm4v_moe.py +32 -199
sglang/srt/models/gpt_oss.py +5 -5
sglang/srt/models/grok.py +10 -23
sglang/srt/models/hunyuan.py +2 -7
sglang/srt/models/interns1.py +0 -1
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +3 -1
sglang/srt/models/llama.py +2 -2
sglang/srt/models/llama_eagle3.py +1 -1
sglang/srt/models/longcat_flash.py +5 -22
sglang/srt/models/longcat_flash_nextn.py +3 -14
sglang/srt/models/mimo.py +2 -13
sglang/srt/models/mimo_mtp.py +1 -2
sglang/srt/models/minicpmo.py +7 -5
sglang/srt/models/minimax_m2.py +922 -0
sglang/srt/models/mixtral.py +1 -4
sglang/srt/models/mllama.py +1 -1
sglang/srt/models/mllama4.py +13 -3
sglang/srt/models/nemotron_h.py +511 -0
sglang/srt/models/nvila.py +355 -0
sglang/srt/models/nvila_lite.py +184 -0
sglang/srt/models/olmo2.py +31 -4
sglang/srt/models/opt.py +5 -5
sglang/srt/models/phi.py +1 -1
sglang/srt/models/phi4mm.py +1 -1
sglang/srt/models/phimoe.py +0 -1
sglang/srt/models/pixtral.py +0 -3
sglang/srt/models/points_v15_chat.py +186 -0
sglang/srt/models/qwen.py +0 -1
sglang/srt/models/qwen2.py +22 -1
sglang/srt/models/qwen2_5_vl.py +3 -3
sglang/srt/models/qwen2_audio.py +2 -15
sglang/srt/models/qwen2_moe.py +15 -12
sglang/srt/models/qwen2_vl.py +5 -2
sglang/srt/models/qwen3.py +34 -4
sglang/srt/models/qwen3_moe.py +19 -37
sglang/srt/models/qwen3_next.py +7 -12
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_omni_moe.py +661 -0
sglang/srt/models/qwen3_vl.py +37 -33
sglang/srt/models/qwen3_vl_moe.py +57 -185
sglang/srt/models/roberta.py +55 -3
sglang/srt/models/sarashina2_vision.py +0 -1
sglang/srt/models/step3_vl.py +3 -5
sglang/srt/models/utils.py +11 -1
sglang/srt/multimodal/processors/base_processor.py +7 -2
sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
sglang/srt/multimodal/processors/dots_vlm.py +0 -1
sglang/srt/multimodal/processors/glm4v.py +2 -6
sglang/srt/multimodal/processors/internvl.py +0 -2
sglang/srt/multimodal/processors/janus_pro.py +0 -1
sglang/srt/multimodal/processors/mllama4.py +0 -8
sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
sglang/srt/multimodal/processors/phi4mm.py +0 -1
sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
sglang/srt/multimodal/processors/qwen_vl.py +75 -16
sglang/srt/multimodal/processors/step3_vl.py +1 -1
sglang/srt/parser/conversation.py +41 -0
sglang/srt/parser/reasoning_parser.py +28 -2
sglang/srt/sampling/custom_logit_processor.py +77 -2
sglang/srt/sampling/sampling_batch_info.py +17 -22
sglang/srt/sampling/sampling_params.py +70 -2
sglang/srt/server_args.py +846 -163
sglang/srt/server_args_config_parser.py +1 -1
sglang/srt/single_batch_overlap.py +36 -31
sglang/srt/speculative/base_spec_worker.py +34 -0
sglang/srt/speculative/draft_utils.py +226 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
sglang/srt/speculative/eagle_info.py +57 -18
sglang/srt/speculative/eagle_info_v2.py +458 -0
sglang/srt/speculative/eagle_utils.py +138 -0
sglang/srt/speculative/eagle_worker.py +83 -280
sglang/srt/speculative/eagle_worker_v2.py +702 -0
sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
sglang/srt/speculative/ngram_worker.py +12 -11
sglang/srt/speculative/spec_info.py +2 -0
sglang/srt/speculative/spec_utils.py +38 -3
sglang/srt/speculative/standalone_worker.py +4 -14
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/two_batch_overlap.py +28 -14
sglang/srt/utils/__init__.py +1 -1
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/utils/common.py +272 -82
sglang/srt/utils/hf_transformers_utils.py +44 -17
sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/utils/profile_merger.py +199 -0
sglang/test/attention/test_flashattn_backend.py +1 -1
sglang/test/attention/test_flashattn_mla_backend.py +0 -1
sglang/test/attention/test_prefix_chunk_info.py +0 -2
sglang/test/attention/test_trtllm_mla_backend.py +221 -53
sglang/test/few_shot_gsm8k_engine.py +2 -4
sglang/test/kit_matched_stop.py +157 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +41 -0
sglang/test/runners.py +2 -0
sglang/test/send_one.py +42 -7
sglang/test/simple_eval_common.py +3 -0
sglang/test/simple_eval_gpqa.py +0 -1
sglang/test/simple_eval_humaneval.py +0 -3
sglang/test/simple_eval_longbench_v2.py +344 -0
sglang/test/test_block_fp8.py +1 -2
sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
sglang/test/test_cutlass_moe.py +1 -2
sglang/test/test_cutlass_w4a8_moe.py +10 -20
sglang/test/test_deterministic.py +463 -107
sglang/test/test_deterministic_utils.py +74 -0
sglang/test/test_disaggregation_utils.py +81 -0
sglang/test/test_marlin_moe.py +0 -1
sglang/test/test_utils.py +85 -20
sglang/version.py +1 -1
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
sglang/srt/models/vila.py +0 -306
sglang/srt/speculative/build_eagle_tree.py +0 -427
sglang/test/test_block_fp8_ep.py +0 -358
/sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0

sglang/srt/layers/moe/ep_moe/layer.py CHANGED Viewed

@@ -1,59 +1,35 @@
 from __future__ import annotations
 import logging
-from contextlib import nullcontext
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, Optional, Union
 import torch
-import triton
-import triton.language as tl
-from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
+from sglang.srt import single_batch_overlap
+from sglang.srt.layers import deep_gemm_wrapper
 from sglang.srt.layers.moe import (
     get_deepep_mode,
     get_moe_a2a_backend,
     get_moe_runner_backend,
     should_use_flashinfer_trtllm_moe,
 )
-from sglang.srt.layers.moe.ep_moe.kernels import (
-    ep_gather,
-    ep_scatter,
-    moe_ep_deepgemm_preprocess,
-    post_reorder_triton_kernel,
-    silu_and_mul_masked_post_quant_fwd,
-    tma_align_input_scale,
-)
 from sglang.srt.layers.moe.fused_moe_triton.layer import FlashInferFusedMoE, FusedMoE
+from sglang.srt.layers.moe.token_dispatcher.deepep import (
+    DeepEPLLCombineInput,
+    DeepEPNormalCombineInput,
+)
 from sglang.srt.layers.moe.topk import TopKOutput
-from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8 import Fp8Config
-from sglang.srt.layers.quantization.fp8_kernel import (
-    is_fp8_fnuz,
-    sglang_per_token_group_quant_fp8,
-)
-from sglang.srt.layers.quantization.modelopt_quant import (
-    CUTEDSL_MOE_NVFP4_DISPATCH,
-    ModelOptNvFp4FusedMoEMethod,
-)
-from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.offloader import get_offloader
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod
 from sglang.srt.single_batch_overlap import DownGemmOverlapArgs
-from sglang.srt.utils import (
-    ceil_div,
-    dispose_tensor,
-    get_bool_env_var,
-    get_int_env_var,
-    is_cuda,
-    is_hip,
-    is_npu,
-)
+from sglang.srt.utils import get_bool_env_var, is_hip, is_npu
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.token_dispatcher import (
-        DeepEPLLOutput,
-        DeepEPNormalOutput,
+        DeepEPLLDispatchOutput,
+        DeepEPNormalDispatchOutput,
         DispatchOutput,
     )
@@ -63,7 +39,7 @@ _is_fp8_fnuz = is_fp8_fnuz()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 if not (_is_npu or _is_hip):
-    from sgl_kernel import silu_and_mul
+    pass
 if _use_aiter:
     from aiter import ActivationType, QuantType
@@ -72,29 +48,14 @@ if _use_aiter:
 logger = logging.getLogger(__name__)
-# TODO(kaixih@nvidia): ideally we should merge this logic into
-# `fill_gateup_input_triton_kernel` to directly generate e8m0 scale.
-@torch.compile
-def _cast_to_e8m0_with_rounding_up(x: torch.Tensor) -> torch.Tensor:
-    temp = x.to(torch.float32).view(torch.int32)
-    exp = torch.bitwise_right_shift(temp, 23)
-    mant = torch.bitwise_and(temp, 0x7FFFFF)
-    is_ru = torch.logical_and(
-        torch.logical_and((mant > 0), (exp != 0xFE)),
-        ~torch.logical_and((exp == 0), (mant <= 0x400000)),
-    )
-    exp = torch.where(is_ru, exp + 1, exp)
-    new_x = exp.to(torch.uint8).view(torch.int)
-    return new_x.transpose(1, 2).contiguous().transpose(1, 2)
-class EPMoE(FusedMoE):
+class DeepEPMoE(FusedMoE):
     """
-    MoE Expert Parallel Impl
+    MoE Expert Parallel Impl based on DeepEP (https://github.com/deepseek-ai/DeepEP/tree/main)
+    Mooncake EP shares the same class, as they expose the same interface.
     """
+    _has_printed = False
     def __init__(
         self,
         num_experts: int,
@@ -108,291 +69,50 @@ class EPMoE(FusedMoE):
         prefix: str = "",
         activation: str = "silu",
         routed_scaling_factor: Optional[float] = None,
-        gemm1_alpha: Optional[float] = None,
-        gemm1_clamp_limit: Optional[float] = None,
-        with_bias: bool = False,
     ):
         super().__init__(
             num_experts=num_experts,
+            top_k=top_k,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
-            num_fused_shared_experts=num_fused_shared_experts,
             layer_id=layer_id,
-            top_k=top_k,
+            num_fused_shared_experts=num_fused_shared_experts,
             params_dtype=params_dtype,
             quant_config=quant_config,
             prefix=prefix,
             activation=activation,
-            # apply_router_weight_on_input=apply_router_weight_on_input,
             routed_scaling_factor=routed_scaling_factor,
-            gemm1_alpha=gemm1_alpha,
-            gemm1_clamp_limit=gemm1_clamp_limit,
-            with_bias=with_bias,
         )
-        self.intermediate_size = intermediate_size
+        if _use_aiter or _is_npu:
+            self.deprecate_flag = False
+        elif deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and isinstance(
+            quant_config, Fp8Config
+        ):
+            self.deprecate_flag = True
+        else:
+            self.deprecate_flag = False
+        if self.deprecate_flag:
+            return
         if isinstance(quant_config, Fp8Config):
             self.use_block_quant = getattr(self.quant_method, "block_quant", False)
-            self.block_shape = (
-                self.quant_method.quant_config.weight_block_size
-                if self.use_block_quant
-                else None
-            )
             self.use_fp8_w8a8 = True
             self.fp8_dtype = torch.float8_e4m3fn
-            self.activation_scheme = quant_config.activation_scheme
-        else:
+            self.use_w4afp8 = False
+        elif isinstance(quant_config, W4AFp8Config):
+            self.use_w4afp8 = True
             self.use_fp8_w8a8 = False
             self.use_block_quant = False
-            self.block_shape = None
-            self.activation_scheme = None
-    def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
-        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8:
-            return self.forward_deepgemm(hidden_states, topk_output)
         else:
-            return super().forward(hidden_states, topk_output)
-    def forward_deepgemm(
-        self,
-        hidden_states: torch.Tensor,
-        topk_output: TopKOutput,
-    ):
-        self.w13_weight_fp8 = (
-            self.w13_weight,
-            (
-                self.w13_weight_scale_inv
-                if self.use_block_quant
-                else self.w13_weight_scale
-            ),
-        )
-        self.w2_weight_fp8 = (
-            self.w2_weight,
-            self.w2_weight_scale_inv if self.use_block_quant else self.w2_weight_scale,
-        )
-        assert self.quant_method is not None
-        assert self.moe_runner_config.activation == "silu"
-        hidden_states_shape = hidden_states.shape
-        hidden_states_dtype = hidden_states.dtype
-        hidden_states_device = hidden_states.device
-        topk_weights, topk_ids, _ = topk_output
-        if not self.use_block_quant:
-            # Convert per-tensor quant to per-block quant by repeating scales for forward_deepgemm
-            scale_block_size = 128
-            w13_weight_scale_n = 2 * (
-                (self.intermediate_size + scale_block_size - 1) // scale_block_size
-            )
-            w13_weight_scale_k = (
-                hidden_states_shape[-1] + scale_block_size - 1
-            ) // scale_block_size
-            w13_weight_scale = (
-                self.w13_weight_scale.unsqueeze(1)
-                .repeat_interleave(w13_weight_scale_n, dim=1)
-                .unsqueeze(2)
-                .repeat_interleave(w13_weight_scale_k, dim=2)
-            )
-            self.w13_weight_fp8 = (
-                self.w13_weight,
-                w13_weight_scale,
-            )
-            w2_weight_scale_n = (
-                hidden_states_shape[-1] + scale_block_size - 1
-            ) // scale_block_size
-            w2_weight_scale_k = (
-                self.intermediate_size + scale_block_size - 1
-            ) // scale_block_size
-            w2_weight_scale = (
-                self.w2_weight_scale.unsqueeze(1)
-                .repeat_interleave(w2_weight_scale_n, dim=1)
-                .unsqueeze(2)
-                .repeat_interleave(w2_weight_scale_k, dim=2)
-            )
-            self.w2_weight_fp8 = (
-                self.w2_weight,
-                w2_weight_scale,
-            )
-        # PreReorder
-        m_max, masked_m, expected_m, src2dst, gateup_input, gateup_input_scale = (
-            moe_ep_deepgemm_preprocess(
-                topk_ids,
-                self.num_experts,
-                hidden_states,
-                self.top_k,
-                self.start_expert_id,
-                self.end_expert_id,
-                self.block_shape,
-            )
-        )
-        dispose_tensor(hidden_states)
-        if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
-            b, s_mn, s_k = gateup_input_scale.shape
-            assert (
-                s_mn % 4 == 0 and s_k % 4 == 0
-            ), f"scales must be aligned to 4, but got ({b}, {s_mn}, {s_k})"
-        # GroupGemm-0
-        gateup_input_fp8 = (
-            gateup_input,
-            (
-                _cast_to_e8m0_with_rounding_up(gateup_input_scale)
-                if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
-                else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(
-                    gateup_input_scale
-                )
-            ),
-        )
-        num_groups, m, k = gateup_input_fp8[0].size()
-        n = self.w13_weight.size(1)
-        gateup_output = torch.empty(
-            (num_groups, m, n), device=hidden_states_device, dtype=torch.bfloat16
-        )
-        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
-            gateup_input_fp8,
-            self.w13_weight_fp8,
-            gateup_output,
-            masked_m,
-            expected_m,
-        )
-        del gateup_input
-        del gateup_input_fp8
-        # Act
-        down_input = torch.empty(
-            (
-                gateup_output.shape[0],
-                gateup_output.shape[1],
-                gateup_output.shape[2] // 2,
-            ),
-            device=hidden_states_device,
-            dtype=self.fp8_dtype,
-        )
-        scale_block_size = 128
-        down_input_scale = torch.empty(
-            (
-                gateup_output.shape[0],
-                gateup_output.shape[1],
-                gateup_output.shape[2] // 2 // scale_block_size,
-            ),
-            device=hidden_states_device,
-            dtype=torch.float32,
-        )
-        silu_and_mul_masked_post_quant_fwd(
-            gateup_output,
-            down_input,
-            down_input_scale,
-            scale_block_size,
-            masked_m,
-            scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
-        )
-        del gateup_output
-        # GroupGemm-1
-        n = self.w2_weight.size(1)
-        down_input_fp8 = (
-            down_input,
-            (
-                down_input_scale
-                if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
-                else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(down_input_scale)
-            ),
-        )
-        down_output = torch.empty(
-            (num_groups, m, n), device=hidden_states_device, dtype=torch.bfloat16
-        )
-        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
-            down_input_fp8,
-            self.w2_weight_fp8,
-            down_output,
-            masked_m,
-            expected_m,
-        )
-        del down_input
-        del down_input_fp8
-        # PostReorder
-        output = torch.empty(
-            hidden_states_shape, dtype=hidden_states_dtype, device=hidden_states_device
-        )
-        post_reorder_triton_kernel[(hidden_states_shape[0],)](
-            down_output,
-            output,
-            src2dst,
-            topk_ids,
-            topk_weights,
-            self.start_expert_id,
-            self.end_expert_id,
-            self.top_k,
-            hidden_states_shape[1],
-            m_max * self.start_expert_id,
-            BLOCK_SIZE=512,
-        )
-        if self.moe_runner_config.routed_scaling_factor is not None:
-            output *= self.moe_runner_config.routed_scaling_factor
-        return output
-class DeepEPMoE(EPMoE):
-    """
-    MoE Expert Parallel Impl based on DeepEP (https://github.com/deepseek-ai/DeepEP/tree/main)
-    """
-    _has_printed = False
+            self.use_w4afp8 = False
+            self.use_fp8_w8a8 = False
+            self.use_block_quant = False
+            self.use_w4afp8 = False
-    def __init__(
-        self,
-        num_experts: int,
-        top_k: int,
-        hidden_size: int,
-        intermediate_size: int,
-        layer_id: int,
-        num_fused_shared_experts: int = 0,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-        activation: str = "silu",
-        routed_scaling_factor: Optional[float] = None,
-    ):
-        super().__init__(
-            num_experts=num_experts,
-            top_k=top_k,
-            hidden_size=hidden_size,
-            intermediate_size=intermediate_size,
-            layer_id=layer_id,
-            num_fused_shared_experts=num_fused_shared_experts,
-            params_dtype=params_dtype,
-            quant_config=quant_config,
-            prefix=prefix,
-            activation=activation,
-            routed_scaling_factor=routed_scaling_factor,
-        )
         self.deepep_mode = get_deepep_mode()
-        # TODO: move to the beginning of the file
-        from sglang.srt.distributed.parallel_state import get_tp_group
-        from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
-        self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
-            group=get_tp_group().device_group,
-            router_topk=self.top_k,
-            permute_fusion=True,
-            num_experts=self.num_experts,
-            num_local_experts=self.num_local_experts,
-            hidden_size=hidden_size,
-            params_dtype=params_dtype,
-            deepep_mode=self.deepep_mode,
-            async_finish=True,  # TODO
-            return_recv_hook=True,
-        )
         if self.deepep_mode.enable_low_latency() and not _is_npu:
             # NPU supports low_latency deepep without deepgemm
             assert (
@@ -416,7 +136,7 @@ class DeepEPMoE(EPMoE):
                 self.w13_weight,
                 (
                     self.w13_weight_scale_inv
-                    if self.use_block_quant
+                    if self.use_block_quant or self.use_w4afp8
                     else self.w13_weight_scale
                 ),
             )
@@ -424,7 +144,7 @@ class DeepEPMoE(EPMoE):
                 self.w2_weight,
                 (
                     self.w2_weight_scale_inv
-                    if self.use_block_quant
+                    if self.use_block_quant or self.use_w4afp8
                     else self.w2_weight_scale
                 ),
             )
@@ -432,95 +152,113 @@ class DeepEPMoE(EPMoE):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        topk_idx: torch.Tensor,
-        topk_weights: torch.Tensor,
-        forward_batch: ForwardBatch,
+        topk_output: TopKOutput,
+        forward_shared_experts=None,
+        alt_stream=None,
+        disable_sbo=False,
     ):
-        dispatch_output = self.dispatch(
-            hidden_states, topk_idx, topk_weights, forward_batch
-        )
-        hidden_states = self.moe_impl(dispatch_output)
-        hidden_states = self.combine(
-            hidden_states,
-            dispatch_output.topk_idx,
-            dispatch_output.topk_weights,
-            forward_batch,
+        if self.deprecate_flag:
+            assert forward_shared_experts is None
+            assert alt_stream is None
+            return super().forward(
+                hidden_states,
+                topk_output,
+            )
+        # We have to call SBO inside MoE to be compatible with hooks used in offloading
+        return single_batch_overlap.execute_sbo(
+            hidden_states=hidden_states,
+            topk_output=topk_output,
+            # SBO args
+            experts=self,
+            forward_shared_experts=forward_shared_experts,
+            alt_stream=alt_stream,
+            disable_sbo=disable_sbo,
         )
-        return hidden_states
     def dispatch(
         self,
         hidden_states: torch.Tensor,
-        topk_idx: torch.Tensor,
-        topk_weights: torch.Tensor,
-        forward_batch: ForwardBatch,
+        topk_output: TopKOutput,
     ):
-        return self.deepep_dispatcher.dispatch(
+        return self.dispatcher.dispatch(
             hidden_states=hidden_states,
-            topk_idx=topk_idx,
-            topk_weights=topk_weights,
-            forward_batch=forward_batch,
-            input_global_scale=(
-                self.w13_input_scale_quant
-                if isinstance(self.quant_method, ModelOptNvFp4FusedMoEMethod)
-                and self.quant_method.enable_flashinfer_cutedsl_moe
-                and CUTEDSL_MOE_NVFP4_DISPATCH
-                else None
-            ),
+            topk_output=topk_output,
         )
-    def moe_impl(
+    def run_moe_core(
         self,
         dispatch_output: DispatchOutput,
         down_gemm_overlap_args: Optional[DownGemmOverlapArgs] = None,
     ):
+        if self.deprecate_flag:
+            assert down_gemm_overlap_args is None
+            return super().run_moe_core(
+                dispatch_output,
+            )
         from sglang.srt.layers.moe.token_dispatcher import DispatchOutputChecker
         if _use_aiter:
             assert DispatchOutputChecker.format_is_deepep(dispatch_output)
             # in forward_aiter, we skip token permutation and unpermutation, which have been fused inside aiter kernel
-            return self.forward_aiter(dispatch_output)
-        if _is_npu:
+            output = self.forward_aiter(dispatch_output)
+        elif _is_npu:
             assert DispatchOutputChecker.format_is_deepep(dispatch_output)
-            return self.forward_npu(dispatch_output)
-        if DispatchOutputChecker.format_is_deepep_normal(dispatch_output):
-            assert deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8
-            return self.forward_deepgemm_contiguous(dispatch_output)
+            output = self.forward_npu(dispatch_output)
+        elif DispatchOutputChecker.format_is_deepep_normal(dispatch_output):
+            if self.use_w4afp8:
+                output = self.forward_cutlass_w4afp8(dispatch_output)
+            else:
+                assert False, "forward_deepgemm_contiguous is deprecated"
         elif DispatchOutputChecker.format_is_deepep_ll(dispatch_output):
-            if get_moe_runner_backend().is_flashinfer_cutedsl():
-                return self.forward_flashinfer_cutedsl(
+            if (
+                get_moe_runner_backend().is_flashinfer_cutedsl()
+                and self.quant_config.get_name() == "modelopt_fp4"
+            ):
+                output = self.forward_flashinfer_cutedsl(
                     dispatch_output, down_gemm_overlap_args=down_gemm_overlap_args
                 )
-            assert deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8
-            return self.forward_deepgemm_masked(dispatch_output)
-        else:
-            raise ValueError(
-                f"Dispatch output format {dispatch_output.format} is not supported"
-            )
+            elif self.use_w4afp8:
+                output = self.forward_cutlass_w4afp8_masked(dispatch_output)
+            else:
+                assert False, "forward_deepgemm_masked is deprecated"
+        combine_input_wrapper = (
+            DeepEPNormalCombineInput
+            if DispatchOutputChecker.format_is_deepep_normal(dispatch_output)
+            else DeepEPLLCombineInput
+        )
+        return combine_input_wrapper(
+            hidden_states=output,
+            topk_ids=dispatch_output.topk_ids,
+            topk_weights=dispatch_output.topk_weights,
+            overlap_args=down_gemm_overlap_args,
+        )
     def combine(
         self,
         hidden_states: torch.Tensor,
-        topk_idx: torch.Tensor,
+        topk_ids: torch.Tensor,
         topk_weights: torch.Tensor,
-        forward_batch: ForwardBatch,
         overlap_args: Optional[Dict[str, Any]] = None,
     ):
-        return self.deepep_dispatcher.combine(
+        return self.dispatcher.combine(
             hidden_states=hidden_states,
-            topk_idx=topk_idx,
+            topk_ids=topk_ids,
             topk_weights=topk_weights,
-            forward_batch=forward_batch,
             overlap_args=overlap_args,
         )
     def forward_aiter(
         self,
-        dispatch_output: Union[DeepEPNormalOutput, DeepEPLLOutput],
+        dispatch_output: Union[DeepEPNormalDispatchOutput, DeepEPLLDispatchOutput],
     ):
-        hidden_states, topk_idx, topk_weights = (
+        hidden_states, topk_ids, topk_weights = (
             dispatch_output.hidden_states,
-            dispatch_output.topk_idx,
+            dispatch_output.topk_ids,
             dispatch_output.topk_weights,
         )
         if hidden_states.shape[0] == 0:
@@ -528,15 +266,15 @@ class DeepEPMoE(EPMoE):
         # in original deepep, idx == -1 meaning invalid and will not be processed.
         # aiter does not accept -1, we use a expert mask to make these idx invalid
         # (idx == num_local_experts) meaning not used in aiter fused_moe
-        topk_idx_copy = topk_idx.to(torch.int32)
-        topk_idx_copy[topk_idx_copy == -1] = self.num_local_experts
+        topk_ids_copy = topk_ids.to(torch.int32)
+        topk_ids_copy[topk_ids_copy == -1] = self.num_local_experts
         return fused_moe(
             hidden_states,
             self.w13_weight,
             self.w2_weight,
             topk_weights,
-            topk_idx_copy,
+            topk_ids_copy,
             w1_scale=self.w13_weight_scale_inv,
             w2_scale=self.w2_weight_scale_inv,
             quant_type=QuantType.per_128x128,
@@ -548,251 +286,52 @@ class DeepEPMoE(EPMoE):
             expert_mask=self.expert_mask,
         )
-    def forward_deepgemm_contiguous(
-        self,
-        dispatch_output: DeepEPNormalOutput,
-    ):
-        hidden_states_fp8, topk_idx, topk_weights, num_recv_tokens_per_expert = (
-            dispatch_output
-        )
-        hidden_states_fp8, hidden_states_scale = hidden_states_fp8
-        assert self.quant_method is not None
-        assert self.moe_runner_config.activation == "silu"
-        if num_recv_tokens_per_expert is None:
-            return hidden_states_fp8.bfloat16()
-        all_tokens = sum(num_recv_tokens_per_expert)
-        if all_tokens <= 0:
-            return hidden_states_fp8.bfloat16()
-        M, K = hidden_states_fp8.size()
-        N = self.w13_weight.size(1)
-        scale_block_size = 128
-        # TODO also unify other branches (e.g. `EPMoE.forward_deepgemm` sets the field on forward pass)
-        w13_weight_fp8 = (
-            self.w13_weight,
-            (
-                self.w13_weight_scale_inv
-                if self.use_block_quant
-                else self.w13_weight_scale
-            ),
-        )
-        w2_weight_fp8 = (
-            self.w2_weight,
-            (
-                self.w2_weight_scale_inv
-                if self.use_block_quant
-                else self.w2_weight_scale
-            ),
-        )
-        hidden_states_fp8_shape = hidden_states_fp8.shape
-        hidden_states_fp8_device = hidden_states_fp8.device
-        hidden_states_fp8_dtype = hidden_states_fp8.dtype
-        input_tensor = [
-            torch.empty(
-                (all_tokens, K),
-                device=hidden_states_fp8.device,
-                dtype=hidden_states_fp8.dtype,
-            ),
-            (
-                # TODO check whether need `zeros`
-                torch.zeros(
-                    (ceil_div(K // 128, 4), all_tokens),
-                    device=hidden_states_fp8.device,
-                    dtype=torch.int,
-                ).transpose(0, 1)
-                if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
-                else torch.empty(
-                    (all_tokens, K // 128),
-                    device=hidden_states_fp8.device,
-                    dtype=torch.float32,
-                )
-            ),
-        ]
-        m_indices = torch.empty(
-            all_tokens, device=hidden_states_fp8.device, dtype=torch.int32
-        )
-        output_index = torch.empty_like(topk_idx)
-        if get_offloader().forbid_copy_engine_usage:
-            num_recv_tokens_per_expert_gpu = copy_list_to_gpu_no_ce(
-                num_recv_tokens_per_expert
-            )
-        else:
-            num_recv_tokens_per_expert_gpu = torch.tensor(
-                num_recv_tokens_per_expert,
-                dtype=torch.int32,
-                pin_memory=True,
-                device="cpu",
-            ).cuda(non_blocking=True)
-        expert_start_loc = torch.empty_like(num_recv_tokens_per_expert_gpu)
-        ep_scatter(
-            hidden_states_fp8,
-            hidden_states_scale,
-            topk_idx,
-            num_recv_tokens_per_expert_gpu,
-            expert_start_loc,
-            input_tensor[0],
-            input_tensor[1],
-            m_indices,
-            output_index,
-            scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
-        )
-        dispose_tensor(hidden_states_fp8)
-        gateup_output = torch.empty(
-            (all_tokens, N),
-            device=hidden_states_fp8_device,
-            dtype=torch.bfloat16,
-        )
-        if not deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
-            input_tensor[1] = tma_align_input_scale(input_tensor[1])
-        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_contig(
-            input_tensor, w13_weight_fp8, gateup_output, m_indices
-        )
-        del input_tensor
-        down_input = torch.empty(
-            (
-                all_tokens,
-                N // 2,
-            ),
-            device=gateup_output.device,
-            dtype=torch.bfloat16,
-        )
-        silu_and_mul(gateup_output.view(-1, N), down_input)
-        del gateup_output
-        down_output = torch.empty(
-            (all_tokens, K),
-            device=hidden_states_fp8_device,
-            dtype=torch.bfloat16,
-        )
-        down_input_fp8, down_input_scale = sglang_per_token_group_quant_fp8(
-            down_input,
-            scale_block_size,
-            column_major_scales=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
-            scale_tma_aligned=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
-            scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
-        )
-        del down_input
-        if not deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
-            down_input_scale = tma_align_input_scale(down_input_scale)
-        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_contig(
-            (down_input_fp8, down_input_scale),
-            w2_weight_fp8,
-            down_output,
-            m_indices,
-        )
-        del down_input_fp8, down_input_scale
-        gather_out = torch.empty(
-            hidden_states_fp8_shape,
-            device=hidden_states_fp8_device,
-            dtype=torch.bfloat16,
-        )
-        ep_gather(down_output, topk_idx, topk_weights, output_index, gather_out)
-        return gather_out
     def forward_flashinfer_cutedsl(
         self,
-        dispatch_output: DeepEPLLOutput,
+        dispatch_output: DeepEPLLDispatchOutput,
         down_gemm_overlap_args: Optional[DownGemmOverlapArgs],
     ):
-        hidden_states, _, _, masked_m, _ = dispatch_output
+        hidden_states, hidden_states_scale, _, _, masked_m, _ = dispatch_output
         assert self.quant_method is not None
         assert self.moe_runner_config.activation == "silu"
         output = self.quant_method.apply_without_routing_weights(
             layer=self,
-            x=hidden_states,
+            x=(hidden_states, hidden_states_scale),
             masked_m=masked_m,
             moe_runner_config=self.moe_runner_config,
             down_gemm_overlap_args=down_gemm_overlap_args,
         )
         return output
-    def forward_deepgemm_masked(
+    def forward_cutlass_w4afp8(
         self,
-        dispatch_output: DeepEPLLOutput,
+        dispatch_output: DeepEPNormalDispatchOutput,
     ):
-        hidden_states_fp8, _, _, masked_m, expected_m = dispatch_output
-        assert self.quant_method is not None
         assert self.moe_runner_config.activation == "silu"
-        # GroupGemm-0
-        num_groups, m, k = hidden_states_fp8[0].size()
-        n = self.w13_weight.size(1)
-        expected_m = min(expected_m, m)
-        gateup_output = torch.empty(
-            (num_groups, m, n), device=hidden_states_fp8[0].device, dtype=torch.bfloat16
-        )
-        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
-            hidden_states_fp8,
-            self.w13_weight_fp8,
-            gateup_output,
-            masked_m,
-            expected_m,
-        )
-        dispose_tensor(hidden_states_fp8[0])
-        # Act
-        down_input = torch.empty(
-            (
-                gateup_output.shape[0],
-                gateup_output.shape[1],
-                gateup_output.shape[2] // 2,
-            ),
-            device=gateup_output.device,
-            dtype=self.fp8_dtype,
-        )
-        scale_block_size = 128
-        down_input_scale = torch.empty(
-            (
-                gateup_output.shape[0],
-                gateup_output.shape[1],
-                gateup_output.shape[2] // 2 // scale_block_size,
-            ),
-            device=gateup_output.device,
-            dtype=torch.float32,
-        )
-        silu_and_mul_masked_post_quant_fwd(
-            gateup_output,
-            down_input,
-            down_input_scale,
-            scale_block_size,
-            masked_m,
-            scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+        assert isinstance(self.quant_method, W4AFp8MoEMethod)
+        return self.quant_method.apply_deepep_normal(
+            layer=self,
+            dispatch_output=dispatch_output,
         )
-        del gateup_output
-        # GroupGemm-1
-        n = self.w2_weight.size(1)
-        down_input_fp8 = (
-            down_input,
-            (
-                down_input_scale
-                if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
-                else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(down_input_scale)
-            ),
-        )
-        down_output = torch.empty(
-            (num_groups, m, n), device=down_input.device, dtype=torch.bfloat16
-        )
-        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
-            down_input_fp8,
-            self.w2_weight_fp8,
-            down_output,
-            masked_m,
-            expected_m,
+    def forward_cutlass_w4afp8_masked(
+        self,
+        dispatch_output: DeepEPLLDispatchOutput,
+    ):
+        assert self.moe_runner_config.activation == "silu"
+        assert isinstance(self.quant_method, W4AFp8MoEMethod)
+        assert get_bool_env_var(
+            "SGLANG_DEEPEP_BF16_DISPATCH"
+        ), "W4AFP8 does not support FP8 dispatch; please set SGLANG_DEEPEP_BF16_DISPATCH=1."
+        return self.quant_method.apply_deepep_ll(
+            layer=self,
+            dispatch_output=dispatch_output,
         )
-        return down_output
     def forward_npu(
         self,
-        dispatch_output: Union[DeepEPNormalOutput, DeepEPLLOutput],
+        dispatch_output: Union[DeepEPNormalDispatchOutput, DeepEPLLDispatchOutput],
     ):
         assert self.quant_method is not None
         assert self.moe_runner_config.activation == "silu"
@@ -805,14 +344,12 @@ class DeepEPMoE(EPMoE):
         output_dtype = torch.bfloat16
         group_list_type = 1
-        def _forward_normal(dispatch_output: DeepEPNormalOutput):
+        def _forward_normal(dispatch_output: DeepEPNormalDispatchOutput):
             if TYPE_CHECKING:
-                assert isinstance(dispatch_output, DeepEPNormalOutput)
-            hidden_states, _, _, num_recv_tokens_per_expert = dispatch_output
-            if isinstance(hidden_states, tuple):
-                per_token_scale = hidden_states[1]
-                hidden_states = hidden_states[0]
+                assert isinstance(dispatch_output, DeepEPNormalDispatchOutput)
+            hidden_states, hidden_states_scale, _, _, num_recv_tokens_per_expert = (
+                dispatch_output
+            )
             group_list = torch.tensor(num_recv_tokens_per_expert, dtype=torch.int64).to(
                 hidden_states.device
@@ -822,7 +359,7 @@ class DeepEPMoE(EPMoE):
                 hidden_states = torch_npu.npu_grouped_matmul(
                     x=[hidden_states],
                     weight=[self.w13_weight.permute(0, 2, 1)],
-                    # per_token_scale=[per_token_scale],
+                    # per_token_scale=[hidden_states_scale],
                     split_item=2,
                     group_list_type=group_list_type,
                     group_type=0,
@@ -842,7 +379,7 @@ class DeepEPMoE(EPMoE):
                 )[0]
             else:
                 if not get_bool_env_var("DEEP_NORMAL_MODE_USE_INT8_QUANT"):
-                    hidden_states, per_token_scale = torch_npu.npu_dynamic_quant(
+                    hidden_states, hidden_states_scale = torch_npu.npu_dynamic_quant(
                         hidden_states
                     )
                 # gmm1: gate_up_proj
@@ -850,7 +387,7 @@ class DeepEPMoE(EPMoE):
                     x=[hidden_states],
                     weight=[self.w13_weight],
                     scale=[self.w13_weight_scale.to(output_dtype)],
-                    per_token_scale=[per_token_scale],
+                    per_token_scale=[hidden_states_scale],
                     split_item=2,
                     group_list_type=group_list_type,
                     group_type=0,
@@ -879,14 +416,17 @@ class DeepEPMoE(EPMoE):
             return hidden_states
-        def _forward_ll(dispatch_output: DeepEPLLOutput):
+        def _forward_ll(dispatch_output: DeepEPLLDispatchOutput):
             if TYPE_CHECKING:
-                assert isinstance(dispatch_output, DeepEPLLOutput)
-            hidden_states, topk_idx, topk_weights, group_list, _ = dispatch_output
-            if isinstance(hidden_states, tuple):
-                per_token_scale = hidden_states[1]
-                hidden_states = hidden_states[0]
+                assert isinstance(dispatch_output, DeepEPLLDispatchOutput)
+            (
+                hidden_states,
+                hidden_states_scale,
+                topk_ids,
+                topk_weights,
+                group_list,
+                _,
+            ) = dispatch_output
             group_list = group_list.to(torch.int64)
@@ -895,7 +435,7 @@ class DeepEPMoE(EPMoE):
                 hidden_states = torch_npu.npu_grouped_matmul(
                     x=[hidden_states],
                     weight=[self.w13_weight.permute(0, 2, 1)],
-                    # per_token_scale=[per_token_scale],
+                    # per_token_scale=[hidden_states_scale],
                     split_item=2,
                     group_list_type=group_list_type,
                     group_type=0,
@@ -929,7 +469,7 @@ class DeepEPMoE(EPMoE):
                 hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
                     x=hidden_states,
                     weight_scale=self.w13_weight_scale.to(torch.float32),
-                    activation_scale=per_token_scale,
+                    activation_scale=hidden_states_scale,
                     bias=None,
                     quant_scale=None,
                     quant_offset=None,
@@ -962,7 +502,7 @@ class DeepEPMoE(EPMoE):
 def get_moe_impl_class(quant_config: Optional[QuantizationConfig]):
-    if get_moe_a2a_backend().is_deepep():
+    if get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake():
         return DeepEPMoE
     # NEW: Direct FP4 detection (bypasses EP requirements)
@@ -988,15 +528,4 @@ def get_moe_impl_class(quant_config: Optional[QuantizationConfig]):
         return FlashInferFusedMoE
     if get_moe_runner_backend().is_flashinfer_cutlass():
         return FusedMoE
-    if get_moe_expert_parallel_world_size() > 1:
-        return EPMoE
     return FusedMoE
-def copy_list_to_gpu_no_ce(arr: List[int]):
-    from sgl_kernel.elementwise import copy_to_gpu_no_ce
-    tensor_cpu = torch.tensor(arr, dtype=torch.int32, device="cpu")
-    tensor_gpu = torch.empty_like(tensor_cpu, device="cuda")
-    copy_to_gpu_no_ce(tensor_cpu, tensor_gpu)
-    return tensor_gpu

sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

sglang 0.5.3rc2py3-none-any.whl → 0.5.4.post1py3-none-any.whl