PyPI - sglang - Versions diffs - 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

sglang 0.5.1.post2 (py3-none-any.whl) → 0.5.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (256) hide show

sglang/bench_one_batch.py +3 -0
sglang/bench_one_batch_server.py +89 -54
sglang/bench_serving.py +437 -40
sglang/lang/interpreter.py +1 -1
sglang/profiler.py +0 -1
sglang/srt/configs/__init__.py +4 -0
sglang/srt/configs/internvl.py +6 -0
sglang/srt/configs/longcat_flash.py +104 -0
sglang/srt/configs/model_config.py +37 -7
sglang/srt/configs/qwen3_next.py +326 -0
sglang/srt/connector/__init__.py +1 -1
sglang/srt/connector/base_connector.py +1 -2
sglang/srt/connector/redis.py +2 -2
sglang/srt/connector/serde/__init__.py +1 -1
sglang/srt/connector/serde/safe_serde.py +4 -3
sglang/srt/custom_op.py +11 -1
sglang/srt/debug_utils/dump_comparator.py +81 -44
sglang/srt/debug_utils/dump_loader.py +97 -0
sglang/srt/debug_utils/dumper.py +11 -3
sglang/srt/debug_utils/text_comparator.py +73 -11
sglang/srt/disaggregation/ascend/conn.py +75 -0
sglang/srt/disaggregation/base/conn.py +1 -1
sglang/srt/disaggregation/common/conn.py +15 -12
sglang/srt/disaggregation/decode.py +6 -4
sglang/srt/disaggregation/fake/conn.py +1 -1
sglang/srt/disaggregation/mini_lb.py +6 -420
sglang/srt/disaggregation/mooncake/conn.py +18 -10
sglang/srt/disaggregation/nixl/conn.py +180 -16
sglang/srt/disaggregation/prefill.py +6 -4
sglang/srt/disaggregation/utils.py +5 -50
sglang/srt/distributed/parallel_state.py +94 -58
sglang/srt/entrypoints/engine.py +34 -14
sglang/srt/entrypoints/http_server.py +172 -47
sglang/srt/entrypoints/openai/protocol.py +90 -27
sglang/srt/entrypoints/openai/serving_base.py +6 -2
sglang/srt/entrypoints/openai/serving_chat.py +82 -26
sglang/srt/entrypoints/openai/serving_completions.py +25 -4
sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
sglang/srt/entrypoints/openai/serving_responses.py +7 -4
sglang/srt/eplb/eplb_manager.py +28 -4
sglang/srt/eplb/expert_distribution.py +55 -15
sglang/srt/eplb/expert_location.py +8 -3
sglang/srt/eplb/expert_location_updater.py +1 -1
sglang/srt/function_call/deepseekv31_detector.py +222 -0
sglang/srt/function_call/ebnf_composer.py +11 -9
sglang/srt/function_call/function_call_parser.py +2 -0
sglang/srt/function_call/glm4_moe_detector.py +1 -1
sglang/srt/function_call/gpt_oss_detector.py +144 -256
sglang/srt/function_call/qwen3_coder_detector.py +1 -1
sglang/srt/hf_transformers_utils.py +28 -7
sglang/srt/layers/activation.py +44 -9
sglang/srt/layers/attention/aiter_backend.py +93 -68
sglang/srt/layers/attention/ascend_backend.py +381 -136
sglang/srt/layers/attention/fla/chunk.py +242 -0
sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
sglang/srt/layers/attention/fla/chunk_o.py +178 -0
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
sglang/srt/layers/attention/fla/cumsum.py +300 -0
sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
sglang/srt/layers/attention/fla/index.py +37 -0
sglang/srt/layers/attention/fla/l2norm.py +150 -0
sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
sglang/srt/layers/attention/fla/op.py +66 -0
sglang/srt/layers/attention/fla/solve_tril.py +465 -0
sglang/srt/layers/attention/fla/utils.py +331 -0
sglang/srt/layers/attention/fla/wy_fast.py +158 -0
sglang/srt/layers/attention/flashattention_backend.py +241 -7
sglang/srt/layers/attention/flashinfer_backend.py +11 -6
sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
sglang/srt/layers/attention/intel_amx_backend.py +3 -0
sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
sglang/srt/layers/attention/mamba/mamba.py +64 -0
sglang/srt/layers/attention/torch_native_backend.py +12 -6
sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
sglang/srt/layers/communicator.py +45 -8
sglang/srt/layers/layernorm.py +54 -12
sglang/srt/layers/logits_processor.py +10 -3
sglang/srt/layers/moe/__init__.py +2 -1
sglang/srt/layers/moe/cutlass_moe.py +0 -8
sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
sglang/srt/layers/moe/ep_moe/layer.py +111 -56
sglang/srt/layers/moe/fused_moe_native.py +5 -3
sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
sglang/srt/layers/moe/moe_runner/base.py +274 -1
sglang/srt/layers/moe/moe_runner/runner.py +80 -0
sglang/srt/layers/moe/moe_runner/triton.py +448 -0
sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
sglang/srt/layers/moe/topk.py +43 -12
sglang/srt/layers/moe/utils.py +6 -5
sglang/srt/layers/quantization/awq.py +19 -7
sglang/srt/layers/quantization/base_config.py +11 -6
sglang/srt/layers/quantization/blockwise_int8.py +38 -27
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
sglang/srt/layers/quantization/fp8.py +78 -48
sglang/srt/layers/quantization/fp8_kernel.py +2 -2
sglang/srt/layers/quantization/fp8_utils.py +45 -31
sglang/srt/layers/quantization/gptq.py +25 -17
sglang/srt/layers/quantization/modelopt_quant.py +107 -40
sglang/srt/layers/quantization/moe_wna16.py +21 -18
sglang/srt/layers/quantization/mxfp4.py +93 -68
sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
sglang/srt/layers/quantization/quark/utils.py +97 -0
sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
sglang/srt/layers/quantization/unquant.py +135 -47
sglang/srt/layers/quantization/utils.py +13 -0
sglang/srt/layers/quantization/w4afp8.py +60 -42
sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
sglang/srt/layers/quantization/w8a8_int8.py +83 -41
sglang/srt/layers/rocm_linear_utils.py +44 -0
sglang/srt/layers/rotary_embedding.py +28 -19
sglang/srt/layers/sampler.py +29 -5
sglang/srt/layers/utils.py +0 -14
sglang/srt/lora/backend/base_backend.py +50 -8
sglang/srt/lora/backend/triton_backend.py +90 -2
sglang/srt/lora/layers.py +32 -0
sglang/srt/lora/lora.py +4 -1
sglang/srt/lora/lora_manager.py +35 -112
sglang/srt/lora/mem_pool.py +24 -10
sglang/srt/lora/utils.py +18 -9
sglang/srt/managers/cache_controller.py +396 -365
sglang/srt/managers/data_parallel_controller.py +30 -15
sglang/srt/managers/detokenizer_manager.py +18 -2
sglang/srt/managers/disagg_service.py +46 -0
sglang/srt/managers/io_struct.py +190 -11
sglang/srt/managers/mm_utils.py +6 -1
sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
sglang/srt/managers/schedule_batch.py +27 -44
sglang/srt/managers/schedule_policy.py +4 -3
sglang/srt/managers/scheduler.py +148 -122
sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
sglang/srt/managers/template_manager.py +3 -3
sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
sglang/srt/managers/tokenizer_manager.py +77 -480
sglang/srt/managers/tp_worker.py +16 -4
sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
sglang/srt/mem_cache/allocator.py +1 -1
sglang/srt/mem_cache/chunk_cache.py +1 -1
sglang/srt/mem_cache/hicache_storage.py +53 -40
sglang/srt/mem_cache/hiradix_cache.py +196 -104
sglang/srt/mem_cache/lora_radix_cache.py +1 -1
sglang/srt/mem_cache/memory_pool.py +395 -53
sglang/srt/mem_cache/memory_pool_host.py +27 -19
sglang/srt/mem_cache/radix_cache.py +6 -6
sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
sglang/srt/mem_cache/swa_radix_cache.py +1 -3
sglang/srt/metrics/collector.py +484 -63
sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
sglang/srt/metrics/utils.py +48 -0
sglang/srt/model_executor/cpu_graph_runner.py +640 -0
sglang/srt/model_executor/cuda_graph_runner.py +13 -5
sglang/srt/model_executor/forward_batch_info.py +72 -18
sglang/srt/model_executor/model_runner.py +190 -32
sglang/srt/model_loader/__init__.py +9 -3
sglang/srt/model_loader/loader.py +33 -28
sglang/srt/model_loader/utils.py +12 -0
sglang/srt/model_loader/weight_utils.py +2 -1
sglang/srt/models/deepseek_v2.py +323 -53
sglang/srt/models/gemma3n_mm.py +1 -1
sglang/srt/models/glm4_moe.py +10 -1
sglang/srt/models/glm4v.py +4 -2
sglang/srt/models/gpt_oss.py +7 -19
sglang/srt/models/internvl.py +28 -0
sglang/srt/models/llama4.py +9 -0
sglang/srt/models/llama_eagle3.py +17 -0
sglang/srt/models/longcat_flash.py +1026 -0
sglang/srt/models/longcat_flash_nextn.py +699 -0
sglang/srt/models/minicpmv.py +165 -3
sglang/srt/models/mllama4.py +25 -0
sglang/srt/models/opt.py +637 -0
sglang/srt/models/qwen2.py +33 -3
sglang/srt/models/qwen2_5_vl.py +91 -42
sglang/srt/models/qwen2_moe.py +79 -14
sglang/srt/models/qwen3.py +8 -2
sglang/srt/models/qwen3_moe.py +39 -8
sglang/srt/models/qwen3_next.py +1039 -0
sglang/srt/models/qwen3_next_mtp.py +109 -0
sglang/srt/models/torch_native_llama.py +1 -1
sglang/srt/models/transformers.py +1 -1
sglang/srt/multimodal/processors/base_processor.py +4 -2
sglang/srt/multimodal/processors/glm4v.py +9 -9
sglang/srt/multimodal/processors/internvl.py +141 -129
sglang/srt/{conversation.py → parser/conversation.py} +38 -5
sglang/srt/parser/harmony_parser.py +588 -0
sglang/srt/parser/reasoning_parser.py +309 -0
sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
sglang/srt/sampling/sampling_batch_info.py +18 -15
sglang/srt/server_args.py +307 -80
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
sglang/srt/speculative/eagle_worker.py +216 -120
sglang/srt/speculative/spec_info.py +5 -0
sglang/srt/speculative/standalone_worker.py +109 -0
sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
sglang/srt/utils.py +96 -7
sglang/srt/weight_sync/utils.py +1 -1
sglang/test/attention/test_trtllm_mla_backend.py +181 -8
sglang/test/few_shot_gsm8k.py +1 -0
sglang/test/runners.py +4 -0
sglang/test/test_cutlass_moe.py +24 -6
sglang/test/test_cutlass_w4a8_moe.py +24 -9
sglang/test/test_disaggregation_utils.py +66 -0
sglang/test/test_utils.py +25 -1
sglang/utils.py +5 -0
sglang/version.py +1 -1
{sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
{sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
sglang/srt/disaggregation/launch_lb.py +0 -131
sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
sglang/srt/reasoning_parser.py +0 -553
sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
{sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
{sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0

sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py CHANGED Viewed

@@ -11,6 +11,8 @@ import torch
 from compressed_tensors import CompressionFormat
 from compressed_tensors.quantization import QuantizationStrategy
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
 from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
 from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant
 from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
@@ -30,8 +32,10 @@ from sglang.srt.utils import (
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
-    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
-    from sglang.srt.layers.moe.topk import TopKOutput
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
     from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
         CompressedTensorsConfig,
     )
@@ -293,14 +297,24 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 )
                 torch.cuda.empty_cache()
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
     def apply(
         self,
         layer: torch.nn.Module,
-        x: torch.Tensor,
-        topk_output: TopKOutput,
-        moe_runner_config: MoeRunnerConfig,
-    ) -> torch.Tensor:
-        from sglang.srt.layers.moe.fused_moe_triton import fused_experts
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+        moe_runner_config = self.moe_runner_config
         if (
             _use_aiter
@@ -308,7 +322,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             and moe_runner_config.apply_router_weight_on_input
         ):
             topk_weights, topk_ids, _ = topk_output
-            return rocm_fused_experts_tkw1(
+            output = rocm_fused_experts_tkw1(
                 hidden_states=x,
                 w1=layer.w13_weight,
                 w2=layer.w2_weight,
@@ -324,21 +338,20 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 a1_scale=layer.w13_input_scale,
                 a2_scale=layer.w2_input_scale,
             )
+            return StandardCombineInput(hidden_states=output)
         else:
-            return fused_experts(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_output=topk_output,
-                moe_runner_config=moe_runner_config,
+            quant_info = TritonMoeQuantInfo(
+                w13_weight=layer.w13_weight,
+                w2_weight=layer.w2_weight,
                 use_fp8_w8a8=True,
                 per_channel_quant=self.weight_quant.strategy
                 == QuantizationStrategy.CHANNEL,
-                w1_scale=layer.w13_weight_scale,
+                w13_scale=layer.w13_weight_scale,
                 w2_scale=layer.w2_weight_scale,
-                a1_scale=layer.w13_input_scale,
+                a13_scale=layer.w13_input_scale,
                 a2_scale=layer.w2_input_scale,
             )
+            return self.runner.run(dispatch_output, quant_info)
 class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
@@ -380,8 +393,6 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             params_dtype == torch.float16
         ), "float16 is required for MoE compressed models. Set dtype=torch.float16"  # noqa: E501
-        intermediate_size_full = extra_weight_attrs.pop("intermediate_size_full")
         # Will transpose the loaded weight along the
         # intermediate and hidden dim sizes. Will
         # shard for TP along the transposed dims
@@ -415,13 +426,13 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         # In the case where we have actorder/g_idx,
         # we do not partition the w2 scales
         load_full_w2 = self.actorder and self.group_size != -1
-        w2_scales_size = (
-            intermediate_size_full if load_full_w2 else intermediate_size_per_partition
-        )
-        self.is_k_full = (not self.actorder) or (
-            intermediate_size_per_partition == intermediate_size_full
-        )
+        if load_full_w2:
+            w2_scales_size = intermediate_size_per_partition * layer.moe_tp_size
+        else:
+            w2_scales_size = intermediate_size_per_partition
+        self.is_k_full = (not self.actorder) or layer.moe_tp_size == 1
         if self.strategy == "channel":
             num_groups_w2 = num_groups_w13 = 1
@@ -640,21 +651,29 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         )
         replace_tensor("w2_weight_scale", marlin_w2_scales)
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
     def apply(
         self,
         layer: torch.nn.Module,
-        x: torch.Tensor,
-        topk_output: TopKOutput,
-        moe_runner_config: MoeRunnerConfig,
-    ) -> torch.Tensor:
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
         assert (
-            moe_runner_config.activation == "silu"
+            self.moe_runner_config.activation == "silu"
         ), "Only SiLU activation is supported."
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
         topk_weights, topk_ids, router_logits = topk_output
-        return torch.ops.vllm.fused_marlin_moe(
+        output = torch.ops.vllm.fused_marlin_moe(
             x,
             layer.w13_weight_packed,
             layer.w2_weight_packed,
@@ -670,3 +689,4 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             num_bits=self.num_bits,
             is_k_full=self.is_k_full,
         )
+        return StandardCombineInput(hidden_states=output)

sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py CHANGED Viewed

@@ -21,9 +21,15 @@ from sglang.srt.layers.quantization.fp8_utils import (
     normalize_e4m3fn_to_e4m3fnuz,
 )
 from sglang.srt.layers.quantization.utils import requantize_with_max_scale
+from sglang.srt.utils import get_bool_env_var, is_hip
 __all__ = ["CompressedTensorsW8A8Fp8"]
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+if _use_aiter:
+    from aiter.ops.shuffle import shuffle_weight
 class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
@@ -76,7 +82,13 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
             else:
                 weight_scale = layer.weight_scale.data
-            layer.weight = Parameter(weight.t(), requires_grad=False)
+            if _use_aiter:
+                layer.weight = Parameter(
+                    shuffle_weight(weight, (16, 16)), requires_grad=False
+                )
+            else:
+                layer.weight = Parameter(weight.t(), requires_grad=False)
             # required by torch.compile to be torch.nn.Parameter
             layer.weight_scale = Parameter(weight_scale, requires_grad=False)

sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py CHANGED Viewed

@@ -1,26 +1,22 @@
 import logging
 import os
 from contextlib import contextmanager
-from dataclasses import dataclass
 from enum import IntEnum, auto
-from typing import Callable, Dict, List, Optional, Tuple
+from typing import Dict, List, Tuple
-from tqdm.contrib.concurrent import thread_map
+import torch
+from tqdm import tqdm
 from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
-    DEEPGEMM_BLACKWELL,
     ENABLE_JIT_DEEPGEMM,
 )
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_bool_env_var, get_int_env_var
+from sglang.srt.utils import ceil_div, get_bool_env_var, get_int_env_var
 logger = logging.getLogger(__name__)
-if ENABLE_JIT_DEEPGEMM and not DEEPGEMM_BLACKWELL:
-    from deep_gemm import get_num_sms
-    from deep_gemm.jit import build
-    from deep_gemm.jit_kernels.gemm import get_best_configs
-    from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType
+if ENABLE_JIT_DEEPGEMM:
+    import deep_gemm
 _BUILTIN_M_LIST = list(range(1, 1024 * 16 + 1))
@@ -40,19 +36,7 @@ os.environ["DG_JIT_CACHE_DIR"] = os.getenv(
 # Refer to https://github.com/deepseek-ai/DeepGEMM/commit/d75b218b7b8f4a5dd5406ac87905039ead3ae42f
 # NVRTC may have performance loss with some cases.
 # And NVCC JIT speed is also 9x faster in the ref commit
-_USE_NVRTC_DEFAULT = "0"
-if ENABLE_JIT_DEEPGEMM:
-    try:
-        from deep_gemm.jit.compiler import get_nvcc_compiler
-        get_nvcc_compiler()
-    except:
-        logger.warning(
-            "NVCC Compiler not found, use NVRTC for DeepGEMM JIT "
-            "and may have performance loss with some cases."
-        )
-        _USE_NVRTC_DEFAULT = "1"
-os.environ["DG_JIT_USE_NVRTC"] = os.getenv("SGL_DG_USE_NVRTC", _USE_NVRTC_DEFAULT)
+os.environ["DG_JIT_USE_NVRTC"] = os.getenv("SGL_DG_USE_NVRTC", "0")
 def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
@@ -75,7 +59,7 @@ def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
     # Default each rank will try compile all Ms to
     # load all symbols at the launch stages.
     # Avoid loading symbols at the serving stages.
-    _DO_COMPILE_ALL = _IS_FIRST_RANK_ON_NODE or not _IN_PRECOMPILE_STAGE
+    _DO_COMPILE_ALL = _IS_FIRST_RANK_ON_NODE
 class DeepGemmKernelType(IntEnum):
@@ -84,185 +68,15 @@ class DeepGemmKernelType(IntEnum):
     GEMM_NT_F8F8BF16 = auto()
-@dataclass
-class DeepGemmKernelHelper:
-    name: str
-    compile_func: Callable[
-        [
-            int,
-            int,
-            int,
-            Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
-        ],
-        None,
-    ]
-    configure_func: Callable[
-        [int, int, int, int, int],
-        Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
-    ]
 _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dict()
-# TODO improve naming
-def _compile_warning_1():
-    if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
-        logger.warning(
-            "Entering DeepGEMM JIT Pre-Compile session. "
-            "It may takes a long time (typically 10-20 mins) "
-            "if you have not run `sglang.compile_deep_gemm`. "
-            "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
-            " for pre-compilation to reduce the overhead if you have not run it before. "
-            "For example: "
-            "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
-        )
-# TODO improve naming
-def _compile_warning_2():
-    logger.warning(
-        "Entering DeepGEMM JIT Single Kernel Compile session. "
-        "And it will makes inference throughput becomes flaky. "
-        "Please run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
-        " for pre-compilation to solve this issue. "
-        "For example: "
-        "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
-    )
-def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
-    n: int,
-    k: int,
-    num_groups: int,
-    config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
-) -> None:
-    num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
-    block_k = 128
-    num_tma_threads = 128
-    num_math_threads_per_group = 128
-    kwargs = {
-        "GEMM_TYPE": GemmType.GroupedMasked,
-        "NUM_TMA_THREADS": num_tma_threads,
-        "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
-        "N": n,
-        "K": k,
-        "NUM_GROUPS": num_groups,
-        "BLOCK_M": block_m,
-        "BLOCK_N": block_n,
-        "BLOCK_K": block_k,
-        "SWIZZLE_D_MODE": smem_config[1],
-        "BLOCK_N_PADDING": smem_config[2],
-        "NUM_STAGES": num_stages,
-        "NUM_TMA_MULTICAST": tma_multicast_config[0],
-        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-        "NUM_SMS": num_sms,
-        "SMEM_SIZE": smem_config[0],
-    }
-    code = FP8GemmRuntime.generate(kwargs)
-    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
-def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
-    n: int,
-    k: int,
-    num_groups: int,
-    config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
-) -> None:
-    num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
-    block_k = 128
-    num_tma_threads = 128
-    num_math_threads_per_group = 128
-    kwargs = {
-        "GEMM_TYPE": GemmType.GroupedContiguous,
-        "NUM_TMA_THREADS": num_tma_threads,
-        "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
-        "N": n,
-        "K": k,
-        "NUM_GROUPS": 1,
-        "BLOCK_M": block_m,
-        "BLOCK_N": block_n,
-        "BLOCK_K": block_k,
-        "SWIZZLE_D_MODE": smem_config[1],
-        "BLOCK_N_PADDING": smem_config[2],
-        "NUM_STAGES": num_stages,
-        "NUM_TMA_MULTICAST": tma_multicast_config[0],
-        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-        "NUM_SMS": num_sms,
-        "SMEM_SIZE": smem_config[0],
-    }
-    code = FP8GemmRuntime.generate(kwargs)
-    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
-def _compile_gemm_nt_f8f8bf16_one(
-    n: int,
-    k: int,
-    _: int,  # _ is a dummy parameter to align with other interfaces
-    config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
-) -> None:
-    num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
-    block_k = 128
-    num_tma_threads = 128
-    num_math_threads_per_group = 128
-    kwargs = {
-        "GEMM_TYPE": GemmType.Normal,
-        "NUM_TMA_THREADS": num_tma_threads,
-        "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
-        "N": n,
-        "K": k,
-        "NUM_GROUPS": 1,
-        "BLOCK_M": block_m,
-        "BLOCK_N": block_n,
-        "BLOCK_K": block_k,
-        "SWIZZLE_D_MODE": smem_config[1],
-        "BLOCK_N_PADDING": smem_config[2],
-        "NUM_STAGES": num_stages,
-        "NUM_TMA_MULTICAST": tma_multicast_config[0],
-        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-        "NUM_SMS": num_sms,
-        "SMEM_SIZE": smem_config[0],
-    }
-    code = FP8GemmRuntime.generate(kwargs)
-    _ = build("gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
-# TODO further refactor warmup-related
-_KERNEL_HELPER_DICT: Dict[DeepGemmKernelType, DeepGemmKernelHelper] = {
-    DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED: DeepGemmKernelHelper(
-        name="m_grouped_gemm_fp8_fp8_bf16_nt_masked",
-        compile_func=_compile_grouped_gemm_nt_f8f8bf16_masked_one,
-        configure_func=lambda m, n, k, num_groups, num_sms: get_best_configs(
-            m, n, k, num_groups, num_sms, is_grouped_masked=True
-        ),
-    ),
-    DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG: DeepGemmKernelHelper(
-        name="m_grouped_gemm_fp8_fp8_bf16_nt_contiguous",
-        compile_func=_compile_grouped_gemm_nt_f8f8bf16_contig_one,
-        configure_func=lambda m, n, k, _, num_sms: get_best_configs(
-            m, n, k, 1, num_sms, is_grouped_contiguous=True
-        ),
-    ),
-    DeepGemmKernelType.GEMM_NT_F8F8BF16: DeepGemmKernelHelper(
-        name="gemm_fp8_fp8_bf16_nt",
-        compile_func=_compile_gemm_nt_f8f8bf16_one,
-        configure_func=lambda m, n, k, _, num_sms: get_best_configs(
-            m, n, k, 1, num_sms
-        ),
-    ),
-}
+# TODO improve code
 def _maybe_compile_deep_gemm_one_type_all(
     kernel_type: DeepGemmKernelType,
     n: int,
     k: int,
     num_groups: int,
-    m_list: Optional[List[int]] = None,
 ) -> None:
     global _INITIALIZATION_DICT
     global _BUILTIN_M_LIST
@@ -275,61 +89,153 @@ def _maybe_compile_deep_gemm_one_type_all(
     ):
         _INITIALIZATION_DICT[query_key] = True
-        kernel_helper = _KERNEL_HELPER_DICT[kernel_type]
-        _compile_warning_1()
+        # TODO maybe improve logs
+        if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
+            logger.warning(
+                "Entering DeepGEMM JIT Pre-Compile session. "
+                "It may take a long time (typically 10-20 mins) "
+                "if you have not run `sglang.compile_deep_gemm`. "
+                "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
+                " for pre-compilation to reduce the overhead if you have not run it before. "
+                "For example: "
+                "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
+            )
         logger.info(
             f"Try DeepGEMM JIT Compiling for "
-            f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
+            f"<{kernel_type.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
             f"{' It only takes a little time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
         )
-        # NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced
-        num_sms = get_num_sms()
-        collected_configs = set()
-        for m in m_list if m_list is not None else _BUILTIN_M_LIST:
-            # Put config into set to get unique configs and reduce cases to be compiled
-            collected_configs.add(
-                kernel_helper.configure_func(m, n, k, num_groups, num_sms)
-            )
-        compile_func = lambda config: kernel_helper.compile_func(
-            n, k, num_groups, config
+        _compile_deep_gemm_one_type_all(
+            kernel_type=kernel_type,
+            n=n,
+            k=k,
+            num_groups=num_groups,
+            m_list=_BUILTIN_M_LIST,
         )
-        thread_map(compile_func, collected_configs, max_workers=_COMPILE_WORKERS)
-@contextmanager
-def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
-    if _IN_PRECOMPILE_STAGE:
-        yield
-        return
+# NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced
+def _compile_deep_gemm_one_type_all(
+    kernel_type: DeepGemmKernelType,
+    n: int,
+    k: int,
+    num_groups: int,
+    m_list: List[int],
+) -> None:
+    if kernel_type == DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG:
+        m_alignment = deep_gemm.get_mk_alignment_for_contiguous_layout()
+        m_list = sorted(list(set(m for m in m_list if m % m_alignment == 0)))
+    executor = _BaseWarmupExecutor.create(
+        kernel_type, max_m=max(m_list), n=n, k=k, num_groups=num_groups
+    )
-    from deep_gemm.jit.runtime import RuntimeCache
+    old_compile_mode = deep_gemm.get_compile_mode()
+    deep_gemm.set_compile_mode(1)
+    # TODO can use multi thread
+    for m in tqdm(m_list, desc=f"DeepGEMM warmup"):
+        executor.execute(m=m)
+    deep_gemm.set_compile_mode(old_compile_mode)
+    # clean up input buffers
+    torch.cuda.current_stream().synchronize()
+    del executor
+    torch.cuda.empty_cache()
+class _BaseWarmupExecutor:
+    @staticmethod
+    def create(kernel_type: DeepGemmKernelType, **kwargs):
+        return {
+            DeepGemmKernelType.GEMM_NT_F8F8BF16: _NormalWarmupExecutor,
+            DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG: _GroupedContWarmupExecutor,
+            DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED: _GroupedMaskedWarmupExecutor,
+        }[kernel_type](**kwargs)
+    def execute(self, m):
+        raise NotImplementedError
+def _empty_token_fp8(size):
+    *dims, k = size
+    return (
+        torch.empty(size, device="cuda", dtype=torch.float8_e4m3fn),
+        torch.empty(
+            (*dims, ceil_div(k, _BLOCK_SIZE)), device="cuda", dtype=torch.float32
+        ),
+    )
-    origin_func = RuntimeCache.get
-    def __patched_func(self, *args, **kwargs):
-        ret = origin_func(self, *args, **kwargs)
-        if ret is None:
-            kernel_helper = _KERNEL_HELPER_DICT[kernel_type]
-            if not DEEPGEMM_BLACKWELL:
-                _compile_warning_2()
-            logger.warning(
-                f"DeepGEMM JIT Compiling for <{kernel_helper.name}> M={M}, N={N}, K={K}. Please wait."
-            )
-        return ret
+def _empty_block_fp8(size):
+    *dims, n, k = size
+    return (
+        torch.empty(size, device="cuda", dtype=torch.float8_e4m3fn),
+        torch.empty(
+            (*dims, ceil_div(n, _BLOCK_SIZE), ceil_div(k, _BLOCK_SIZE)),
+            device="cuda",
+            dtype=torch.float32,
+        ),
+    )
-    RuntimeCache.get = __patched_func
-    yield
-    RuntimeCache.get = origin_func
+_BLOCK_SIZE = 128
+class _NormalWarmupExecutor(_BaseWarmupExecutor):
+    def __init__(self, max_m: int, n: int, k: int, num_groups: int):
+        self.lhs_q, self.lhs_s = _empty_token_fp8((max_m, k))
+        self.rhs_q, self.rhs_s = _empty_block_fp8((n, k))
+        self.out = torch.empty((max_m, n), device="cuda", dtype=torch.bfloat16)
+    def execute(self, m):
+        deep_gemm.fp8_gemm_nt(
+            (self.lhs_q[:m], self.lhs_s[:m]),
+            (self.rhs_q, self.rhs_s),
+            self.out[:m],
+        )
+class _GroupedContWarmupExecutor(_BaseWarmupExecutor):
+    def __init__(self, max_m: int, n: int, k: int, num_groups: int):
+        self.lhs_q, self.lhs_s = _empty_token_fp8((max_m, k))
+        self.rhs_q, self.rhs_s = _empty_block_fp8((num_groups, n, k))
+        self.m_indices = torch.zeros((max_m,), device="cuda", dtype=torch.int32)
+        self.out = torch.empty((max_m, n), device="cuda", dtype=torch.bfloat16)
+    def execute(self, m):
+        deep_gemm.m_grouped_fp8_gemm_nt_contiguous(
+            (self.lhs_q[:m], self.lhs_s[:m]),
+            (self.rhs_q, self.rhs_s),
+            self.out[:m],
+            m_indices=self.m_indices[:m],
+        )
+class _GroupedMaskedWarmupExecutor(_BaseWarmupExecutor):
+    def __init__(self, max_m: int, n: int, k: int, num_groups: int):
+        self.lhs_q, self.lhs_s = _empty_token_fp8((num_groups, max_m, k))
+        self.rhs_q, self.rhs_s = _empty_block_fp8((num_groups, n, k))
+        self.masked_m = torch.zeros((num_groups,), device="cuda", dtype=torch.int32)
+        self.out = torch.empty(
+            (num_groups, max_m, n), device="cuda", dtype=torch.bfloat16
+        )
+    def execute(self, m):
+        deep_gemm.fp8_m_grouped_gemm_nt_masked(
+            (self.lhs_q, self.lhs_s),
+            (self.rhs_q, self.rhs_s),
+            self.out,
+            masked_m=self.masked_m,
+            # DeepGEMM uses `expect_m` instead of input shape for `get_best_config`
+            expected_m=m,
+        )
 @contextmanager
 def deep_gemm_execution_hook(
     m: int, n: int, k: int, num_groups: int, kernel_type: DeepGemmKernelType
 ):
-    # not supported yet
-    if not DEEPGEMM_BLACKWELL:
-        _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
-    with _log_jit_build(m, n, k, kernel_type):
-        yield
+    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
+    yield

sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py CHANGED Viewed

@@ -11,9 +11,6 @@ def _compute_enable_deep_gemm():
     sm_version = get_device_sm()
     if sm_version < 90:
         return False
-    # TODO fix deepgemm cu129 fp8 issue
-    if torch.version.cuda == "12.9":
-        return False
     try:
         import deep_gemm
@@ -24,14 +21,12 @@ def _compute_enable_deep_gemm():
     return get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true")
-ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
+def _is_blackwell_arch() -> bool:
+    major, minor = torch.cuda.get_device_capability(torch.cuda.current_device())
+    return major == 10
-try:
-    from deep_gemm import fp8_gemm_nt
-    # They have not given a name to this breaking change
-    DEEPGEMM_BLACKWELL = True
-except ImportError:
-    DEEPGEMM_BLACKWELL = False
+ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
+DEEPGEMM_BLACKWELL = ENABLE_JIT_DEEPGEMM and _is_blackwell_arch()
 DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL

sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl

sglang 0.5.1.post2py3-none-any.whl → 0.5.2py3-none-any.whl