sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -1
- sglang/eval/loogle_eval.py +7 -0
- sglang/srt/configs/deepseekvl2.py +11 -2
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +9 -7
- sglang/srt/configs/update_config.py +3 -1
- sglang/srt/conversation.py +1 -0
- sglang/srt/custom_op.py +5 -2
- sglang/srt/disaggregation/decode.py +9 -1
- sglang/srt/disaggregation/mooncake/conn.py +44 -56
- sglang/srt/distributed/parallel_state.py +33 -0
- sglang/srt/entrypoints/engine.py +30 -26
- sglang/srt/entrypoints/openai/serving_chat.py +21 -2
- sglang/srt/eplb/expert_location_dispatch.py +1 -1
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/qwen3_detector.py +150 -0
- sglang/srt/hf_transformers_utils.py +0 -1
- sglang/srt/layers/activation.py +13 -0
- sglang/srt/layers/attention/flashattention_backend.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +40 -1
- sglang/srt/layers/linear.py +13 -102
- sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
- sglang/srt/layers/moe/ep_moe/layer.py +23 -402
- sglang/srt/layers/moe/fused_moe_native.py +7 -47
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +35 -45
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
- sglang/srt/layers/moe/topk.py +187 -12
- sglang/srt/layers/quantization/__init__.py +20 -134
- sglang/srt/layers/quantization/awq.py +578 -11
- sglang/srt/layers/quantization/awq_triton.py +339 -0
- sglang/srt/layers/quantization/base_config.py +85 -10
- sglang/srt/layers/quantization/blockwise_int8.py +17 -55
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +24 -73
- sglang/srt/layers/quantization/fp8.py +273 -62
- sglang/srt/layers/quantization/fp8_kernel.py +210 -46
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gptq.py +501 -143
- sglang/srt/layers/quantization/marlin_utils.py +790 -0
- sglang/srt/layers/quantization/modelopt_quant.py +26 -108
- sglang/srt/layers/quantization/moe_wna16.py +45 -49
- sglang/srt/layers/quantization/petit.py +252 -0
- sglang/srt/layers/quantization/petit_utils.py +104 -0
- sglang/srt/layers/quantization/qoq.py +7 -6
- sglang/srt/layers/quantization/scalar_type.py +352 -0
- sglang/srt/layers/quantization/unquant.py +422 -0
- sglang/srt/layers/quantization/utils.py +343 -3
- sglang/srt/layers/quantization/w4afp8.py +8 -4
- sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
- sglang/srt/layers/quantization/w8a8_int8.py +51 -115
- sglang/srt/layers/vocab_parallel_embedding.py +1 -41
- sglang/srt/lora/lora.py +0 -4
- sglang/srt/lora/lora_manager.py +87 -53
- sglang/srt/lora/mem_pool.py +81 -33
- sglang/srt/lora/utils.py +12 -5
- sglang/srt/managers/cache_controller.py +241 -0
- sglang/srt/managers/io_struct.py +41 -29
- sglang/srt/managers/mm_utils.py +7 -8
- sglang/srt/managers/schedule_batch.py +150 -110
- sglang/srt/managers/schedule_policy.py +68 -27
- sglang/srt/managers/scheduler.py +243 -61
- sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
- sglang/srt/managers/tokenizer_manager.py +11 -3
- sglang/srt/managers/tp_worker.py +14 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/allocator.py +7 -16
- sglang/srt/mem_cache/base_prefix_cache.py +14 -2
- sglang/srt/mem_cache/chunk_cache.py +5 -2
- sglang/srt/mem_cache/hicache_storage.py +152 -0
- sglang/srt/mem_cache/hiradix_cache.py +179 -4
- sglang/srt/mem_cache/memory_pool.py +16 -1
- sglang/srt/mem_cache/memory_pool_host.py +41 -2
- sglang/srt/mem_cache/radix_cache.py +26 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +5 -6
- sglang/srt/model_executor/forward_batch_info.py +14 -1
- sglang/srt/model_executor/model_runner.py +109 -22
- sglang/srt/model_loader/loader.py +7 -1
- sglang/srt/model_loader/utils.py +4 -4
- sglang/srt/models/clip.py +1 -1
- sglang/srt/models/deepseek.py +9 -6
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +191 -171
- sglang/srt/models/deepseek_vl2.py +5 -5
- sglang/srt/models/gemma.py +48 -0
- sglang/srt/models/gemma2.py +52 -0
- sglang/srt/models/gemma3_causal.py +63 -0
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -4
- sglang/srt/models/granitemoe.py +385 -0
- sglang/srt/models/grok.py +9 -3
- sglang/srt/models/hunyuan.py +63 -16
- sglang/srt/models/internvl.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -1
- sglang/srt/models/llama.py +41 -0
- sglang/srt/models/llama4.py +11 -11
- sglang/srt/models/llava.py +2 -2
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +0 -2
- sglang/srt/models/minicpmo.py +3 -7
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mixtral.py +9 -2
- sglang/srt/models/mllama.py +3 -5
- sglang/srt/models/mllama4.py +3 -3
- sglang/srt/models/olmoe.py +8 -5
- sglang/srt/models/persimmon.py +330 -0
- sglang/srt/models/phi.py +321 -0
- sglang/srt/models/phi4mm.py +44 -4
- sglang/srt/models/phi4mm_audio.py +1260 -0
- sglang/srt/models/phi4mm_utils.py +1917 -0
- sglang/srt/models/phimoe.py +9 -3
- sglang/srt/models/qwen.py +37 -0
- sglang/srt/models/qwen2.py +41 -0
- sglang/srt/models/qwen2_5_vl.py +4 -4
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +53 -5
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/qwen3.py +65 -1
- sglang/srt/models/qwen3_moe.py +56 -18
- sglang/srt/models/vila.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +91 -97
- sglang/srt/multimodal/processors/clip.py +21 -19
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
- sglang/srt/multimodal/processors/gemma3.py +13 -17
- sglang/srt/multimodal/processors/gemma3n.py +19 -23
- sglang/srt/multimodal/processors/internvl.py +9 -10
- sglang/srt/multimodal/processors/janus_pro.py +12 -27
- sglang/srt/multimodal/processors/kimi_vl.py +12 -14
- sglang/srt/multimodal/processors/llava.py +4 -2
- sglang/srt/multimodal/processors/minicpm.py +35 -44
- sglang/srt/multimodal/processors/mlama.py +21 -18
- sglang/srt/multimodal/processors/mllama4.py +4 -5
- sglang/srt/multimodal/processors/phi4mm.py +63 -39
- sglang/srt/multimodal/processors/pixtral.py +14 -35
- sglang/srt/multimodal/processors/qwen_audio.py +65 -0
- sglang/srt/multimodal/processors/qwen_vl.py +16 -21
- sglang/srt/multimodal/processors/vila.py +14 -14
- sglang/srt/sampling/sampling_params.py +8 -1
- sglang/srt/server_args.py +393 -230
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +9 -1
- sglang/srt/two_batch_overlap.py +1 -0
- sglang/srt/utils.py +27 -1
- sglang/test/runners.py +14 -3
- sglang/test/test_block_fp8.py +8 -3
- sglang/test/test_block_fp8_ep.py +1 -1
- sglang/test/test_custom_ops.py +12 -7
- sglang/test/test_cutlass_w4a8_moe.py +1 -3
- sglang/test/test_fp4_moe.py +1 -3
- sglang/test/test_marlin_moe.py +286 -0
- sglang/test/test_marlin_utils.py +171 -0
- sglang/test/test_utils.py +35 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/METADATA +8 -8
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/RECORD +166 -146
- sglang/srt/layers/quantization/quant_utils.py +0 -166
- sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py
CHANGED
@@ -84,7 +84,15 @@ class EAGLEDraftExtendCudaGraphRunner:
         self.hidden_states = torch.zeros(
             (
                 self.max_num_token,
-                self.model_runner.model_config.hidden_size * 3,
+                (
+                    self.model_runner.model_config.hf_config.target_hidden_size
+                    * 3
+                    if hasattr(
+                        self.model_runner.model_config.hf_config,
+                        "target_hidden_size",
+                    )
+                    else self.model_runner.model_config.hidden_size * 3
+                ),
             ),
             dtype=self.model_runner.dtype,
         )
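The hunk above sizes the draft-extend hidden-states buffer from `target_hidden_size` when the HF config defines one, falling back to the model's own `hidden_size` otherwise, both multiplied by 3 as in the hunk. A minimal standalone illustration of that selection rule, using a stub config object rather than sglang's ModelConfig:

import torch

# Stand-in for an HF config; real configs may or may not define target_hidden_size.
class _StubHfConfig:
    hidden_size = 2048
    target_hidden_size = 4096  # present here, so it wins below

cfg = _StubHfConfig()
max_num_token = 8

# Same conditional as in the hunk: prefer target_hidden_size * 3, else hidden_size * 3.
hidden_dim = (
    cfg.target_hidden_size * 3
    if hasattr(cfg, "target_hidden_size")
    else cfg.hidden_size * 3
)
hidden_states = torch.zeros((max_num_token, hidden_dim), dtype=torch.float16)
assert hidden_states.shape == (8, 4096 * 3)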
sglang/srt/two_batch_overlap.py
CHANGED
@@ -500,6 +500,7 @@ class TboForwardBatchPreparer:
             "capture_hidden_mode",
             "padded_static_len",
             "mrope_positions",  # only used by qwen2-vl, thus not care
+            "split_index",  # for split prefill
         ]:
             output_dict[key] = getattr(batch, key)
         if not batch.forward_mode.is_target_verify():
sglang/srt/utils.py
CHANGED
@@ -691,12 +691,17 @@ def decode_video_base64(video_base64):
     )  # Return an empty array and size tuple if no frames were found


-def load_audio(
+def load_audio(
+    audio_file: str, sr: Optional[int] = None, mono: bool = True
+) -> np.ndarray:
     # Use soundfile here, since librosa use it under the hood,
     # and librosa will not support audio loading in the future
     import soundfile as sf
     from scipy.signal import resample

+    if sr is None:
+        sr = 16000
+
     # Load audio data
     if isinstance(audio_file, bytes):
         audio, original_sr = sf.read(BytesIO(audio_file))
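The new `load_audio` signature makes the target sample rate optional and defaults it to 16 kHz. The rest of the function body is not shown in this hunk, so the following is only a sketch of the behavior the visible lines imply (bytes vs. path input, soundfile loading, scipy resampling); the helper name and the downmix/resample details are assumptions:

from io import BytesIO
from typing import Optional, Union

import numpy as np
import soundfile as sf
from scipy.signal import resample


def load_audio_sketch(
    audio_file: Union[str, bytes], sr: Optional[int] = None, mono: bool = True
) -> np.ndarray:
    # Mirror the new default: an unspecified sample rate falls back to 16 kHz.
    if sr is None:
        sr = 16000
    # Accept raw bytes or a path, as the hunk's isinstance check suggests.
    if isinstance(audio_file, bytes):
        audio, original_sr = sf.read(BytesIO(audio_file))
    else:
        audio, original_sr = sf.read(audio_file)
    # Assumed post-processing: downmix to mono and resample to the target rate.
    if mono and audio.ndim > 1:
        audio = audio.mean(axis=1)
    if original_sr != sr:
        audio = resample(audio, int(len(audio) * sr / original_sr))
    return audio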
@@ -1417,6 +1422,13 @@ def get_nvgpu_memory_capacity():
         ]

         if not memory_values:
+            # Fallback to torch.cuda.mem_get_info() when failed to get memory capacity from nvidia-smi,
+            # typically in NVIDIA MIG mode.
+            if torch.cuda.is_available():
+                logger.warning(
+                    "Failed to get GPU memory capacity from nvidia-smi, falling back to torch.cuda.mem_get_info()."
+                )
+                return torch.cuda.mem_get_info()[1] // 1024 // 1024  # unit: MB
             raise ValueError("No GPU memory values found.")

         # Return the minimum memory value
@@ -2880,3 +2892,17 @@ def parse_module_path(module_path, function_name, create_dummy):
             return final_module, getattr(final_module, function_name)

     return final_module, None
+
+
+# LoRA-related constants and utilities
+SUPPORTED_LORA_TARGET_MODULES = [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+]
+
+LORA_TARGET_ALL_MODULES = "all"
sglang/test/runners.py
CHANGED
@@ -134,10 +134,12 @@ class HFRunner:
         model_type: str = "generation",
         output_str_only: bool = False,
         trust_remote_code: bool = False,
+        patch_model_do_sample_false: bool = False,
     ):
         self.model_type = model_type
         self.output_str_only = output_str_only
         self.trust_remote_code = trust_remote_code
+        self.patch_model_do_sample_false = patch_model_do_sample_false

         self.in_queue = mp.Queue()
         self.out_queue = mp.Queue()
@@ -292,6 +294,7 @@ class HFRunner:
                         torch_dtype=torch_dtype,
                         output_str_only=self.output_str_only,
                         token_ids_logprob=token_ids_logprob,
+                        patch_model_do_sample_false=self.patch_model_do_sample_false,
                     )
                 )
             elif self.model_type == "embedding":
@@ -380,6 +383,7 @@ class HFRunner:
         lora_paths: Optional[List[str]] = None,
         output_str_only: bool = False,
         token_ids_logprob: Optional[int] = None,
+        patch_model_do_sample_false: Optional[bool] = False,
     ) -> ModelOutput:
         output_strs = []
         top_input_logprobs = []
@@ -407,7 +411,8 @@ class HFRunner:
             )
         else:
             model = base_model
-
+        if patch_model_do_sample_false:
+            model.generation_config.do_sample = False
         outputs = model.generate(
             input_ids=input_ids,
             generation_config=GenerationConfig(
@@ -481,7 +486,7 @@ class SRTRunner:
         torch_dtype: torch.dtype,
         model_type: str,
         tp_size: int = 1,
-
+        model_impl: str = "auto",
         port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
         lora_paths: List[str] = None,
         max_loras_per_batch: int = 4,
@@ -505,6 +510,9 @@ class SRTRunner:
         torchao_config: Optional[str] = None,
         cuda_graph_max_bs: int = 4,
         sleep_on_idle=False,
+        max_lora_rank: Optional[int] = None,
+        lora_target_modules: Optional[List[str]] = None,
+        enable_lora: Optional[bool] = None,
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
@@ -523,7 +531,7 @@ class SRTRunner:
             tp_size=tp_size,
             dtype=get_dtype_str(torch_dtype),
             port=port,
-
+            model_impl=model_impl,
             torchao_config=torchao_config,
             mem_fraction_static=mem_fraction_static,
             trust_remote_code=trust_remote_code,
@@ -543,6 +551,9 @@ class SRTRunner:
             cuda_graph_max_bs=cuda_graph_max_bs,
             disable_custom_all_reduce=disable_custom_all_reduce,
             sleep_on_idle=sleep_on_idle,
+            max_lora_rank=max_lora_rank,
+            lora_target_modules=lora_target_modules,
+            enable_lora=enable_lora,
             **spec_kwargs,
         )

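`patch_model_do_sample_false` threads through HFRunner so that the reference model's own `generation_config` has sampling disabled before `generate()` runs. A standalone Hugging Face snippet showing the same patch outside sglang; the tiny model name is only a placeholder for illustration:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

name = "sshleifer/tiny-gpt2"  # placeholder model, not part of the diff
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name)

# The patch from the hunk: force greedy decoding at the model-config level so
# generate() does not fall back to the checkpoint's sampling defaults.
model.generation_config.do_sample = False

input_ids = tokenizer("hello world", return_tensors="pt").input_ids
output = model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(max_new_tokens=8, do_sample=False),
)
print(tokenizer.decode(output[0]))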
sglang/test/test_block_fp8.py
CHANGED
@@ -6,6 +6,7 @@ import torch

 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.fp8_kernel import (
     per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
@@ -497,13 +498,17 @@ class TestW8A8BlockFP8FusedMoE(CustomTestCase):
         score = torch.randn((M, E), dtype=dtype)

         with torch.inference_mode():
+            topk_output = select_experts(
+                hidden_states=a,
+                router_logits=score,
+                top_k=topk,
+                renormalize=False,
+            )
             out = fused_moe(
                 a,
                 w1,
                 w2,
-                score,
-                topk,
-                renormalize=False,
+                topk_output,
                 use_fp8_w8a8=True,
                 w1_scale=w1_s,
                 w2_scale=w2_s,
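These test diffs reflect the routing API change in sglang.srt.layers.moe.topk: `select_experts` now returns a single routing result that the Triton `fused_moe` consumes directly, and call sites that still want raw tensors unpack three values instead of two (see the `topk_weights, topk_ids, _ = ...` changes in the EP, w4a8, and fp4 tests below). A rough standalone sketch of that calling convention; the field names and the routing math are illustrative, not sglang's implementation:

from typing import NamedTuple

import torch


class TopKOutputSketch(NamedTuple):
    # Only the three-way unpacking is visible in the diffs; these names are guesses.
    topk_weights: torch.Tensor
    topk_ids: torch.Tensor
    router_logits: torch.Tensor


def select_experts_sketch(hidden_states, router_logits, top_k, renormalize=False):
    probs = torch.softmax(router_logits.float(), dim=-1)
    topk_weights, topk_ids = torch.topk(probs, top_k, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return TopKOutputSketch(topk_weights, topk_ids, router_logits)


logits = torch.randn(4, 8)
routing = select_experts_sketch(None, logits, top_k=2)  # pass the whole object on
topk_weights, topk_ids, _ = routing                     # or unpack three values
assert topk_weights.shape == (4, 2) and topk_ids.shape == (4, 2)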
sglang/test/test_block_fp8_ep.py
CHANGED
@@ -40,7 +40,7 @@ def ep_moe(
     block_shape: Optional[List[int]] = None,
 ):
     use_blockwise_fp8 = block_shape is not None
-    topk_weights, topk_ids = select_experts(
+    topk_weights, topk_ids, _ = select_experts(
         hidden_states=hidden_states,
         router_logits=router_logits,
         top_k=top_k,
sglang/test/test_custom_ops.py
CHANGED
@@ -3,8 +3,13 @@
 import pytest
 import torch

-from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
-from sglang.srt.utils import is_cuda
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_fp8_fnuz = is_fp8_fnuz()
+fp8_dtype = torch.float8_e4m3fnuz if _is_fp8_fnuz else torch.float8_e4m3fn


 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@@ -13,10 +18,10 @@ def test_scaled_fp8_quant_per_tensor(dtype) -> None:
     def quantize_ref_per_tensor(tensor, inv_scale):
         # The reference implementation that fully aligns to
         # the kernel being tested.
-        finfo = torch.finfo(
+        finfo = torch.finfo(fp8_dtype)
         scale = inv_scale.reciprocal()
         qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
-        qweight = qweight.to(
+        qweight = qweight.to(fp8_dtype)
         return qweight

     def dequantize_per_tensor(tensor, inv_scale, dtype):
@@ -48,19 +53,19 @@ def test_scaled_fp8_quant_per_tensor(dtype) -> None:
     )


-if
+if _is_cuda or _is_hip:

     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
     def test_scaled_fp8_quant_per_token_dynamic(dtype) -> None:
         def quantize_ref_per_token(tensor, inv_scale):
             # The reference implementation that fully aligns to
             # the kernel being tested.
-            finfo = torch.finfo(
+            finfo = torch.finfo(fp8_dtype)
             scale = inv_scale.reciprocal()
             qweight = (tensor.to(torch.float32) * scale).clamp(
                 min=finfo.min, max=finfo.max
             )
-            qweight = qweight.to(
+            qweight = qweight.to(fp8_dtype)
             return qweight

         def dequantize_per_token(tensor, inv_scale, dtype):
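The test now selects its FP8 dtype via `is_fp8_fnuz()`, so ROCm devices that use the `e4m3fnuz` encoding get `torch.float8_e4m3fnuz` while CUDA keeps `torch.float8_e4m3fn`. A small standalone check of that switch (requires a PyTorch build with FP8 dtypes); the boolean flag stands in for the platform probe:

import torch


def pick_fp8_dtype(is_fnuz_platform: bool) -> torch.dtype:
    # Mirrors the module-level `fp8_dtype = ... if _is_fp8_fnuz else ...` line above.
    return torch.float8_e4m3fnuz if is_fnuz_platform else torch.float8_e4m3fn


finfo = torch.finfo(pick_fp8_dtype(False))
print(finfo.min, finfo.max)  # float8_e4m3fn spans roughly [-448, 448]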
sglang/test/test_cutlass_w4a8_moe.py
CHANGED
@@ -100,12 +100,10 @@ def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
     s_strides2 = c_strides2

     score = torch.randn((M, E), dtype=dtype, device=device)
-    topk_weights, topk_ids = select_experts(
+    topk_weights, topk_ids, _ = select_experts(
         hidden_states=a,
         router_logits=score,
         top_k=topk,
-        use_grouped_topk=False,
-        renormalize=False,
     )
     expert_map = torch.arange(E, dtype=torch.int32, device=device)
     expert_map[local_e:] = E
sglang/test/test_fp4_moe.py
CHANGED
@@ -159,12 +159,10 @@ def test_cutlass_fp4_moe_no_graph(

     score = torch.randn((m, e), device="cuda", dtype=dtype)

-    topk_weights, topk_ids = select_experts(
+    topk_weights, topk_ids, _ = select_experts(
         hidden_states=a,
         router_logits=score,
         top_k=topk,
-        use_grouped_topk=False,
-        renormalize=False,
     )

     a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
sglang/test/test_marlin_moe.py
ADDED
@@ -0,0 +1,286 @@
+import types
+from typing import Optional
+
+import pytest
+import torch
+from sgl_kernel import fused_marlin_moe
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.quantization.scalar_type import ScalarType, scalar_types
+from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize
+
+
+def stack_and_dev(tensors: list[torch.Tensor]):
+    dev = tensors[0].device
+    return torch.stack(tensors, dim=0).to(dev)
+
+
+def torch_experts(
+    a: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weight: torch.Tensor,
+    topk_ids: torch.Tensor,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    quant_dtype: Optional[torch.dtype] = None,
+    apply_router_weights_on_input: bool = False,
+) -> torch.Tensor:
+    assert (
+        global_num_experts == -1
+        or (global_num_experts == w1.shape[0] and expert_map is None)
+        or (expert_map is not None and global_num_experts == expert_map.shape[0])
+    )
+
+    M, K = a.shape
+    topk = topk_ids.shape[1]
+    print("quant_dtype", quant_dtype)
+    # exit(0)
+    if apply_router_weights_on_input:
+        assert topk == 1
+        a = a * topk_weight.to(a.dtype)
+
+    a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
+
+    out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+
+    num_experts = w1.shape[0]
+
+    topk_ids = topk_ids.view(-1)
+    if expert_map is not None:
+        topk_ids = expert_map[topk_ids]
+
+    f32 = torch.float32
+
+    for i in range(num_experts):
+        mask = topk_ids == i
+        if mask.sum():
+            if quant_dtype is None:
+                tmp1 = a[mask] @ w1[i].transpose(0, 1)
+                tmp2 = SiluAndMul()(tmp1)
+                out[mask] = tmp2 @ w2[i].transpose(0, 1)
+
+    if apply_router_weights_on_input:
+        return out
+    else:
+        return (
+            (out.view(M, -1, w2.shape[1]).to(f32) * topk_weight.view(M, -1, 1))
+            .sum(dim=1)
+            .to(out.dtype)
+        )
+
+
+def torch_moe(
+    a: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    score: torch.Tensor,
+    topk: int,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    return torch_experts(
+        a, w1, w2, topk_weight, topk_ids, global_num_experts, expert_map
+    )
+
+
+def marlin_moe_generate_valid_test_cases():
+    import itertools
+
+    m_list = [1, 123, 666]
+    n_list = [128, 1024]
+    k_list = [256, 2048]
+    e_list = [4, 12]
+    topk_list = [2, 3]
+    dtype_list = [torch.half, torch.bfloat16]
+    group_size_list = [128]
+    act_order_list = [True, False]
+    quant_type_list = [
+        scalar_types.uint4,
+        scalar_types.uint4b8,
+    ]
+    is_k_full_list = [True, False]
+
+    all_combinations = itertools.product(
+        m_list,
+        n_list,
+        k_list,
+        e_list,
+        topk_list,
+        dtype_list,
+        group_size_list,
+        act_order_list,
+        quant_type_list,
+        is_k_full_list,
+    )
+
+    def is_invalid(
+        m, n, k, e, topk, dtype, group_size, act_order, quant_type, is_k_full
+    ):
+
+        # Filter act_order
+        if act_order:
+            if group_size in (-1, k, n):
+                return False
+            if quant_type not in [scalar_types.uint4b8]:
+                return False
+        elif not is_k_full:
+            return False
+
+        return True
+
+    cases = []
+    for case in all_combinations:
+        if is_invalid(*case):
+            cases.append(case)
+    return cases
+
+
+@pytest.mark.flaky(reruns=2)
+@pytest.mark.parametrize(
+    ("m, n, k, e, topk, dtype, group_size," "act_order, quant_type, is_k_full"),
+    marlin_moe_generate_valid_test_cases(),
+)
+def test_fused_marlin_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    group_size: int,
+    act_order: bool,
+    quant_type: ScalarType,
+    is_k_full: bool,
+):
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA device not available")
+
+    torch.manual_seed(0)
+
+    has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
+
+    # Filter act_order
+    if act_order:
+        if group_size == -1:
+            return
+        if group_size in (k, n):
+            return
+        if has_zp:
+            return
+    else:
+        if not is_k_full:
+            return
+
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 20
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 20
+
+    e_map = None
+
+    w_ref1_l = []
+    qweight1_l = []
+    scales1_l = []
+    zeros1_l = []
+    g_idx1_l = []
+    sort_indices1_l = []
+
+    for i in range(w1.shape[0]):
+        if has_zp:
+            w_ref1, qweight1, scales1, zeros1 = awq_marlin_quantize(
+                w1[i].transpose(1, 0), quant_type, group_size
+            )
+
+            w_ref1_l.append(w_ref1.T)
+            qweight1_l.append(qweight1)
+            scales1_l.append(scales1)
+            zeros1_l.append(zeros1)
+        else:
+            test_perm = torch.randperm(k)
+            w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize(
+                w1[i].transpose(1, 0), quant_type, group_size, act_order, test_perm
+            )
+
+            w_ref1_l.append(w_ref1.T)
+            qweight1_l.append(qweight1)
+            scales1_l.append(scales1)
+            g_idx1_l.append(g_idx1)
+            sort_indices1_l.append(sort_indices1)
+
+    w_ref1 = stack_and_dev(w_ref1_l)
+    qweight1 = stack_and_dev(qweight1_l).contiguous()
+    scales1 = stack_and_dev(scales1_l)
+    g_idx1 = stack_and_dev(g_idx1_l) if g_idx1_l else None
+    zeros1 = stack_and_dev(zeros1_l) if zeros1_l else None
+    sort_indices1 = stack_and_dev(sort_indices1_l) if sort_indices1_l else None
+
+    w_ref2_l = []
+    qweight2_l = []
+    scales2_l = []
+    zeros2_l = []
+    g_idx2_l = []
+    sort_indices2_l = []
+
+    for i in range(w2.shape[0]):
+        if has_zp:
+            w_ref2, qweight2, scales2, zeros2 = awq_marlin_quantize(
+                w2[i].transpose(1, 0), quant_type, group_size
+            )
+
+            w_ref2_l.append(w_ref2.T)
+            qweight2_l.append(qweight2)
+            scales2_l.append(scales2)
+            zeros2_l.append(zeros2)
+        else:
+            test_perm = torch.randperm(n)
+            w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize(
+                w2[i].transpose(1, 0), quant_type, group_size, act_order, test_perm
+            )
+
+            w_ref2_l.append(w_ref2.T)
+            qweight2_l.append(qweight2)
+            scales2_l.append(scales2)
+            g_idx2_l.append(g_idx2)
+            sort_indices2_l.append(sort_indices2)
+
+    w_ref2 = stack_and_dev(w_ref2_l)
+    qweight2 = stack_and_dev(qweight2_l).contiguous()
+    scales2 = stack_and_dev(scales2_l)
+    g_idx2 = stack_and_dev(g_idx2_l) if g_idx2_l else None
+    zeros2 = stack_and_dev(zeros2_l) if zeros2_l else None
+    sort_indices2 = stack_and_dev(sort_indices2_l) if sort_indices2_l else None
+
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    from sglang.srt.layers.moe.topk import fused_topk_torch_native
+
+    topk_weights, topk_ids = fused_topk_torch_native(a, score, topk, False)
+
+    torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, expert_map=e_map)
+
+    marlin_output = fused_marlin_moe(
+        a,
+        qweight1,
+        qweight2,
+        scales1,
+        scales2,
+        score,
+        topk_weights,
+        topk_ids,
+        g_idx1=g_idx1,
+        g_idx2=g_idx2,
+        sort_indices1=sort_indices1,
+        sort_indices2=sort_indices2,
+        w1_zeros=zeros1,
+        w2_zeros=zeros2,
+        num_bits=4,
+        is_k_full=is_k_full,
+    )
+
+    torch.testing.assert_close(marlin_output, torch_output, atol=5e-2, rtol=0)
+
+
+if __name__ == "__main__":
+    # Run the specific test function directly
+    pytest.main([__file__])
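The new test's `torch_moe` reference routes tokens with a plain softmax-plus-top-k before comparing against `fused_marlin_moe`. A tiny CPU-only check of that routing step, independent of the Marlin kernels:

import torch

score = torch.randn(4, 8)                     # (tokens, experts) router logits
probs = torch.softmax(score, dim=-1, dtype=torch.float32)
topk_weight, topk_ids = torch.topk(probs, 2)  # per-token expert weights and ids
assert topk_weight.shape == (4, 2) and topk_ids.shape == (4, 2)
assert torch.all(topk_ids < 8)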