sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +119 -17
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +42 -7
  6. sglang/srt/conversation.py +9 -5
  7. sglang/srt/disaggregation/base/conn.py +5 -2
  8. sglang/srt/disaggregation/decode.py +14 -4
  9. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  10. sglang/srt/disaggregation/mooncake/conn.py +286 -160
  11. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  12. sglang/srt/disaggregation/prefill.py +2 -0
  13. sglang/srt/distributed/parallel_state.py +15 -11
  14. sglang/srt/entrypoints/context.py +227 -0
  15. sglang/srt/entrypoints/engine.py +15 -9
  16. sglang/srt/entrypoints/harmony_utils.py +372 -0
  17. sglang/srt/entrypoints/http_server.py +74 -4
  18. sglang/srt/entrypoints/openai/protocol.py +218 -1
  19. sglang/srt/entrypoints/openai/serving_chat.py +41 -11
  20. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  21. sglang/srt/entrypoints/openai/tool_server.py +175 -0
  22. sglang/srt/entrypoints/tool.py +87 -0
  23. sglang/srt/eplb/expert_location.py +5 -1
  24. sglang/srt/function_call/ebnf_composer.py +1 -0
  25. sglang/srt/function_call/function_call_parser.py +2 -0
  26. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  27. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  28. sglang/srt/function_call/kimik2_detector.py +3 -3
  29. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  30. sglang/srt/hf_transformers_utils.py +30 -3
  31. sglang/srt/jinja_template_utils.py +14 -1
  32. sglang/srt/layers/attention/aiter_backend.py +375 -115
  33. sglang/srt/layers/attention/ascend_backend.py +3 -0
  34. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  35. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  36. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  37. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  38. sglang/srt/layers/attention/triton_backend.py +85 -14
  39. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  41. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  42. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  43. sglang/srt/layers/attention/vision.py +22 -6
  44. sglang/srt/layers/attention/wave_backend.py +627 -0
  45. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  46. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  47. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  48. sglang/srt/layers/communicator.py +29 -14
  49. sglang/srt/layers/dp_attention.py +12 -0
  50. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  51. sglang/srt/layers/linear.py +3 -7
  52. sglang/srt/layers/moe/cutlass_moe.py +12 -3
  53. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  54. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  55. sglang/srt/layers/moe/ep_moe/layer.py +135 -73
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  59. sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
  60. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  61. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  62. sglang/srt/layers/moe/topk.py +16 -4
  63. sglang/srt/layers/moe/utils.py +16 -0
  64. sglang/srt/layers/quantization/__init__.py +27 -3
  65. sglang/srt/layers/quantization/fp4.py +557 -0
  66. sglang/srt/layers/quantization/fp8.py +3 -6
  67. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  68. sglang/srt/layers/quantization/fp8_utils.py +51 -10
  69. sglang/srt/layers/quantization/modelopt_quant.py +258 -68
  70. sglang/srt/layers/quantization/mxfp4.py +654 -0
  71. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  72. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  73. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  74. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  75. sglang/srt/layers/quantization/quark/utils.py +107 -0
  76. sglang/srt/layers/quantization/unquant.py +60 -6
  77. sglang/srt/layers/quantization/w4afp8.py +21 -12
  78. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  79. sglang/srt/layers/rotary_embedding.py +506 -3
  80. sglang/srt/layers/utils.py +9 -0
  81. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  82. sglang/srt/lora/backend/base_backend.py +3 -23
  83. sglang/srt/lora/layers.py +60 -114
  84. sglang/srt/lora/lora.py +17 -62
  85. sglang/srt/lora/lora_manager.py +82 -62
  86. sglang/srt/lora/lora_registry.py +23 -11
  87. sglang/srt/lora/mem_pool.py +63 -68
  88. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  89. sglang/srt/lora/utils.py +25 -58
  90. sglang/srt/managers/cache_controller.py +75 -58
  91. sglang/srt/managers/detokenizer_manager.py +1 -1
  92. sglang/srt/managers/io_struct.py +20 -8
  93. sglang/srt/managers/mm_utils.py +6 -13
  94. sglang/srt/managers/multimodal_processor.py +1 -1
  95. sglang/srt/managers/schedule_batch.py +61 -25
  96. sglang/srt/managers/schedule_policy.py +6 -6
  97. sglang/srt/managers/scheduler.py +41 -19
  98. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  99. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  100. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  101. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  102. sglang/srt/managers/template_manager.py +35 -1
  103. sglang/srt/managers/tokenizer_manager.py +47 -30
  104. sglang/srt/managers/tp_worker.py +3 -0
  105. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  106. sglang/srt/mem_cache/allocator.py +61 -87
  107. sglang/srt/mem_cache/hicache_storage.py +1 -1
  108. sglang/srt/mem_cache/hiradix_cache.py +80 -22
  109. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  110. sglang/srt/mem_cache/memory_pool_host.py +34 -36
  111. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  112. sglang/srt/mem_cache/radix_cache.py +2 -5
  113. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  114. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  115. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  116. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  117. sglang/srt/model_executor/cuda_graph_runner.py +29 -9
  118. sglang/srt/model_executor/forward_batch_info.py +61 -19
  119. sglang/srt/model_executor/model_runner.py +148 -37
  120. sglang/srt/model_loader/loader.py +18 -6
  121. sglang/srt/model_loader/weight_utils.py +10 -0
  122. sglang/srt/models/bailing_moe.py +425 -0
  123. sglang/srt/models/deepseek_v2.py +137 -59
  124. sglang/srt/models/ernie4.py +426 -0
  125. sglang/srt/models/ernie4_eagle.py +203 -0
  126. sglang/srt/models/gemma2.py +0 -34
  127. sglang/srt/models/gemma3n_mm.py +38 -0
  128. sglang/srt/models/glm4.py +6 -0
  129. sglang/srt/models/glm4_moe.py +28 -16
  130. sglang/srt/models/glm4v.py +589 -0
  131. sglang/srt/models/glm4v_moe.py +400 -0
  132. sglang/srt/models/gpt_oss.py +1251 -0
  133. sglang/srt/models/granite.py +0 -25
  134. sglang/srt/models/llama.py +0 -25
  135. sglang/srt/models/llama4.py +1 -1
  136. sglang/srt/models/qwen2.py +6 -0
  137. sglang/srt/models/qwen2_5_vl.py +7 -3
  138. sglang/srt/models/qwen2_audio.py +10 -9
  139. sglang/srt/models/qwen2_moe.py +6 -0
  140. sglang/srt/models/qwen3.py +0 -24
  141. sglang/srt/models/qwen3_moe.py +32 -6
  142. sglang/srt/models/registry.py +1 -1
  143. sglang/srt/models/step3_vl.py +9 -0
  144. sglang/srt/models/torch_native_llama.py +0 -24
  145. sglang/srt/models/transformers.py +2 -5
  146. sglang/srt/multimodal/processors/base_processor.py +23 -13
  147. sglang/srt/multimodal/processors/glm4v.py +132 -0
  148. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  149. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  150. sglang/srt/reasoning_parser.py +332 -37
  151. sglang/srt/server_args.py +186 -75
  152. sglang/srt/speculative/eagle_worker.py +16 -0
  153. sglang/srt/two_batch_overlap.py +169 -9
  154. sglang/srt/utils.py +41 -5
  155. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  156. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  157. sglang/test/doc_patch.py +59 -0
  158. sglang/test/few_shot_gsm8k.py +1 -1
  159. sglang/test/few_shot_gsm8k_engine.py +1 -1
  160. sglang/test/run_eval.py +4 -1
  161. sglang/test/runners.py +2 -2
  162. sglang/test/simple_eval_common.py +6 -0
  163. sglang/test/simple_eval_gpqa.py +2 -0
  164. sglang/test/test_fp4_moe.py +118 -36
  165. sglang/test/test_utils.py +1 -1
  166. sglang/utils.py +1 -1
  167. sglang/version.py +1 -1
  168. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
  169. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
  170. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  171. /sglang/{api.py → lang/api.py} +0 -0
  172. /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
  173. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  174. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  175. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/test/attention/test_trtllm_mla_backend.py CHANGED
@@ -43,6 +43,37 @@ DEFAULT_CONFIG = {
     "layer_id": 0,
 }
 
+ROPE_BASE = 10000
+ROPE_SCALING_CONFIG = {
+    "beta_fast": 32,
+    "beta_slow": 1,
+    "factor": 40,
+    "mscale": 1.0,
+    "mscale_all_dim": 1.0,
+    "original_max_position_embeddings": 4096,
+    "type": "yarn",
+    "rope_type": "deepseek_yarn",
+}
+
+
+def build_rotary_emb(config, device=None):
+    from sglang.srt.layers.rotary_embedding import get_rope_wrapper
+
+    dev = device or config["device"]
+    rope_scaling = config.get("rope_scaling", ROPE_SCALING_CONFIG)
+    rotary = get_rope_wrapper(
+        head_size=config["qk_rope_head_dim"],
+        rotary_dim=config["qk_rope_head_dim"],
+        max_position=config["context_len"],
+        base=ROPE_BASE,
+        rope_scaling=rope_scaling,
+        is_neox_style=False,
+        device=dev,
+    )
+    rotary.cos_sin_cache = rotary.cos_sin_cache.to(dev)
+    return rotary
+
+
 # Centralized test cases for different test scenarios
 TEST_CASES = {
     "basic_functionality": [
@@ -63,18 +94,36 @@ TEST_CASES = {
     ],
     "decode_output_match": [
         {
-            "name": "single",
+            "name": "single_fp16",
             "batch_size": 1,
             "max_seq_len": 64,
             "page_size": 32,
-            "description": "Single vs reference",
+            "description": "Single FP16 vs reference",
         },
         {
-            "name": "batch",
+            "name": "single_fp8",
+            "batch_size": 1,
+            "max_seq_len": 64,
+            "page_size": 64,
+            "tolerance": 1e-1,
+            "kv_cache_dtype": torch.float8_e4m3fn,
+            "description": "Single FP8 vs reference",
+        },
+        {
+            "name": "batch_fp16",
             "batch_size": 32,
             "max_seq_len": 64,
             "page_size": 32,
-            "description": "Batch vs reference",
+            "description": "Batch FP16 vs reference",
+        },
+        {
+            "name": "batch_fp8",
+            "batch_size": 32,
+            "max_seq_len": 64,
+            "page_size": 64,
+            "tolerance": 1e-1,
+            "kv_cache_dtype": torch.float8_e4m3fn,
+            "description": "Batch FP8 vs reference",
         },
     ],
     "page_size_consistency": [
@@ -293,26 +342,52 @@ class TestTRTLLMMLA(CustomTestCase):
             layer,
         )
 
-    def _create_qkv_tensors(self, batch_size, config):
-        """Create Q, K, V tensors for testing."""
-        head_dim = config["kv_lora_rank"] + config["qk_rope_head_dim"]
+    def _create_qkv_tensors(self, batch_size, config, dtype_override=None):
+        """Create Q, K, V random tensors for given batch size with separate MLA components.
+
+        Args:
+            batch_size: Batch size.
+            config: Configuration dict with model dims and device.
+            dtype_override: Optional torch dtype to override config["dtype"].
+
+        Returns:
+            Tuple of (q_nope, q_rope, k_nope, k_rope, v, cos_sin_cache)
+        """
         device = config["device"]
-        dtype = config["dtype"]
+        target_dtype = dtype_override or config["dtype"]
 
-        q = torch.randn(
-            (batch_size, config["num_attention_heads"], head_dim),
-            dtype=dtype,
+        # Create separate nope and rope components for Q
+        q_nope = torch.randn(
+            (batch_size, config["num_attention_heads"], config["kv_lora_rank"]),
+            dtype=config["dtype"],
             device=device,
         )
-        k = torch.randn(
-            (batch_size, config["num_kv_heads"], head_dim), dtype=dtype, device=device
+        q_rope = torch.randn(
+            (batch_size, config["num_attention_heads"], config["qk_rope_head_dim"]),
+            dtype=config["dtype"],
+            device=device,
+        )
+
+        # Create separate nope and rope components for K
+        k_nope = torch.randn(
+            (batch_size, config["num_kv_heads"], config["kv_lora_rank"]),
+            dtype=config["dtype"],
+            device=device,
+        )
+        k_rope = torch.randn(
+            (batch_size, config["num_kv_heads"], config["qk_rope_head_dim"]),
+            dtype=config["dtype"],
+            device=device,
         )
+
+        # V tensor (unchanged)
         v = torch.randn(
             (batch_size, config["num_kv_heads"], config["v_head_dim"]),
-            dtype=dtype,
+            dtype=config["dtype"],
             device=device,
         )
-        return q, k, v
+
+        return q_nope, q_rope, k_nope, k_rope, v
 
     def _create_forward_batch(
         self, batch_size, seq_lens, backend, model_runner, config
@@ -331,6 +406,10 @@ class TestTRTLLMMLA(CustomTestCase):
         )
         fb.req_to_token_pool = model_runner.req_to_token_pool
         fb.token_to_kv_pool = model_runner.token_to_kv_pool
+
+        # Add position information for RoPE
+        fb.positions = torch.arange(batch_size, device=config["device"])
+
         return fb
 
     def _populate_kv_cache(self, batch_size, seq_lens, model_runners, layer, config):
@@ -344,7 +423,7 @@ class TestTRTLLMMLA(CustomTestCase):
             for token_idx in range(seq_len - 1):
                 # Create random K components for MLA
                 cache_k_nope = torch.randn(
-                    (1, config["qk_nope_head_dim"]),
+                    (1, config["kv_lora_rank"]),
                     dtype=config["dtype"],
                     device=config["device"],
                 )
@@ -411,12 +490,16 @@ class TestTRTLLMMLA(CustomTestCase):
             batch_size, seq_lens, [model_runner_trtllm], layer, config
         )
 
-        # Create Q, K, V tensors
+        # Create Q, K, V tensors with separate MLA components
         torch.manual_seed(config["seed_qkv"])
-        q, k, v = self._create_qkv_tensors(batch_size, config)
+        q_nope, q_rope, k_nope, k_rope, v = self._create_qkv_tensors(
+            batch_size, config
+        )
 
-        # Run forward decode
-        output = trtllm_backend.forward_decode(q, k, v, layer, fb)
+        # Run forward decode with separate MLA components
+        output = trtllm_backend.forward_decode(
+            q_nope, k_nope, None, layer, fb, q_rope=q_rope, k_rope=k_rope
+        )
 
         # Basic checks
         expected_shape = (
@@ -439,6 +522,7 @@ class TestTRTLLMMLA(CustomTestCase):
         config = self._merge_config(test_case)
         batch_size = config["batch_size"]
         max_seq_len = config["max_seq_len"]
+        use_fp8 = config["kv_cache_dtype"] == torch.float8_e4m3fn
 
         # Create components
         (
@@ -487,19 +571,66 @@ class TestTRTLLMMLA(CustomTestCase):
 
         # Create Q, K, V tensors for current decode step
         torch.manual_seed(config["seed_qkv"])
-        q, k, v = self._create_qkv_tensors(batch_size, config)
+
+        q_nope_ref, q_rope_ref, k_nope_ref, k_rope_ref, v_ref = (
+            self._create_qkv_tensors(batch_size, config)
+        )
+        q_nope_trt, q_rope_trt, k_nope_trt, k_rope_trt, v_trt = (
+            q_nope_ref.clone(),
+            q_rope_ref.clone(),
+            k_nope_ref.clone(),
+            k_rope_ref.clone(),
+            v_ref.clone(),
+        )
+        tolerance = config["tolerance"]
+
+        extra_args = {}
+        if use_fp8:
+            # TRT kernel applies RoPE + FP8 quantization internally
+            # pre-apply RoPE on the reference (FlashInfer) path here so
+            # both paths share the same rope params/cache while keeping
+            # the TRT path unrotated.
+            rotary_emb = build_rotary_emb(config)
+            q_rope_ref, k_rope_ref = rotary_emb(
+                fb_reference.positions, q_rope_ref, k_rope_ref
+            )
+            extra_args = {
+                "cos_sin_cache": rotary_emb.cos_sin_cache,
+                "is_neox": rotary_emb.is_neox_style,
+            }
+
+            dtype = q_rope_ref.dtype
+            q_rope_ref = q_rope_ref.to(torch.float8_e4m3fn).to(dtype)
+            q_nope_ref = q_nope_ref.to(torch.float8_e4m3fn).to(dtype)
+            k_rope_ref = k_rope_ref.to(torch.float8_e4m3fn).to(dtype)
+            k_nope_ref = k_nope_ref.to(torch.float8_e4m3fn).to(dtype)
 
         # Run forward decode on both backends
         out_trtllm = trtllm_backend.forward_decode(
-            q.clone(), k.clone(), v.clone(), layer, fb_trtllm
+            q_nope_trt,
+            k_nope_trt,
+            None,
+            layer,
+            fb_trtllm,
+            q_rope=q_rope_trt,
+            k_rope=k_rope_trt,
+            **extra_args,
         )
+
+        # Reference backend should also take separate components, not concatenated
         out_reference = reference_backend.forward_decode(
-            q.clone(), k.clone(), v.clone(), layer, fb_reference
+            q_nope_ref,
+            k_nope_ref,
+            v_ref,
+            layer,
+            fb_reference,
+            q_rope=q_rope_ref,
+            k_rope=k_rope_ref,
         )
 
         # Compare outputs
         comparison_passed = compare_outputs(
-            out_trtllm, out_reference, tolerance=config["tolerance"]
+            out_trtllm, out_reference, tolerance=tolerance
         )
 
         self.assertTrue(
@@ -544,12 +675,16 @@ class TestTRTLLMMLA(CustomTestCase):
             batch_size, seq_lens, [model_runner], layer, config
         )
 
-        # Create Q, K, V tensors
+        # Create Q, K, V tensors with separate MLA components
         torch.manual_seed(config["seed_qkv"])
-        q, k, v = self._create_qkv_tensors(batch_size, config)
+        q_nope, q_rope, k_nope, k_rope, v = self._create_qkv_tensors(
+            batch_size, config
+        )
 
-        # Run forward decode
-        output = backend.forward_decode(q, k, v, layer, fb)
+        # Run forward decode with separate MLA components
+        output = backend.forward_decode(
+            q_nope, k_nope, None, layer, fb, q_rope=q_rope, k_rope=k_rope
+        )
 
         expected_shape = (
             batch_size,
@@ -591,23 +726,38 @@ class TestTRTLLMMLA(CustomTestCase):
         )
         backend.init_forward_metadata(fb)
 
-        # Create Q, K, V tensors
+        # Create Q, K, V tensors with separate MLA components
         torch.manual_seed(config["seed_qkv"])
-        head_dim = config["kv_lora_rank"] + config["qk_rope_head_dim"]
-        q = torch.randn(
-            (batch_size, config["num_attention_heads"], head_dim),
+        q_nope = torch.randn(
+            (batch_size, config["num_attention_heads"], config["kv_lora_rank"]),
             dtype=config["dtype"],
             device=config["device"],
         )
-        k = torch.randn(
-            (batch_size, config["num_kv_heads"], head_dim),
+        k_nope = torch.randn(
+            (batch_size, config["num_kv_heads"], config["kv_lora_rank"]),
             dtype=config["dtype"],
             device=config["device"],
         )
-        v = None
+        q_rope = torch.randn(
+            (
+                batch_size,
+                config["num_attention_heads"],
+                config["qk_rope_head_dim"],
+            ),
+            dtype=config["dtype"],
+            device=config["device"],
+        )
+        k_rope = torch.randn(
+            (batch_size, config["num_kv_heads"], config["qk_rope_head_dim"]),
+            dtype=config["dtype"],
+            device=config["device"],
+        )
+        v = None  # Test with None v
 
         # Run forward decode
-        output = backend.forward_decode(q, k, v, layer, fb)
+        output = backend.forward_decode(
+            q_nope, k_nope, v, layer, fb, q_rope=q_rope, k_rope=k_rope
+        )
 
         # Shape and sanity checks
         expected_shape = (
sglang/test/doc_patch.py ADDED
@@ -0,0 +1,59 @@
+"""
+Do some monkey patch to make the documentation compilation faster and more reliable.
+
+- Avoid port conflicts
+- Reduce the server launch time
+"""
+
+import weakref
+
+import nest_asyncio
+
+nest_asyncio.apply()
+
+import sglang.srt.server_args as server_args_mod
+from sglang.utils import execute_shell_command, reserve_port
+
+DEFAULT_MAX_RUNNING_REQUESTS = 128
+DEFAULT_MAX_TOTAL_TOKENS = 20480  # To allow multiple servers on the same machine
+
+_original_post_init = server_args_mod.ServerArgs.__post_init__
+
+
+def patched_post_init(self):
+    _original_post_init(self)
+    if self.max_running_requests is None:
+        self.max_running_requests = DEFAULT_MAX_RUNNING_REQUESTS
+    if self.max_total_tokens is None:
+        self.max_total_tokens = DEFAULT_MAX_TOTAL_TOKENS
+    self.cuda_graph_max_bs = 4
+
+
+server_args_mod.ServerArgs.__post_init__ = patched_post_init
+
+process_socket_map = weakref.WeakKeyDictionary()
+
+
+def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
+    """
+    Launch the server using the given command.
+    If no port is specified, a free port is reserved.
+    """
+    if port is None:
+        port, lock_socket = reserve_port(host)
+    else:
+        lock_socket = None
+
+    extra_flags = (
+        f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} "
+        f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} "
+        f"--cuda-graph-max-bs 4"
+    )
+
+    full_command = f"{command} --port {port} {extra_flags}"
+    process = execute_shell_command(full_command)
+
+    if lock_socket is not None:
+        process_socket_map[process] = lock_socket
+
+    return process, port
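The new doc_patch module caps server resources and reserves ports so several documentation-build servers can coexist on one machine. A hedged usage sketch (not part of the diff; wait_for_server and terminate_process are assumed to be the existing helpers in sglang.utils, and the model path is only an example):

    from sglang.test.doc_patch import launch_server_cmd
    from sglang.utils import terminate_process, wait_for_server

    # The helper appends the doc-friendly flags and reserves a free port.
    process, port = launch_server_cmd(
        "python -m sglang.launch_server --model-path Qwen/Qwen2.5-1.5B-Instruct"
    )
    wait_for_server(f"http://localhost:{port}")
    terminate_process(process)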
sglang/test/few_shot_gsm8k.py CHANGED
@@ -12,7 +12,7 @@ import time
 
 import numpy as np
 
-from sglang.api import set_default_backend
+from sglang.lang.api import set_default_backend
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
 
sglang/test/few_shot_gsm8k_engine.py CHANGED
@@ -8,7 +8,7 @@ import time
 import numpy as np
 
 import sglang as sgl
-from sglang.api import set_default_backend
+from sglang.lang.api import set_default_backend
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
 
sglang/test/run_eval.py CHANGED
@@ -65,9 +65,10 @@ def run_eval(args):
 
     sampler = ChatCompletionSampler(
         model=args.model,
-        max_tokens=2048,
+        max_tokens=getattr(args, "max_tokens", 2048),
         base_url=base_url,
         temperature=getattr(args, "temperature", 0.0),
+        reasoning_effort=getattr(args, "reasoning_effort", None),
     )
 
     # Run eval
@@ -120,7 +121,9 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
    parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--max-tokens", type=int, default=2048)
     parser.add_argument("--temperature", type=float, default=0.0)
+    parser.add_argument("--reasoning-effort", type=str)
     args = parser.parse_args()
 
     run_eval(args)
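With the two new arguments wired into both the sampler and the argument parser, a run can now raise the completion budget and pass a reasoning effort, along the lines of the following (hedged sketch; the endpoint and model flags come from the unchanged part of the script and are omitted here):

    python -m sglang.test.run_eval --eval-name gpqa --max-tokens 4096 --reasoning-effort high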
sglang/test/runners.py CHANGED
@@ -568,8 +568,8 @@ class SRTRunner:
         else:
             self.tokenizer = None
 
-    def load_lora_adapter(self, lora_name: str, lora_path: str):
-        return self.engine.load_lora_adapter(lora_name, lora_path)
+    def load_lora_adapter(self, lora_name: str, lora_path: str, pinned: bool = False):
+        return self.engine.load_lora_adapter(lora_name, lora_path, pinned)
 
     def unload_lora_adapter(self, lora_name: str):
         return self.engine.unload_lora_adapter(lora_name)
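The extra argument is forwarded verbatim to the engine. A hedged sketch of the call on an already-constructed SRTRunner (names and path are placeholders):

    # 'pinned' is the new third argument forwarded to engine.load_lora_adapter
    runner.load_lora_adapter("my_adapter", "/path/to/my_adapter", pinned=True)
    runner.unload_lora_adapter("my_adapter")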
sglang/test/simple_eval_common.py CHANGED
@@ -91,6 +91,7 @@ class ChatCompletionSampler(SamplerBase):
         model: Optional[str] = None,
         system_message: Optional[str] = None,
         temperature: float = 0.0,
+        reasoning_effort: Optional[str] = None,
         max_tokens: int = 2048,
     ):
         self.client = OpenAI(base_url=base_url, http_client=LargerHttpxClient())
@@ -102,7 +103,11 @@ class ChatCompletionSampler(SamplerBase):
         self.system_message = system_message
         self.temperature = temperature
         self.max_tokens = max_tokens
+        self.reasoning_effort = reasoning_effort
         self.image_format = "url"
+        print(
+            f"ChatCompletionSampler initialized with {self.system_message=} {self.temperature=} {self.max_tokens=} {self.reasoning_effort=}"
+        )
 
     def _handle_image(
         self,
@@ -138,6 +143,7 @@ class ChatCompletionSampler(SamplerBase):
                     messages=message_list,
                     temperature=self.temperature,
                     max_tokens=self.max_tokens,
+                    reasoning_effort=self.reasoning_effort,
                 )
                 return response.choices[0].message.content
             # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
sglang/test/simple_eval_gpqa.py CHANGED
@@ -71,6 +71,8 @@ class GPQAEval(Eval):
                 )
             ]
             response_text = sampler(prompt_messages)
+            if response_text is None:
+                response_text = ""
             match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
             extracted_answer = match.group(1) if match else None
             score = 1.0 if extracted_answer == correct_answer else 0.0
sglang/test/test_fp4_moe.py CHANGED
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
+from typing import Callable
+
 import pytest
 import torch
+from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
 from sgl_kernel import scaled_fp4_quant
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -111,15 +114,16 @@ def torch_moe(a, w1, w2, score, topk, expert_map):
     ).sum(dim=1)
 
 
-@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
-@pytest.mark.parametrize("e", [40, 64, 256])
-@pytest.mark.parametrize("topk", [1, 6, 8])
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
-@torch.inference_mode()
-def test_cutlass_fp4_moe_no_graph(
-    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+def check_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    moe_impl: Callable,
+    flip_w13: bool,
 ):
-
     torch.manual_seed(7)
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
@@ -167,38 +171,18 @@ def test_cutlass_fp4_moe_no_graph(
 
     a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
     a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
-    # strides for the cutlass moe_fp4 kernel
-    ab_strides_13 = torch.full(
-        (e,), w1_q.shape[2] * 2, dtype=torch.int64, device=w1_q.device
-    )
-    c_strides_13 = torch.full(
-        (e,), w1_q.shape[1], dtype=torch.int64, device=w1_q.device
-    )
-    ab_strides_2 = torch.full(
-        (e,), w2_q.shape[2] * 2, dtype=torch.int64, device=w2_q.device
-    )
-    c_strides_2 = torch.full((e,), w2_q.shape[1], dtype=torch.int64, device=w2_q.device)
-    params = CutlassMoEParams(
-        CutlassMoEType.BlockscaledFP4,
-        device=a.device,
-        num_experts=e,
-        intermediate_size_per_partition=n,  # n
-        hidden_size=k,
-    )  # k
-    cutlass_output = cutlass_moe_fp4(
+    test_output = moe_impl(
         a=a,
-        a1_gscale=a1_gs,
-        w1_fp4=w1_q,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        w1_q=w1_q,
+        w2_q=w2_q,
+        a1_gs=a1_gs,
         w1_blockscale=w1_blockscale,
         w1_alphas=(1 / w1_gs),
-        a2_gscale=a2_gs,
-        w2_fp4=w2_q,
+        a2_gs=a2_gs,
         w2_blockscale=w2_blockscale,
         w2_alphas=(1 / w2_gs),
-        topk_weights=topk_weights,
-        topk_ids=topk_ids,
-        params=params,
-        apply_router_weight_on_input=False,
     )
 
     # Reference check:
@@ -237,10 +221,108 @@ def test_cutlass_fp4_moe_no_graph(
         block_size=quant_blocksize,
     )
 
+    if flip_w13:
+        dim = -2
+        size = w1_d.size(dim)
+        assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
+        half = size // 2
+        # Reorder weight
+        w1, w3 = w1_d.split(half, dim=dim)
+        w1_d = torch.cat([w3, w1], dim=dim).contiguous()
+
     torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk, None)
 
-    torch.testing.assert_close(torch_output, cutlass_output, atol=1e-1, rtol=1e-1)
+    torch.testing.assert_close(torch_output, test_output, atol=1e-1, rtol=1e-1)
+
+
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [40, 64, 256])
+@pytest.mark.parametrize("topk", [1, 6, 8])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@torch.inference_mode()
+def test_cutlass_fp4_moe_no_graph(
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+):
+    def cutlass_moe_impl(
+        a,
+        topk_weights,
+        topk_ids,
+        w1_q,
+        w2_q,
+        a1_gs,
+        w1_blockscale,
+        w1_alphas,
+        a2_gs,
+        w2_blockscale,
+        w2_alphas,
+    ):
+        params = CutlassMoEParams(
+            CutlassMoEType.BlockscaledFP4,
+            device=a.device,
+            num_experts=e,
+            intermediate_size_per_partition=n,  # n
+            hidden_size=k,
+        )  # k
+        return cutlass_moe_fp4(
+            a=a,
+            a1_gscale=a1_gs,
+            w1_fp4=w1_q,
+            w1_blockscale=w1_blockscale,
+            w1_alphas=w1_alphas,
+            a2_gscale=a2_gs,
+            w2_fp4=w2_q,
+            w2_blockscale=w2_blockscale,
+            w2_alphas=w2_alphas,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            params=params,
+            apply_router_weight_on_input=False,
+        )
+
+    check_moe(m, n, k, e, topk, dtype, cutlass_moe_impl, flip_w13=False)
+
+
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [40, 64, 256])
+@pytest.mark.parametrize("topk", [1, 6, 8])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@torch.inference_mode()
+def test_flashinfer_fp4_moe_no_graph(
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+):
+    def flashinfer_moe_impl(
+        a,
+        topk_weights,
+        topk_ids,
+        w1_q,
+        w2_q,
+        a1_gs,
+        w1_blockscale,
+        w1_alphas,
+        a2_gs,
+        w2_blockscale,
+        w2_alphas,
+    ):
+        return flashinfer_cutlass_fused_moe(
+            a,
+            topk_ids.to(torch.int),
+            topk_weights,
+            w1_q.view(torch.long),
+            w2_q.view(torch.long),
+            a.dtype,
+            quant_scales=[
+                a1_gs,
+                w1_blockscale.view(torch.int32),
+                w1_alphas,
+                a2_gs,
+                w2_blockscale.view(torch.int32),
+                w2_alphas,
+            ],
+        )[0]
+
+    check_moe(m, n, k, e, topk, dtype, flashinfer_moe_impl, flip_w13=True)
 
 
 if __name__ == "__main__":
     test_cutlass_fp4_moe_no_graph(224, 1024, 1024, 256, 8, torch.half)
+    test_flashinfer_fp4_moe_no_graph(224, 1024, 1024, 256, 8, torch.half)
sglang/test/test_utils.py CHANGED
@@ -83,7 +83,7 @@ DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,zai-org/GLM-4.5-Air-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"