sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +6 -1
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +8 -7
  6. sglang/srt/disaggregation/decode.py +8 -4
  7. sglang/srt/disaggregation/mooncake/conn.py +43 -25
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  9. sglang/srt/distributed/parallel_state.py +4 -2
  10. sglang/srt/entrypoints/context.py +3 -20
  11. sglang/srt/entrypoints/engine.py +13 -8
  12. sglang/srt/entrypoints/harmony_utils.py +2 -0
  13. sglang/srt/entrypoints/http_server.py +68 -5
  14. sglang/srt/entrypoints/openai/protocol.py +2 -9
  15. sglang/srt/entrypoints/openai/serving_chat.py +60 -265
  16. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +4 -3
  18. sglang/srt/function_call/ebnf_composer.py +1 -0
  19. sglang/srt/function_call/function_call_parser.py +2 -0
  20. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  21. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  22. sglang/srt/function_call/kimik2_detector.py +3 -3
  23. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  24. sglang/srt/jinja_template_utils.py +6 -0
  25. sglang/srt/layers/attention/aiter_backend.py +370 -107
  26. sglang/srt/layers/attention/ascend_backend.py +3 -0
  27. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  28. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  29. sglang/srt/layers/attention/flashinfer_backend.py +55 -13
  30. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
  31. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  32. sglang/srt/layers/attention/triton_backend.py +24 -27
  33. sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
  34. sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
  35. sglang/srt/layers/attention/vision.py +9 -1
  36. sglang/srt/layers/attention/wave_backend.py +627 -0
  37. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  38. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  39. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  40. sglang/srt/layers/communicator.py +11 -13
  41. sglang/srt/layers/dp_attention.py +118 -27
  42. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  43. sglang/srt/layers/linear.py +1 -0
  44. sglang/srt/layers/logits_processor.py +12 -18
  45. sglang/srt/layers/moe/cutlass_moe.py +11 -16
  46. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  47. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  48. sglang/srt/layers/moe/ep_moe/layer.py +60 -2
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
  63. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  64. sglang/srt/layers/moe/topk.py +4 -1
  65. sglang/srt/layers/multimodal.py +156 -40
  66. sglang/srt/layers/quantization/__init__.py +10 -35
  67. sglang/srt/layers/quantization/awq.py +15 -16
  68. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
  69. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  70. sglang/srt/layers/quantization/fp8_utils.py +22 -10
  71. sglang/srt/layers/quantization/gptq.py +12 -17
  72. sglang/srt/layers/quantization/marlin_utils.py +15 -5
  73. sglang/srt/layers/quantization/modelopt_quant.py +58 -41
  74. sglang/srt/layers/quantization/mxfp4.py +20 -3
  75. sglang/srt/layers/quantization/utils.py +52 -2
  76. sglang/srt/layers/quantization/w4afp8.py +20 -11
  77. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  78. sglang/srt/layers/rotary_embedding.py +281 -2
  79. sglang/srt/layers/sampler.py +5 -2
  80. sglang/srt/lora/backend/base_backend.py +3 -23
  81. sglang/srt/lora/layers.py +66 -116
  82. sglang/srt/lora/lora.py +17 -62
  83. sglang/srt/lora/lora_manager.py +12 -48
  84. sglang/srt/lora/lora_registry.py +20 -9
  85. sglang/srt/lora/mem_pool.py +20 -63
  86. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  87. sglang/srt/lora/utils.py +25 -58
  88. sglang/srt/managers/cache_controller.py +24 -29
  89. sglang/srt/managers/detokenizer_manager.py +1 -1
  90. sglang/srt/managers/io_struct.py +20 -6
  91. sglang/srt/managers/mm_utils.py +1 -2
  92. sglang/srt/managers/multimodal_processor.py +1 -1
  93. sglang/srt/managers/schedule_batch.py +43 -49
  94. sglang/srt/managers/schedule_policy.py +6 -6
  95. sglang/srt/managers/scheduler.py +18 -11
  96. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  97. sglang/srt/managers/tokenizer_manager.py +53 -44
  98. sglang/srt/mem_cache/allocator.py +39 -214
  99. sglang/srt/mem_cache/allocator_ascend.py +158 -0
  100. sglang/srt/mem_cache/chunk_cache.py +1 -1
  101. sglang/srt/mem_cache/hicache_storage.py +1 -1
  102. sglang/srt/mem_cache/hiradix_cache.py +34 -24
  103. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  104. sglang/srt/mem_cache/memory_pool_host.py +33 -35
  105. sglang/srt/mem_cache/radix_cache.py +2 -5
  106. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  107. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  108. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  109. sglang/srt/model_executor/cuda_graph_runner.py +29 -23
  110. sglang/srt/model_executor/forward_batch_info.py +33 -14
  111. sglang/srt/model_executor/model_runner.py +179 -81
  112. sglang/srt/model_loader/loader.py +18 -6
  113. sglang/srt/models/deepseek_nextn.py +2 -1
  114. sglang/srt/models/deepseek_v2.py +79 -38
  115. sglang/srt/models/gemma2.py +0 -34
  116. sglang/srt/models/gemma3n_mm.py +8 -9
  117. sglang/srt/models/glm4.py +6 -0
  118. sglang/srt/models/glm4_moe.py +11 -11
  119. sglang/srt/models/glm4_moe_nextn.py +2 -1
  120. sglang/srt/models/glm4v.py +589 -0
  121. sglang/srt/models/glm4v_moe.py +400 -0
  122. sglang/srt/models/gpt_oss.py +142 -20
  123. sglang/srt/models/granite.py +0 -25
  124. sglang/srt/models/llama.py +10 -27
  125. sglang/srt/models/llama4.py +19 -6
  126. sglang/srt/models/qwen2.py +2 -2
  127. sglang/srt/models/qwen2_5_vl.py +7 -3
  128. sglang/srt/models/qwen2_audio.py +10 -9
  129. sglang/srt/models/qwen2_moe.py +20 -5
  130. sglang/srt/models/qwen3.py +0 -24
  131. sglang/srt/models/qwen3_classification.py +78 -0
  132. sglang/srt/models/qwen3_moe.py +18 -5
  133. sglang/srt/models/registry.py +1 -1
  134. sglang/srt/models/step3_vl.py +6 -2
  135. sglang/srt/models/torch_native_llama.py +0 -24
  136. sglang/srt/multimodal/processors/base_processor.py +23 -13
  137. sglang/srt/multimodal/processors/glm4v.py +132 -0
  138. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  139. sglang/srt/operations.py +17 -2
  140. sglang/srt/reasoning_parser.py +316 -0
  141. sglang/srt/sampling/sampling_batch_info.py +7 -4
  142. sglang/srt/server_args.py +142 -140
  143. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
  144. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  145. sglang/srt/speculative/eagle_worker.py +16 -0
  146. sglang/srt/two_batch_overlap.py +16 -12
  147. sglang/srt/utils.py +3 -3
  148. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  149. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  150. sglang/test/doc_patch.py +59 -0
  151. sglang/test/few_shot_gsm8k.py +1 -1
  152. sglang/test/few_shot_gsm8k_engine.py +1 -1
  153. sglang/test/run_eval.py +4 -1
  154. sglang/test/simple_eval_common.py +6 -0
  155. sglang/test/simple_eval_gpqa.py +2 -0
  156. sglang/test/test_fp4_moe.py +118 -36
  157. sglang/test/test_marlin_moe.py +1 -1
  158. sglang/test/test_marlin_utils.py +1 -1
  159. sglang/utils.py +1 -1
  160. sglang/version.py +1 -1
  161. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
  162. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
  163. sglang/lang/backend/__init__.py +0 -0
  164. sglang/srt/function_call/harmony_tool_parser.py +0 -130
  165. sglang/srt/layers/quantization/scalar_type.py +0 -352
  166. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  167. /sglang/{api.py → lang/api.py} +0 -0
  168. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
  169. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
  170. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0
sglang/srt/two_batch_overlap.py CHANGED
@@ -26,11 +26,13 @@ from sglang.srt.model_executor.forward_batch_info import (
 from sglang.srt.operations import execute_operations, execute_overlapped_operations
 from sglang.srt.operations_strategy import OperationsStrategy
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import BumpAllocator, get_bool_env_var
+from sglang.srt.utils import BumpAllocator, get_bool_env_var, is_hip
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.token_dispatcher import DispatchOutput
 
+_is_hip = is_hip()
+
 _tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
 
 logger = logging.getLogger(__name__)
@@ -676,16 +678,12 @@ class TboForwardBatchPreparer:
         # TODO improve, e.g. unify w/ `init_raw`
         if (
             global_server_args_dict["moe_dense_tp_size"] == 1
-            and batch.gathered_buffer is not None
+            and batch.global_dp_buffer_len is not None
         ):
             sum_len = end_token_index - start_token_index
-            gathered_buffer = torch.zeros(
-                (sum_len, batch.gathered_buffer.shape[1]),
-                dtype=batch.gathered_buffer.dtype,
-                device=batch.gathered_buffer.device,
-            )
+            global_dp_buffer_len = sum_len
         else:
-            gathered_buffer = None
+            global_dp_buffer_len = None
 
         output_dict.update(
             dict(
@@ -704,7 +702,7 @@ class TboForwardBatchPreparer:
                 global_num_tokens_gpu=None,
                 global_num_tokens_cpu=None,
                 dp_padding_mode=None,
-                gathered_buffer=gathered_buffer,
+                global_dp_buffer_len=global_dp_buffer_len,
                 global_num_tokens_for_logprob_gpu=None,
                 global_num_tokens_for_logprob_cpu=None,
                 sampling_info=None,
@@ -822,9 +820,15 @@ def _model_forward_tbo(
     )
     del inputs
 
-    with deep_gemm_wrapper.configure_deep_gemm_num_sms(
-        operations_strategy.deep_gemm_num_sms
-    ):
+    context = (
+        empty_context()
+        if _is_hip
+        else deep_gemm_wrapper.configure_deep_gemm_num_sms(
+            operations_strategy.deep_gemm_num_sms
+        )
+    )
+
+    with context:
         outputs_arr = execute_overlapped_operations(
             inputs_arr=inputs_arr,
             operations_arr=[operations_strategy.operations] * 2,
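The last hunk above swaps the unconditional DeepGEMM SM-limit context for a no-op on ROCm. A minimal sketch of that conditional-context pattern, assuming `empty_context` behaves like `contextlib.nullcontext` (the callables below are stand-ins, not sglang APIs):

    import contextlib

    def run_under_optional_sm_limit(is_hip: bool, num_sms: int, configure_sms, body):
        # On HIP there is no DeepGEMM SM tuning, so fall back to a do-nothing context;
        # otherwise limit the number of SMs DeepGEMM may use while `body` runs.
        context = contextlib.nullcontext() if is_hip else configure_sms(num_sms)
        with context:
            return body()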
sglang/srt/utils.py CHANGED
@@ -815,7 +815,7 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
             vr = VideoReader(tmp_file.name, ctx=ctx)
         elif video_file.startswith("data:"):
             _, encoded = video_file.split(",", 1)
-            video_bytes = base64.b64decode(encoded)
+            video_bytes = pybase64.b64decode(encoded)
             tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
             tmp_file.write(video_bytes)
             tmp_file.close()
@@ -823,7 +823,7 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
         elif os.path.isfile(video_file):
             vr = VideoReader(video_file, ctx=ctx)
         else:
-            video_bytes = base64.b64decode(video_file)
+            video_bytes = pybase64.b64decode(video_file)
             tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
             tmp_file.write(video_bytes)
             tmp_file.close()
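`pybase64` is used here as a faster drop-in for the standard-library decoder; the call signature is unchanged. A small round-trip check, assuming `pybase64` is installed:

    import base64

    import pybase64

    payload = base64.b64encode(b"example video bytes")
    # pybase64.b64decode accepts the same input and returns the same bytes;
    # it is simply a faster implementation.
    assert pybase64.b64decode(payload) == base64.b64decode(payload)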
@@ -2960,7 +2960,7 @@ class ConcurrentCounter:
         This suspends the calling coroutine without blocking the thread, allowing
         other tasks to run while waiting. When the counter becomes zero, the coroutine resumes.
         """
-        self.wait_for(lambda count: count == 0)
+        await self.wait_for(lambda count: count == 0)
 
 
 @lru_cache(maxsize=1)
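The one-word fix above matters because `wait_for` is a coroutine: without `await`, the call only created a coroutine object and returned immediately, so the wait never happened. A toy illustration of the pattern (not the real `ConcurrentCounter`):

    import asyncio

    class ToyCounter:
        def __init__(self):
            self._value = 0
            self._cond = asyncio.Condition()

        async def wait_for(self, predicate):
            async with self._cond:
                await self._cond.wait_for(lambda: predicate(self._value))

        async def wait_until_zero(self):
            # Forgetting `await` here would return instantly without suspending.
            await self.wait_for(lambda count: count == 0)

    asyncio.run(ToyCounter().wait_until_zero())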
sglang/srt/weight_sync/tensor_bucket.py ADDED
@@ -0,0 +1,106 @@
+from dataclasses import dataclass
+from typing import List, Tuple
+
+import torch
+
+
+@dataclass
+class FlattenedTensorMetadata:
+    """Metadata for a tensor in a flattened bucket"""
+
+    name: str
+    shape: torch.Size
+    dtype: torch.dtype
+    start_idx: int
+    end_idx: int
+    numel: int
+
+
+class FlattenedTensorBucket:
+    """
+    A bucket that flattens multiple tensors into a single tensor for efficient processing
+    while preserving all metadata needed for reconstruction.
+    """
+
+    def __init__(
+        self,
+        named_tensors: List[Tuple[str, torch.Tensor]] = None,
+        flattened_tensor: torch.Tensor = None,
+        metadata: List[FlattenedTensorMetadata] = None,
+    ):
+        """
+        Initialize a tensor bucket from a list of named tensors OR from pre-flattened data.
+        Args:
+            named_tensors: List of (name, tensor) tuples (for creating new bucket)
+            flattened_tensor: Pre-flattened tensor (for reconstruction)
+            metadata: Pre-computed metadata (for reconstruction)
+        """
+        if named_tensors is not None:
+            # Create bucket from named tensors
+            self.metadata: List[FlattenedTensorMetadata] = [None] * len(named_tensors)
+            self.flattened_tensor: torch.Tensor = None
+
+            if not named_tensors:
+                raise ValueError("Cannot create empty tensor bucket")
+
+            # Collect metadata and flatten tensors
+            current_idx = 0
+            flattened_tensors: List[torch.Tensor] = [None] * len(named_tensors)
+
+            for i, (name, tensor) in enumerate(named_tensors):
+                flattened = tensor.flatten()
+                flattened_tensors[i] = flattened
+
+                # Store metadata
+
+                numel = flattened.numel()
+                metadata_obj = FlattenedTensorMetadata(
+                    name=name,
+                    shape=tensor.shape,
+                    dtype=tensor.dtype,
+                    start_idx=current_idx,
+                    end_idx=current_idx + numel,
+                    numel=numel,
+                )
+                self.metadata[i] = metadata_obj
+                current_idx += numel
+
+            # Concatenate all flattened tensors
+            self.flattened_tensor = torch.cat(flattened_tensors, dim=0)
+        else:
+            # Initialize from pre-flattened data
+            if flattened_tensor is None or metadata is None:
+                raise ValueError(
+                    "Must provide either named_tensors or both flattened_tensor and metadata"
+                )
+            self.flattened_tensor = flattened_tensor
+            self.metadata = metadata
+
+    def get_flattened_tensor(self) -> torch.Tensor:
+        """Get the flattened tensor containing all bucket tensors"""
+        return self.flattened_tensor
+
+    def get_metadata(self) -> List[FlattenedTensorMetadata]:
+        """Get metadata for all tensors in the bucket"""
+        return self.metadata
+
+    def reconstruct_tensors(self) -> List[Tuple[str, torch.Tensor]]:
+        """
+        Reconstruct original tensors from flattened tensor with optimized performance.
+        Uses memory-efficient operations to minimize allocations and copies.
+        """
+        # preallocate the result list
+        reconstructed = [None] * len(self.metadata)
+
+        for i, meta in enumerate(self.metadata):
+            tensor = self.flattened_tensor[meta.start_idx : meta.end_idx].reshape(
+                meta.shape
+            )
+
+            # batch dtype conversion (if needed)
+            if tensor.dtype != meta.dtype:
+                tensor = tensor.to(meta.dtype)
+
+            reconstructed[i] = (meta.name, tensor)
+
+        return reconstructed
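A short usage sketch of the new bucket (CPU tensors, illustrative names): two named tensors are flattened into one buffer and then reconstructed with their original shapes.

    import torch

    from sglang.srt.weight_sync.tensor_bucket import FlattenedTensorBucket

    named = [("w1", torch.randn(4, 8)), ("b1", torch.zeros(8))]
    bucket = FlattenedTensorBucket(named_tensors=named)

    flat = bucket.get_flattened_tensor()  # one 1-D tensor holding both weights
    meta = bucket.get_metadata()          # names, offsets, shapes, dtypes

    # Rebuild the bucket from the flattened data, e.g. after transferring it.
    restored = dict(
        FlattenedTensorBucket(flattened_tensor=flat, metadata=meta).reconstruct_tensors()
    )
    assert restored["w1"].shape == (4, 8) and restored["b1"].shape == (8,)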
sglang/test/attention/test_trtllm_mla_backend.py CHANGED
@@ -43,6 +43,37 @@ DEFAULT_CONFIG = {
     "layer_id": 0,
 }
 
+ROPE_BASE = 10000
+ROPE_SCALING_CONFIG = {
+    "beta_fast": 32,
+    "beta_slow": 1,
+    "factor": 40,
+    "mscale": 1.0,
+    "mscale_all_dim": 1.0,
+    "original_max_position_embeddings": 4096,
+    "type": "yarn",
+    "rope_type": "deepseek_yarn",
+}
+
+
+def build_rotary_emb(config, device=None):
+    from sglang.srt.layers.rotary_embedding import get_rope_wrapper
+
+    dev = device or config["device"]
+    rope_scaling = config.get("rope_scaling", ROPE_SCALING_CONFIG)
+    rotary = get_rope_wrapper(
+        head_size=config["qk_rope_head_dim"],
+        rotary_dim=config["qk_rope_head_dim"],
+        max_position=config["context_len"],
+        base=ROPE_BASE,
+        rope_scaling=rope_scaling,
+        is_neox_style=False,
+        device=dev,
+    )
+    rotary.cos_sin_cache = rotary.cos_sin_cache.to(dev)
+    return rotary
+
+
 # Centralized test cases for different test scenarios
 TEST_CASES = {
     "basic_functionality": [
@@ -63,18 +94,36 @@ TEST_CASES = {
     ],
     "decode_output_match": [
         {
-            "name": "single",
+            "name": "single_fp16",
             "batch_size": 1,
             "max_seq_len": 64,
             "page_size": 32,
-            "description": "Single vs reference",
+            "description": "Single FP16 vs reference",
         },
         {
-            "name": "batch",
+            "name": "single_fp8",
+            "batch_size": 1,
+            "max_seq_len": 64,
+            "page_size": 64,
+            "tolerance": 1e-1,
+            "kv_cache_dtype": torch.float8_e4m3fn,
+            "description": "Single FP8 vs reference",
+        },
+        {
+            "name": "batch_fp16",
             "batch_size": 32,
             "max_seq_len": 64,
             "page_size": 32,
-            "description": "Batch vs reference",
+            "description": "Batch FP16 vs reference",
+        },
+        {
+            "name": "batch_fp8",
+            "batch_size": 32,
+            "max_seq_len": 64,
+            "page_size": 64,
+            "tolerance": 1e-1,
+            "kv_cache_dtype": torch.float8_e4m3fn,
+            "description": "Batch FP8 vs reference",
         },
     ],
     "page_size_consistency": [
@@ -293,26 +342,52 @@ class TestTRTLLMMLA(CustomTestCase):
             layer,
         )
 
-    def _create_qkv_tensors(self, batch_size, config):
-        """Create Q, K, V tensors for testing."""
-        head_dim = config["kv_lora_rank"] + config["qk_rope_head_dim"]
+    def _create_qkv_tensors(self, batch_size, config, dtype_override=None):
+        """Create Q, K, V random tensors for given batch size with separate MLA components.
+
+        Args:
+            batch_size: Batch size.
+            config: Configuration dict with model dims and device.
+            dtype_override: Optional torch dtype to override config["dtype"].
+
+        Returns:
+            Tuple of (q_nope, q_rope, k_nope, k_rope, v, cos_sin_cache)
+        """
         device = config["device"]
-        dtype = config["dtype"]
+        target_dtype = dtype_override or config["dtype"]
 
-        q = torch.randn(
-            (batch_size, config["num_attention_heads"], head_dim),
-            dtype=dtype,
+        # Create separate nope and rope components for Q
+        q_nope = torch.randn(
+            (batch_size, config["num_attention_heads"], config["kv_lora_rank"]),
+            dtype=config["dtype"],
             device=device,
         )
-        k = torch.randn(
-            (batch_size, config["num_kv_heads"], head_dim), dtype=dtype, device=device
+        q_rope = torch.randn(
+            (batch_size, config["num_attention_heads"], config["qk_rope_head_dim"]),
+            dtype=config["dtype"],
+            device=device,
+        )
+
+        # Create separate nope and rope components for K
+        k_nope = torch.randn(
+            (batch_size, config["num_kv_heads"], config["kv_lora_rank"]),
+            dtype=config["dtype"],
+            device=device,
+        )
+        k_rope = torch.randn(
+            (batch_size, config["num_kv_heads"], config["qk_rope_head_dim"]),
+            dtype=config["dtype"],
+            device=device,
         )
+
+        # V tensor (unchanged)
         v = torch.randn(
             (batch_size, config["num_kv_heads"], config["v_head_dim"]),
-            dtype=dtype,
+            dtype=config["dtype"],
             device=device,
         )
-        return q, k, v
+
+        return q_nope, q_rope, k_nope, k_rope, v
 
     def _create_forward_batch(
         self, batch_size, seq_lens, backend, model_runner, config
@@ -331,6 +406,10 @@
         )
         fb.req_to_token_pool = model_runner.req_to_token_pool
         fb.token_to_kv_pool = model_runner.token_to_kv_pool
+
+        # Add position information for RoPE
+        fb.positions = torch.arange(batch_size, device=config["device"])
+
         return fb
 
     def _populate_kv_cache(self, batch_size, seq_lens, model_runners, layer, config):
@@ -344,7 +423,7 @@
             for token_idx in range(seq_len - 1):
                 # Create random K components for MLA
                 cache_k_nope = torch.randn(
-                    (1, config["qk_nope_head_dim"]),
+                    (1, config["kv_lora_rank"]),
                     dtype=config["dtype"],
                     device=config["device"],
                 )
@@ -411,12 +490,16 @@
             batch_size, seq_lens, [model_runner_trtllm], layer, config
         )
 
-        # Create Q, K, V tensors
+        # Create Q, K, V tensors with separate MLA components
         torch.manual_seed(config["seed_qkv"])
-        q, k, v = self._create_qkv_tensors(batch_size, config)
+        q_nope, q_rope, k_nope, k_rope, v = self._create_qkv_tensors(
+            batch_size, config
+        )
 
-        # Run forward decode
-        output = trtllm_backend.forward_decode(q, k, v, layer, fb)
+        # Run forward decode with separate MLA components
+        output = trtllm_backend.forward_decode(
+            q_nope, k_nope, None, layer, fb, q_rope=q_rope, k_rope=k_rope
+        )
 
         # Basic checks
         expected_shape = (
@@ -439,6 +522,7 @@
             config = self._merge_config(test_case)
             batch_size = config["batch_size"]
             max_seq_len = config["max_seq_len"]
+            use_fp8 = config["kv_cache_dtype"] == torch.float8_e4m3fn
 
             # Create components
             (
@@ -487,19 +571,66 @@
 
             # Create Q, K, V tensors for current decode step
             torch.manual_seed(config["seed_qkv"])
-            q, k, v = self._create_qkv_tensors(batch_size, config)
+
+            q_nope_ref, q_rope_ref, k_nope_ref, k_rope_ref, v_ref = (
+                self._create_qkv_tensors(batch_size, config)
+            )
+            q_nope_trt, q_rope_trt, k_nope_trt, k_rope_trt, v_trt = (
+                q_nope_ref.clone(),
+                q_rope_ref.clone(),
+                k_nope_ref.clone(),
+                k_rope_ref.clone(),
+                v_ref.clone(),
+            )
+            tolerance = config["tolerance"]
+
+            extra_args = {}
+            if use_fp8:
+                # TRT kernel applies RoPE + FP8 quantization internally
+                # pre-apply RoPE on the reference (FlashInfer) path here so
+                # both paths share the same rope params/cache while keeping
+                # the TRT path unrotated.
+                rotary_emb = build_rotary_emb(config)
+                q_rope_ref, k_rope_ref = rotary_emb(
+                    fb_reference.positions, q_rope_ref, k_rope_ref
+                )
+                extra_args = {
+                    "cos_sin_cache": rotary_emb.cos_sin_cache,
+                    "is_neox": rotary_emb.is_neox_style,
+                }
+
+                dtype = q_rope_ref.dtype
+                q_rope_ref = q_rope_ref.to(torch.float8_e4m3fn).to(dtype)
+                q_nope_ref = q_nope_ref.to(torch.float8_e4m3fn).to(dtype)
+                k_rope_ref = k_rope_ref.to(torch.float8_e4m3fn).to(dtype)
+                k_nope_ref = k_nope_ref.to(torch.float8_e4m3fn).to(dtype)
 
             # Run forward decode on both backends
             out_trtllm = trtllm_backend.forward_decode(
-                q.clone(), k.clone(), v.clone(), layer, fb_trtllm
+                q_nope_trt,
+                k_nope_trt,
+                None,
+                layer,
+                fb_trtllm,
+                q_rope=q_rope_trt,
+                k_rope=k_rope_trt,
+                **extra_args,
             )
+
+            # Reference backend should also take separate components, not concatenated
             out_reference = reference_backend.forward_decode(
-                q.clone(), k.clone(), v.clone(), layer, fb_reference
+                q_nope_ref,
+                k_nope_ref,
+                v_ref,
+                layer,
+                fb_reference,
+                q_rope=q_rope_ref,
+                k_rope=k_rope_ref,
             )
 
             # Compare outputs
             comparison_passed = compare_outputs(
-                out_trtllm, out_reference, tolerance=config["tolerance"]
+                out_trtllm, out_reference, tolerance=tolerance
             )
 
             self.assertTrue(
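In the FP8 cases above, the TRT-LLM kernel quantizes internally, so the reference inputs are fake-quantized instead: casting to `torch.float8_e4m3fn` and straight back keeps the original dtype but bakes in the same rounding error, which is why those cases use the looser 1e-1 tolerance. A standalone sketch of that round trip (assumes a PyTorch build with float8 dtypes):

    import torch

    x = torch.randn(4, 64, dtype=torch.bfloat16)
    # Fake quantization: down to FP8 and back, so later math stays in bf16
    # while carrying FP8 rounding error.
    x_fq = x.to(torch.float8_e4m3fn).to(torch.bfloat16)
    print((x - x_fq).abs().max())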
@@ -544,12 +675,16 @@
             batch_size, seq_lens, [model_runner], layer, config
         )
 
-        # Create Q, K, V tensors
+        # Create Q, K, V tensors with separate MLA components
         torch.manual_seed(config["seed_qkv"])
-        q, k, v = self._create_qkv_tensors(batch_size, config)
+        q_nope, q_rope, k_nope, k_rope, v = self._create_qkv_tensors(
+            batch_size, config
+        )
 
-        # Run forward decode
-        output = backend.forward_decode(q, k, v, layer, fb)
+        # Run forward decode with separate MLA components
+        output = backend.forward_decode(
+            q_nope, k_nope, None, layer, fb, q_rope=q_rope, k_rope=k_rope
+        )
 
         expected_shape = (
             batch_size,
@@ -591,23 +726,38 @@
         )
         backend.init_forward_metadata(fb)
 
-        # Create Q, K, V tensors
+        # Create Q, K, V tensors with separate MLA components
         torch.manual_seed(config["seed_qkv"])
-        head_dim = config["kv_lora_rank"] + config["qk_rope_head_dim"]
-        q = torch.randn(
-            (batch_size, config["num_attention_heads"], head_dim),
+        q_nope = torch.randn(
+            (batch_size, config["num_attention_heads"], config["kv_lora_rank"]),
             dtype=config["dtype"],
             device=config["device"],
         )
-        k = torch.randn(
-            (batch_size, config["num_kv_heads"], head_dim),
+        k_nope = torch.randn(
+            (batch_size, config["num_kv_heads"], config["kv_lora_rank"]),
             dtype=config["dtype"],
             device=config["device"],
         )
-        v = None
+        q_rope = torch.randn(
+            (
+                batch_size,
+                config["num_attention_heads"],
+                config["qk_rope_head_dim"],
+            ),
+            dtype=config["dtype"],
+            device=config["device"],
+        )
+        k_rope = torch.randn(
+            (batch_size, config["num_kv_heads"], config["qk_rope_head_dim"]),
+            dtype=config["dtype"],
+            device=config["device"],
+        )
+        v = None  # Test with None v
 
         # Run forward decode
-        output = backend.forward_decode(q, k, v, layer, fb)
+        output = backend.forward_decode(
+            q_nope, k_nope, v, layer, fb, q_rope=q_rope, k_rope=k_rope
+        )
 
         # Shape and sanity checks
         expected_shape = (
sglang/test/doc_patch.py ADDED
@@ -0,0 +1,59 @@
+"""
+Do some monkey patch to make the documentation compilation faster and more reliable.
+
+- Avoid port conflicts
+- Reduce the server launch time
+"""
+
+import weakref
+
+import nest_asyncio
+
+nest_asyncio.apply()
+
+import sglang.srt.server_args as server_args_mod
+from sglang.utils import execute_shell_command, reserve_port
+
+DEFAULT_MAX_RUNNING_REQUESTS = 128
+DEFAULT_MAX_TOTAL_TOKENS = 20480  # To allow multiple servers on the same machine
+
+_original_post_init = server_args_mod.ServerArgs.__post_init__
+
+
+def patched_post_init(self):
+    _original_post_init(self)
+    if self.max_running_requests is None:
+        self.max_running_requests = DEFAULT_MAX_RUNNING_REQUESTS
+    if self.max_total_tokens is None:
+        self.max_total_tokens = DEFAULT_MAX_TOTAL_TOKENS
+    self.cuda_graph_max_bs = 4
+
+
+server_args_mod.ServerArgs.__post_init__ = patched_post_init
+
+process_socket_map = weakref.WeakKeyDictionary()
+
+
+def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
+    """
+    Launch the server using the given command.
+    If no port is specified, a free port is reserved.
+    """
+    if port is None:
+        port, lock_socket = reserve_port(host)
+    else:
+        lock_socket = None
+
+    extra_flags = (
+        f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} "
+        f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} "
+        f"--cuda-graph-max-bs 4"
+    )
+
+    full_command = f"{command} --port {port} {extra_flags}"
+    process = execute_shell_command(full_command)
+
+    if lock_socket is not None:
+        process_socket_map[process] = lock_socket
+
+    return process, port
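A hypothetical docs-notebook usage of the helper above; the model path is a placeholder and the returned handle is assumed to be the subprocess created by `execute_shell_command`:

    from sglang.test.doc_patch import launch_server_cmd

    # Reserves a free port, appends the reduced --max-running-requests /
    # --max-total-tokens / --cuda-graph-max-bs flags, and starts the server.
    server_process, port = launch_server_cmd(
        "python -m sglang.launch_server --model-path <your-model>"
    )
    print(f"docs server expected at http://localhost:{port}")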
sglang/test/few_shot_gsm8k.py CHANGED
@@ -12,7 +12,7 @@ import time
 
 import numpy as np
 
-from sglang.api import set_default_backend
+from sglang.lang.api import set_default_backend
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
 
sglang/test/few_shot_gsm8k_engine.py CHANGED
@@ -8,7 +8,7 @@ import time
 import numpy as np
 
 import sglang as sgl
-from sglang.api import set_default_backend
+from sglang.lang.api import set_default_backend
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
 
sglang/test/run_eval.py CHANGED
@@ -65,9 +65,10 @@ def run_eval(args):
 
     sampler = ChatCompletionSampler(
         model=args.model,
-        max_tokens=2048,
+        max_tokens=getattr(args, "max_tokens", 2048),
        base_url=base_url,
         temperature=getattr(args, "temperature", 0.0),
+        reasoning_effort=getattr(args, "reasoning_effort", None),
     )
 
     # Run eval
@@ -120,7 +121,9 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--max-tokens", type=int, default=2048)
     parser.add_argument("--temperature", type=float, default=0.0)
+    parser.add_argument("--reasoning-effort", type=str)
     args = parser.parse_args()
 
     run_eval(args)
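The new flags reach the sampler through `getattr` with defaults, so callers that build their own `args` namespace without these attributes keep the old behavior. A tiny sketch of that fallback:

    from argparse import Namespace

    args = Namespace(temperature=0.0)  # no max_tokens / reasoning_effort set
    max_tokens = getattr(args, "max_tokens", 2048)              # -> 2048
    reasoning_effort = getattr(args, "reasoning_effort", None)  # -> None
    print(max_tokens, reasoning_effort)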
sglang/test/simple_eval_common.py CHANGED
@@ -91,6 +91,7 @@ class ChatCompletionSampler(SamplerBase):
         model: Optional[str] = None,
         system_message: Optional[str] = None,
         temperature: float = 0.0,
+        reasoning_effort: Optional[str] = None,
         max_tokens: int = 2048,
     ):
         self.client = OpenAI(base_url=base_url, http_client=LargerHttpxClient())
@@ -102,7 +103,11 @@
         self.system_message = system_message
         self.temperature = temperature
         self.max_tokens = max_tokens
+        self.reasoning_effort = reasoning_effort
         self.image_format = "url"
+        print(
+            f"ChatCompletionSampler initialized with {self.system_message=} {self.temperature=} {self.max_tokens=} {self.reasoning_effort=}"
+        )
 
     def _handle_image(
         self,
@@ -138,6 +143,7 @@
                 messages=message_list,
                 temperature=self.temperature,
                 max_tokens=self.max_tokens,
+                reasoning_effort=self.reasoning_effort,
             )
             return response.choices[0].message.content
         # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
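A hedged construction example for the extended sampler; the endpoint and model name are placeholders, and `reasoning_effort` is simply forwarded to `chat.completions.create`:

    from sglang.test.simple_eval_common import ChatCompletionSampler

    sampler = ChatCompletionSampler(
        base_url="http://127.0.0.1:30000/v1",  # placeholder local endpoint
        model="default",                       # placeholder model name
        temperature=0.0,
        max_tokens=2048,
        reasoning_effort="low",  # forwarded as-is to the OpenAI-compatible client
    )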
sglang/test/simple_eval_gpqa.py CHANGED
@@ -71,6 +71,8 @@ class GPQAEval(Eval):
                 )
             ]
             response_text = sampler(prompt_messages)
+            if response_text is None:
+                response_text = ""
             match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
             extracted_answer = match.group(1) if match else None
             score = 1.0 if extracted_answer == correct_answer else 0.0