sglang 0.5.1.post3__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +251 -26
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +37 -7
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +6 -4
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -420
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +6 -4
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +94 -58
- sglang/srt/entrypoints/engine.py +34 -14
- sglang/srt/entrypoints/http_server.py +172 -47
- sglang/srt/entrypoints/openai/protocol.py +63 -3
- sglang/srt/entrypoints/openai/serving_base.py +6 -2
- sglang/srt/entrypoints/openai/serving_chat.py +34 -19
- sglang/srt/entrypoints/openai/serving_completions.py +10 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/eplb/eplb_manager.py +28 -4
- sglang/srt/eplb/expert_distribution.py +55 -15
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +12 -0
- sglang/srt/layers/activation.py +44 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +250 -112
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/layernorm.py +54 -12
- sglang/srt/layers/logits_processor.py +10 -3
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +110 -49
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +43 -12
- sglang/srt/layers/moe/utils.py +6 -5
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +43 -29
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +107 -40
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +77 -45
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +60 -42
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +83 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +28 -19
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/cache_controller.py +242 -278
- sglang/srt/managers/data_parallel_controller.py +30 -15
- sglang/srt/managers/detokenizer_manager.py +13 -2
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +160 -11
- sglang/srt/managers/mm_utils.py +6 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
- sglang/srt/managers/schedule_batch.py +27 -44
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +90 -115
- sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
- sglang/srt/managers/tokenizer_manager.py +41 -477
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +24 -22
- sglang/srt/mem_cache/hiradix_cache.py +184 -101
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +324 -41
- sglang/srt/mem_cache/memory_pool_host.py +25 -18
- sglang/srt/mem_cache/radix_cache.py +5 -6
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +149 -12
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +74 -19
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1 -3
- sglang/srt/metrics/collector.py +484 -63
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +72 -18
- sglang/srt/model_executor/model_runner.py +189 -31
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +33 -28
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/deepseek_v2.py +311 -50
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/gpt_oss.py +5 -18
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +17 -0
- sglang/srt/models/longcat_flash.py +1026 -0
- sglang/srt/models/longcat_flash_nextn.py +699 -0
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +33 -3
- sglang/srt/models/qwen2_5_vl.py +90 -42
- sglang/srt/models/qwen2_moe.py +79 -14
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/qwen3_next.py +1039 -0
- sglang/srt/models/qwen3_next_mtp.py +109 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +297 -79
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_worker.py +216 -120
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/utils.py +37 -2
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +181 -8
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_utils.py +25 -1
- sglang/utils.py +5 -0
- sglang/version.py +1 -1
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/METADATA +11 -9
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/RECORD +243 -194
- sglang/srt/disaggregation/launch_lb.py +0 -131
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -230,8 +230,16 @@ except:
     is_intel_amx_backend_available = False


+try:
+    # move torch._C._cpu._is_amx_tile_supported() from cpu_has_amx_support
+    # to support torch compile
+    is_amx_tile_supported = torch._C._cpu._is_amx_tile_supported()
+except:
+    is_amx_tile_supported = False
+
+
 def cpu_has_amx_support():
-    return
+    return is_amx_tile_supported and is_intel_amx_backend_available


 def use_intel_amx_backend(layer):
@@ -426,7 +434,9 @@ def get_available_gpu_memory(

     elif device == "cpu":
         # TODO: rename the variables in the current function to be not GPU specific
-
+        total_free_memory = psutil.virtual_memory().available
+        n_numa_node: int = len(get_cpu_ids_by_node())
+        free_gpu_memory = round(total_free_memory / n_numa_node, 3)
     elif device == "npu":
         num_gpus = torch.npu.device_count()
         assert gpu_id < num_gpus
@@ -2787,6 +2797,10 @@ def lru_cache_frozenset(maxsize=128):
     return decorator


+def get_origin_rid(rid):
+    return rid.split("_", 1)[1] if "_" in rid else rid
+
+
 def apply_module_patch(target_module, target_function, wrappers):
     original_module, original_function = parse_module_path(
         target_module, target_function, False
@@ -2896,6 +2910,18 @@ def mxfp_supported():
         return False


+@lru_cache(maxsize=1)
+def is_gfx95_supported():
+    """
+    Returns whether the current platform supports MX types.
+    """
+    if torch.version.hip:
+        gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
+        return any(gfx in gcn_arch for gfx in ["gfx95"])
+    else:
+        return False
+
+
 # LoRA-related constants and utilities
 SUPPORTED_LORA_TARGET_MODULES = [
     "q_proj",
@@ -3011,3 +3037,12 @@ def check_cuda_result(raw_output):
         raise Exception(f"CUDA error: {err}")

     return results
+
+
+def numa_bind_to_node(node: int):
+    libnuma = ctypes.CDLL("libnuma.so")
+    if libnuma.numa_available() < 0:
+        raise SystemError("numa not available on this system")
+
+    libnuma.numa_run_on_node(ctypes.c_int(node))
+    libnuma.numa_set_localalloc()
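For orientation, a small usage sketch of the helpers this diff adds to sglang/srt/utils.py follows; the request-id strings and the NUMA node number are made-up values, not taken from the package.

```python
# Usage sketch (illustrative values only) for the new utils added in 0.5.2.
from sglang.srt.utils import get_origin_rid, numa_bind_to_node

# get_origin_rid strips a "<prefix>_" prefix from a request id and returns
# un-prefixed ids unchanged.
assert get_origin_rid("3_req-abc") == "req-abc"
assert get_origin_rid("req-abc") == "req-abc"

# numa_bind_to_node pins the current process to one NUMA node via libnuma;
# guard the call, since it fails when libnuma is missing or NUMA is unavailable.
try:
    numa_bind_to_node(0)
except (OSError, SystemError):
    pass
```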
sglang/srt/weight_sync/utils.py
CHANGED
@@ -6,7 +6,7 @@ from torch.distributed.device_mesh import DeviceMesh
 from torch.distributed.tensor import DTensor

 from sglang.srt.entrypoints.engine import Engine
-from sglang.srt.managers.
+from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput
 from sglang.srt.model_executor.model_runner import LocalSerializedTensor
 from sglang.srt.utils import MultiprocessingSerializer

sglang/test/attention/test_trtllm_mla_backend.py
CHANGED
@@ -41,6 +41,10 @@ DEFAULT_CONFIG = {
     "v_head_dim": 512,
     "num_kv_heads": 1,
     "layer_id": 0,
+    "tp_q_head_num": 128,
+    "tp_k_head_num": 128,
+    "prefill_head_dim": 192,
+    "prefill_v_head_dim": 128,
 }

 ROPE_BASE = 10000
@@ -92,7 +96,7 @@ TEST_CASES = {
             "description": "Medium-scale batch",
         },
     ],
-    "
+    "output_match": [
        {
            "name": "single_fp16",
            "batch_size": 1,
@@ -208,6 +212,15 @@ class MockModelRunner:
         self.kv_cache_dtype = config["kv_cache_dtype"]
         self.page_size = config["page_size"]

+        # Server args stub - needed by attention backends
+        self.server_args = type(
+            "ServerArgs",
+            (),
+            {
+                "enable_dp_attention": False,  # Default value for testing
+            },
+        )
+
         # Model-config stub with MLA attributes
         self.model_config = type(
             "ModelConfig",
@@ -313,7 +326,7 @@ class TestTRTLLMMLA(CustomTestCase):
         config.update(test_case)
         return config

-    def _create_model_components(self, config):
+    def _create_model_components(self, config, is_prefill=False):
         """Create model runners, backends, and layer for testing."""
         # Create model runners
         model_runner_trtllm = MockModelRunner(config)
@@ -323,14 +336,23 @@ class TestTRTLLMMLA(CustomTestCase):
         trtllm_backend = TRTLLMMLABackend(model_runner_trtllm)
         reference_backend = FlashInferMLAAttnBackend(model_runner_reference)

+        head_dim = (
+            config["kv_lora_rank"] + config["qk_rope_head_dim"]
+            if not is_prefill
+            else config["prefill_head_dim"]
+        )
+        v_head_dim = (
+            config["v_head_dim"] if not is_prefill else config["prefill_v_head_dim"]
+        )
+
         # Create RadixAttention layer
         layer = RadixAttention(
             num_heads=config["num_attention_heads"],
-            head_dim=
+            head_dim=head_dim,
             scaling=model_runner_trtllm.model_config.scaling,
             num_kv_heads=config["num_kv_heads"],
             layer_id=config["layer_id"],
-            v_head_dim=
+            v_head_dim=v_head_dim,
             prefix="attn_mqa",
         )

@@ -515,7 +537,7 @@ class TestTRTLLMMLA(CustomTestCase):
         """Test that TRTLLM and FlashInfer MLA backends produce matching outputs."""
         print(f"\nRunning decode output matching tests...")

-        for test_case in TEST_CASES["
+        for test_case in TEST_CASES["output_match"]:
             with self.subTest(test_case=test_case["name"]):
                 print(f"  Testing {test_case['name']}: {test_case['description']}")

@@ -833,7 +855,7 @@ class TestTRTLLMMLA(CustomTestCase):

         # Test workspace properties
         self.assertEqual(metadata.workspace.device.type, "cuda")
-        self.assertEqual(metadata.workspace.dtype, torch.
+        self.assertEqual(metadata.workspace.dtype, torch.uint8)
         self.assertGreater(
             metadata.workspace.numel(), 0, "Workspace should have non-zero size"
         )
@@ -993,8 +1015,8 @@ class TestTRTLLMMLA(CustomTestCase):
         )

         # Verify CUDA graph buffers are allocated
-        self.assertIsNotNone(backend.
-        self.assertIsNotNone(backend.
+        self.assertIsNotNone(backend.decode_cuda_graph_kv_indices)
+        self.assertIsNotNone(backend.decode_cuda_graph_workspace)

         # Test capture metadata
         seq_lens = torch.full(
@@ -1090,6 +1112,157 @@ class TestTRTLLMMLA(CustomTestCase):
         self.assertIsNotNone(metadata_3.block_kv_indices)
         self.assertEqual(metadata_3.block_kv_indices.shape[0], config["batch_size"])

+    def test_prefill_output_match_self_attention(self):
+        """Test prefill (forward) behavior of TRTLLM MLA backend vs reference."""
+        print(f"\nRunning prefill output tests...")
+
+        for test_case in TEST_CASES["output_match"][:2]:  # Just a subset for speed
+            with self.subTest(test_case=test_case["name"]):
+                print(
+                    f"Prefill Testing {test_case['name']}: {test_case['description']}"
+                )
+
+                config = self._merge_config(test_case)
+                batch_size = config["batch_size"]
+                max_seq_len = config["max_seq_len"]
+
+                # Create components
+                (
+                    model_runner_trtllm,
+                    model_runner_reference,
+                    trtllm_backend,
+                    reference_backend,
+                    layer,
+                ) = self._create_model_components(config, is_prefill=True)
+
+                # Prefill uses full sequences
+                seq_lens = torch.full(
+                    (batch_size,), max_seq_len, device=config["device"]
+                )
+
+                def _create_forward_batch_prefill(
+                    batch_size,
+                    seq_lens,
+                    extend_prefix_lens,
+                    backend,
+                    model_runner,
+                    config,
+                ):
+                    """Create a forward batch for the given backend."""
+
+                    fb = ForwardBatch(
+                        batch_size=batch_size,
+                        input_ids=torch.randint(
+                            0, 100, (batch_size, 1), device=config["device"]
+                        ),
+                        out_cache_loc=torch.arange(batch_size, device=config["device"]),
+                        seq_lens_sum=int(seq_lens.sum().item()),
+                        extend_prefix_lens=extend_prefix_lens,
+                        extend_prefix_lens_cpu=extend_prefix_lens.cpu().int().tolist(),
+                        extend_seq_lens_cpu=(seq_lens - extend_prefix_lens)
+                        .cpu()
+                        .int()
+                        .tolist(),
+                        forward_mode=ForwardMode.EXTEND,
+                        req_pool_indices=torch.arange(
+                            batch_size, device=config["device"]
+                        ),
+                        seq_lens=seq_lens,
+                        seq_lens_cpu=seq_lens.cpu(),
+                        attn_attend_prefix_cache=False,
+                        mha_return_lse=False,
+                        attn_backend=backend,
+                    )
+                    fb.req_to_token_pool = model_runner.req_to_token_pool
+                    fb.token_to_kv_pool = model_runner.token_to_kv_pool
+
+                    # Add position information for RoPE
+                    fb.positions = torch.arange(batch_size, device=config["device"])
+
+                    return fb
+
+                # Create forward batches
+                fb_trtllm = _create_forward_batch_prefill(
+                    batch_size,
+                    seq_lens.clone(),
+                    torch.zeros(batch_size, device=config["device"], dtype=torch.int32),
+                    trtllm_backend,
+                    model_runner_trtllm,
+                    config,
+                )
+                fb_reference = _create_forward_batch_prefill(
+                    batch_size,
+                    seq_lens.clone(),
+                    torch.zeros(batch_size, device=config["device"], dtype=torch.int32),
+                    reference_backend,
+                    model_runner_reference,
+                    config,
+                )
+
+                # Initialize metadata for both backends
+                trtllm_backend.init_forward_metadata(fb_trtllm)
+                reference_backend.init_forward_metadata(fb_reference)
+
+                # Create Q, K, V tensors for prefill
+                torch.manual_seed(config["seed_qkv"])
+
+                def _create_qkv_tensors_prefill(
+                    batch_size, seq_len, config, dtype_override=None
+                ):
+                    """Create Q, K, V tensors for prefill, using config for head_num and head_dim."""
+                    device = config["device"]
+                    dtype = dtype_override or config["dtype"]
+
+                    total_tokens = batch_size * seq_len
+
+                    tp_q_head_num = config["tp_q_head_num"]
+                    tp_k_head_num = config["tp_k_head_num"]
+                    head_dim = config["prefill_head_dim"]
+                    v_head_dim = config["prefill_v_head_dim"]
+
+                    q = torch.randn(
+                        (total_tokens, tp_q_head_num * head_dim),
+                        dtype=dtype,
+                        device=device,
+                    )
+                    k = torch.randn(
+                        (total_tokens, tp_k_head_num * head_dim),
+                        dtype=dtype,
+                        device=device,
+                    )
+                    v = torch.randn(
+                        (total_tokens, tp_k_head_num * v_head_dim),
+                        dtype=dtype,
+                        device=device,
+                    )
+
+                    # Reshape as requested
+                    q = q.view(-1, tp_q_head_num, head_dim)
+                    k = k.view(-1, tp_k_head_num, head_dim)
+                    v = v.view(-1, tp_k_head_num, v_head_dim)
+
+                    return q, k, v
+
+                q, k, v = _create_qkv_tensors_prefill(batch_size, max_seq_len, config)
+                # Run prefill on both backends
+                out_trtllm = trtllm_backend.forward_extend(
+                    q, k, v, layer, fb_trtllm, False
+                ).view(-1, layer.tp_q_head_num * layer.v_head_dim)
+                out_reference = reference_backend.forward_extend(
+                    q, k, v, layer, fb_reference, False
+                )
+
+                tolerance = config.get("tolerance", 1e-2)
+                comparison_passed = compare_outputs(
+                    out_trtllm, out_reference, tolerance=tolerance
+                )
+                self.assertTrue(
+                    comparison_passed,
+                    f"TRTLLM and Reference prefill outputs differ beyond tolerance. "
+                    f"Config: {test_case['name']}, "
+                    f"Max diff: {(out_trtllm - out_reference).abs().max().item()}",
+                )
+

 if __name__ == "__main__":
     unittest.main()
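To summarize the head-dimension logic that the updated `_create_model_components` applies above, here is a standalone restatement; the helper function itself is illustrative and not part of the test file.

```python
# Illustrative restatement of the new head-dim selection in the MLA test.
def select_head_dims(config: dict, is_prefill: bool) -> tuple[int, int]:
    """Return (head_dim, v_head_dim) for building the RadixAttention layer."""
    if is_prefill:
        # Prefill path uses the dedicated prefill head dims added to DEFAULT_CONFIG.
        return config["prefill_head_dim"], config["prefill_v_head_dim"]
    # Decode path keeps the compressed MLA layout: kv_lora_rank + rope head dim.
    return config["kv_lora_rank"] + config["qk_rope_head_dim"], config["v_head_dim"]

# With the DEFAULT_CONFIG values above: prefill -> (192, 128);
# decode -> (kv_lora_rank + qk_rope_head_dim, 512).
```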
sglang/test/few_shot_gsm8k.py
CHANGED
sglang/test/runners.py
CHANGED
@@ -505,6 +505,7 @@ class SRTRunner:
         mem_fraction_static: float = 0.65,
         trust_remote_code: bool = False,
         speculative_draft_model_path: Optional[str] = None,
+        speculative_draft_model_revision: Optional[str] = None,
         speculative_algorithm: Optional[str] = None,
         speculative_num_steps: Optional[int] = None,
         speculative_eagle_topk: Optional[int] = None,
@@ -526,6 +527,9 @@
         spec_kwargs = {}
         if speculative_draft_model_path:
             spec_kwargs["speculative_draft_model_path"] = speculative_draft_model_path
+            spec_kwargs["speculative_draft_model_revision"] = (
+                speculative_draft_model_revision
+            )
             spec_kwargs["speculative_algorithm"] = speculative_algorithm
             spec_kwargs["speculative_num_steps"] = speculative_num_steps
             spec_kwargs["speculative_eagle_topk"] = speculative_eagle_topk
sglang/test/test_cutlass_moe.py
CHANGED
@@ -9,6 +9,7 @@ from transformers import AutoConfig
 from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
 from sglang.srt.layers.moe.moe_runner.base import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import StandardTopKOutput


 # Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
@@ -21,7 +22,7 @@ def calc_diff(x, y):

 def get_model_config(tp_size: int):
     config = AutoConfig.from_pretrained(
-        "deepseek-ai/
+        "deepseek-ai/Deepseek-R1", trust_remote_code=True
     )
     E = config.n_routed_experts
     topk = config.num_experts_per_tok
@@ -152,14 +153,31 @@ def run_test(tp_size, batch_size, model_config, check=False):
         problem_sizes2,
     )

+    topk_output = StandardTopKOutput(
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        router_logits=torch.randn(
+            (batch_size, topk), device=topk_weights.device, dtype=dtype
+        ),
+    )
+
+    moe_runner_config = MoeRunnerConfig(
+        num_experts=E,
+        top_k=topk,
+        hidden_size=H,
+        intermediate_size_per_partition=I,
+        params_dtype=dtype,
+        activation="silu",
+        inplace=False,
+    )
+
     # Note: Triton expects non-transposed weights
-    moe_config = MoeRunnerConfig(inplace=False)
     triton_lambda = lambda: fused_experts(
         x,
         w1,
         w2,
-
-
+        topk_output,
+        moe_runner_config,
         use_fp8_w8a8=True,
         w1_scale=w1_scale,
         w2_scale=w2_scale,
@@ -224,8 +242,8 @@ def run_test(tp_size, batch_size, model_config, check=False):
         x,
         w1,  # Original shape
         w2,  # Original shape
-
-
+        topk_output,
+        moe_runner_config,
         use_fp8_w8a8=True,
         w1_scale=w1_scale,
         w2_scale=w2_scale,
sglang/test/test_cutlass_w4a8_moe.py
CHANGED
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

-from typing import Optional
+from typing import Literal, Optional

 import pytest
 import torch
@@ -25,7 +25,7 @@ def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Ten
     return packed_tensor.to(torch.int8)


-def pack_interleave(num_experts, ref_weight, ref_scale):
+def pack_interleave(num_experts, ref_weight, ref_scale, alignment=4):
     n, k = ref_weight.shape[1], ref_weight.shape[2]

     weight = pack_int4_values_to_int8(ref_weight.cpu()).cuda()
@@ -33,11 +33,16 @@ def pack_interleave(num_experts, ref_weight, ref_scale):
     w_q = w_q.contiguous()

     scale_interleaved = ref_scale.reshape(
-        ref_scale.shape[0],
+        ref_scale.shape[0],
+        ref_scale.shape[1],
+        (ref_scale.shape[2] // alignment),
+        alignment,
     )  # [E, N, K/4, 4]
     scale_interleaved = scale_interleaved.permute(0, 2, 1, 3)  # [E, K/4, N, 4]
     scale_interleaved = scale_interleaved.reshape(
-        ref_scale.shape[0],
+        ref_scale.shape[0],
+        ref_scale.shape[2] // alignment,
+        ref_scale.shape[1] * alignment,
     )  # [E, K/4, N*4]
     w_scale = scale_interleaved.contiguous()

@@ -48,12 +53,17 @@ def pack_interleave(num_experts, ref_weight, ref_scale):
 @pytest.mark.parametrize("N", [2048])
 @pytest.mark.parametrize("K", [7168])
 @pytest.mark.parametrize("E", [256])
-@pytest.mark.parametrize("
+@pytest.mark.parametrize("tp_size", [8])
+@pytest.mark.parametrize("use_ep_moe", [True, False])
 @pytest.mark.parametrize("topk", [8])
 @pytest.mark.parametrize("group_size", [128])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
-def test_cutlass_w4a8_moe(M, N, K, E,
-
+def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dtype):
+    if use_ep_moe:
+        local_e = E // tp_size
+    else:  # tp mode
+        local_e = E
+        N = N // tp_size

     debug = False
     if debug:
@@ -87,7 +97,10 @@ def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
     )

     w1_q, w1_scale = pack_interleave(local_e, ref_weight_1, scale_1)
-
+    if use_ep_moe:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+    else:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2, 1)

     device = "cuda"
     a_strides1 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
@@ -265,7 +278,9 @@ def ref(

     gate, fc1 = fc1.chunk(2, dim=-1)
     fc1 = fc1 * torch.nn.functional.silu(gate)
-    act = (fc1 / pre_quant_scale_2.float()).to(
+    act = torch.clamp((fc1 / pre_quant_scale_2.float()), -448.0, 448.0).to(
+        torch.float8_e4m3fn
+    )
     act = act.to(dtype)

     w2 = ref_weight_2[e_idx]
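The `alignment` parameter added to `pack_interleave` above only changes how the scale tensor is regrouped; a standalone sketch of that reshuffle, with arbitrary sizes, is shown below.

```python
# Standalone sketch of the scale interleaving in pack_interleave; the sizes
# are arbitrary (chosen so K is divisible by the alignment).
import torch

E, N, K, alignment = 2, 4, 8, 4
ref_scale = torch.arange(E * N * K, dtype=torch.float32).reshape(E, N, K)

s = ref_scale.reshape(E, N, K // alignment, alignment)  # [E, N, K/4, 4]
s = s.permute(0, 2, 1, 3)                               # [E, K/4, N, 4]
w_scale = s.reshape(E, K // alignment, N * alignment).contiguous()  # [E, K/4, N*4]
assert w_scale.shape == (E, K // alignment, N * alignment)
```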
sglang/test/test_disaggregation_utils.py
ADDED
@@ -0,0 +1,66 @@
+import time
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    CustomTestCase,
+    popen_with_error_check,
+)
+
+
+class TestDisaggregationBase(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.process_lb, cls.process_decode, cls.process_prefill = None, None, None
+        pass
+
+    @classmethod
+    def launch_lb(cls):
+        lb_command = [
+            "python3",
+            "-m",
+            "sglang_router.launch_router",
+            "--pd-disaggregation",
+            "--mini-lb",  # FIXME: remove this
+            "--prefill",
+            cls.prefill_url,
+            "--decode",
+            cls.decode_url,
+            "--host",
+            cls.base_host,
+            "--port",
+            cls.lb_port,
+        ]
+        print("Starting load balancer:", " ".join(lb_command))
+        cls.process_lb = popen_with_error_check(lb_command)
+        cls.wait_server_ready(cls.lb_url + "/health")
+
+    @classmethod
+    def wait_server_ready(cls, url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH):
+        start_time = time.perf_counter()
+        while True:
+            try:
+                response = requests.get(url)
+                if response.status_code == 200:
+                    print(f"Server {url} is ready")
+                    return
+            except Exception:
+                pass
+
+            if time.perf_counter() - start_time > timeout:
+                raise RuntimeError(f"Server {url} failed to start in {timeout}s")
+            time.sleep(1)
+
+    @classmethod
+    def tearDownClass(cls):
+        for process in [cls.process_lb, cls.process_decode, cls.process_prefill]:
+            if process:
+                try:
+                    kill_process_tree(process.pid)
+                except Exception as e:
+                    print(f"Error killing process {process.pid}: {e}")
+
+        # wait for 5 seconds
+        time.sleep(5)
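The new TestDisaggregationBase leaves the endpoint attributes to its subclasses; a hypothetical subclass could wire them up as sketched below (hosts, ports, and the server-launch step are placeholders, not taken from the package).

```python
# Hypothetical subclass sketch; URLs and ports are placeholders, and launching
# the prefill/decode servers is elided.
class TestMyDisaggregation(TestDisaggregationBase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.base_host = "127.0.0.1"
        cls.lb_port = "8100"
        cls.prefill_url = "http://127.0.0.1:30000"
        cls.decode_url = "http://127.0.0.1:30100"
        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
        # ... launch prefill/decode servers and keep their handles in
        # cls.process_prefill / cls.process_decode ...
        cls.launch_lb()
```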
sglang/test/test_utils.py
CHANGED
@@ -42,7 +42,8 @@ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-
+DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
+DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"

 # MLA test models
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
@@ -72,6 +73,10 @@ DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
 DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
+DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST = (
+    "meta-llama/Llama-3.1-8B-Instruct"
+)
+DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"

 # Other use cases
 DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
@@ -466,6 +471,25 @@ def try_cached_model(model_repo: str):
     return model_dir if model_dir else model_repo


+def popen_with_error_check(command: list[str], allow_exit: bool = False):
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    def _run_and_check():
+        stdout, stderr = process.communicate()
+
+        while process.poll() is None:
+            time.sleep(5)
+
+        if not allow_exit or process.returncode != 0:
+            raise Exception(
+                f"{command} exited with code {process.returncode}\n{stdout=}\n{stderr=}"
+            )
+
+    t = threading.Thread(target=_run_and_check)
+    t.start()
+    return process
+
+
 def popen_launch_server(
     model: str,
     base_url: str,
sglang/utils.py
CHANGED
@@ -457,6 +457,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
     NOTE: Typically, the server runs in a separate terminal.
     In this notebook, we run the server and notebook code together, so their outputs are combined.
     To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
+    To reduce the log length, we set the log level to warning for the server, the default log level is info.
     We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
     """
     )
@@ -472,6 +473,10 @@ class TypeBasedDispatcher:
     def __init__(self, mapping: List[Tuple[Type, Callable]]):
         self._mapping = mapping

+    def __iadd__(self, other: "TypeBasedDispatcher"):
+        self._mapping.extend(other._mapping)
+        return self
+
     def __call__(self, obj: Any):
         for ty, fn in self._mapping:
             if isinstance(obj, ty):
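A short sketch of what the new `TypeBasedDispatcher.__iadd__` enables; the handlers are placeholder lambdas, and only the mapping-extension behavior comes from the diff above.

```python
# Merging two dispatchers in place with the new __iadd__.
from sglang.utils import TypeBasedDispatcher

base = TypeBasedDispatcher([(int, lambda x: f"int:{x}")])
base += TypeBasedDispatcher([(str, lambda x: f"str:{x}")])

# The mapping of `base` now holds both the int and the str handler, so a
# single dispatcher instance can route both object types.
assert len(base._mapping) == 2
```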
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.5.1.post3"
+__version__ = "0.5.2"