PyPI - sglang - Versions diffs - 0.4.10__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl - Mend

sglang 0.4.10__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

sglang/bench_offline_throughput.py +20 -0
sglang/srt/configs/model_config.py +1 -0
sglang/srt/disaggregation/launch_lb.py +5 -20
sglang/srt/disaggregation/mooncake/conn.py +33 -15
sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
sglang/srt/layers/attention/utils.py +6 -1
sglang/srt/layers/moe/ep_moe/layer.py +19 -34
sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -2
sglang/srt/layers/quantization/fp8.py +52 -0
sglang/srt/layers/quantization/w8a8_int8.py +4 -1
sglang/srt/managers/cache_controller.py +35 -35
sglang/srt/managers/scheduler.py +1 -0
sglang/srt/mem_cache/hicache_storage.py +15 -6
sglang/srt/mem_cache/hiradix_cache.py +21 -4
sglang/srt/mem_cache/memory_pool.py +15 -118
sglang/srt/mem_cache/memory_pool_host.py +350 -33
sglang/srt/mem_cache/nixl/hicache_nixl.py +163 -0
sglang/srt/mem_cache/nixl/nixl_utils.py +238 -0
sglang/srt/mem_cache/nixl/test_hicache_nixl_storage.py +216 -0
sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +8 -2
sglang/srt/model_executor/cuda_graph_runner.py +25 -1
sglang/srt/model_executor/model_runner.py +8 -1
sglang/srt/model_loader/weight_utils.py +2 -0
sglang/srt/models/deepseek_v2.py +5 -6
sglang/srt/models/glm4_moe.py +3 -3
sglang/srt/models/step3_vl.py +0 -3
sglang/srt/server_args.py +40 -6
sglang/srt/utils.py +1 -0
sglang/test/attention/test_trtllm_mla_backend.py +945 -0
sglang/version.py +1 -1
{sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/METADATA +1 -1
{sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/RECORD +35 -30
{sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/WHEEL +0 -0
{sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.10.dist-info → sglang-0.4.10.post1.dist-info}/top_level.txt +0 -0

sglang/srt/server_args.py CHANGED Viewed

@@ -24,6 +24,7 @@ import tempfile
 from typing import List, Literal, Optional, Union
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -197,7 +198,8 @@ class ServerArgs:
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
-    hicache_io_backend: str = ""
+    hicache_io_backend: str = "kernel"
+    hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
     # Double Sparsity
@@ -215,6 +217,7 @@ class ServerArgs:
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_profile_cuda_graph: bool = False
+    enable_cudagraph_gc: bool = False
     enable_nccl_nvls: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
@@ -401,6 +404,22 @@ class ServerArgs:
             )
             self.page_size = 128
+        if self.attention_backend == "trtllm_mla":
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+            if self.page_size not in [32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+            if self.speculative_algorithm is not None:
+                raise ValueError(
+                    "trtllm_mla backend does not support speculative decoding yet."
+                )
         # Set page size
         if self.page_size is None:
             self.page_size = 1
@@ -436,10 +455,11 @@ class ServerArgs:
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
-        if self.enable_flashinfer_trtllm_moe:
-            assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
-            logger.warning(f"Flashinfer TRTLLM MoE is enabled.")
+            if self.enable_ep_moe:
+                self.ep_size = self.tp_size
+                logger.warning(
+                    f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+                )
         # DeepEP MoE
         if self.enable_deepep_moe:
@@ -1219,6 +1239,7 @@ class ServerArgs:
                 "torch_native",
                 "ascend",
                 "triton",
+                "trtllm_mla",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
@@ -1467,10 +1488,18 @@ class ServerArgs:
             default=ServerArgs.hicache_io_backend,
             help="The IO backend for KV cache transfer between CPU and GPU",
         )
+        parser.add_argument(
+            "--hicache-mem-layout",
+            type=str,
+            choices=["layer_first", "page_first"],
+            default=ServerArgs.hicache_mem_layout,
+            help="The layout of host memory pool for hierarchical cache.",
+        )
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
-            choices=["file", "mooncake", "hf3fs"],
+            choices=["file", "mooncake", "hf3fs", "nixl"],
             default=ServerArgs.hicache_storage_backend,
             help="The storage backend for hierarchical KV cache.",
         )
@@ -1545,6 +1574,11 @@ class ServerArgs:
             action="store_true",
             help="Enable profiling of cuda graph capture.",
         )
+        parser.add_argument(
+            "--enable-cudagraph-gc",
+            action="store_true",
+            help="Enable garbage collection during CUDA graph capture. If disabled (default), GC is frozen during capture to speed up the process.",
+        )
         parser.add_argument(
             "--enable-nccl-nvls",
             action="store_true",

sglang/srt/utils.py CHANGED Viewed

@@ -2344,6 +2344,7 @@ def is_fa3_default_architecture(hf_config):
         "Qwen3ForCausalLM",
         "Qwen3MoeForCausalLM",
         "Glm4MoeForCausalLM",
+        "Step3VLForConditionalGeneration",
     }
     return architectures[0] in default_archs

sglang 0.4.10__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl

sglang 0.4.10__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl