PyPI - sglang - Versions diffs - 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

sglang 0.5.0rc1py3-none-any.whl → 0.5.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (203) hide show

sglang/bench_one_batch.py +0 -7
sglang/bench_one_batch_server.py +7 -2
sglang/bench_serving.py +3 -3
sglang/eval/llama3_eval.py +0 -1
sglang/srt/configs/model_config.py +25 -9
sglang/srt/configs/update_config.py +40 -5
sglang/srt/constrained/xgrammar_backend.py +23 -11
sglang/srt/conversation.py +2 -15
sglang/srt/disaggregation/ascend/conn.py +1 -3
sglang/srt/disaggregation/base/conn.py +1 -0
sglang/srt/disaggregation/decode.py +1 -2
sglang/srt/disaggregation/launch_lb.py +7 -1
sglang/srt/disaggregation/mini_lb.py +11 -5
sglang/srt/disaggregation/mooncake/conn.py +141 -47
sglang/srt/disaggregation/prefill.py +261 -5
sglang/srt/disaggregation/utils.py +2 -1
sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
sglang/srt/distributed/device_communicators/pynccl.py +68 -18
sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
sglang/srt/distributed/naive_distributed.py +112 -0
sglang/srt/distributed/parallel_state.py +90 -4
sglang/srt/entrypoints/context.py +20 -1
sglang/srt/entrypoints/engine.py +29 -4
sglang/srt/entrypoints/http_server.py +76 -0
sglang/srt/entrypoints/openai/protocol.py +4 -2
sglang/srt/entrypoints/openai/serving_chat.py +23 -6
sglang/srt/entrypoints/openai/serving_completions.py +10 -1
sglang/srt/entrypoints/openai/serving_responses.py +2 -2
sglang/srt/eplb/expert_distribution.py +2 -3
sglang/srt/function_call/deepseekv3_detector.py +1 -1
sglang/srt/hf_transformers_utils.py +24 -0
sglang/srt/host_shared_memory.py +83 -0
sglang/srt/layers/attention/ascend_backend.py +132 -22
sglang/srt/layers/attention/flashattention_backend.py +24 -17
sglang/srt/layers/attention/flashinfer_backend.py +14 -3
sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
sglang/srt/layers/attention/triton_backend.py +109 -73
sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
sglang/srt/layers/attention/utils.py +94 -15
sglang/srt/layers/attention/vision.py +40 -13
sglang/srt/layers/attention/vision_utils.py +65 -0
sglang/srt/layers/communicator.py +58 -10
sglang/srt/layers/dp_attention.py +137 -27
sglang/srt/layers/elementwise.py +94 -0
sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
sglang/srt/layers/layernorm.py +8 -1
sglang/srt/layers/linear.py +24 -0
sglang/srt/layers/logits_processor.py +16 -18
sglang/srt/layers/moe/__init__.py +31 -0
sglang/srt/layers/moe/ep_moe/layer.py +37 -33
sglang/srt/layers/moe/fused_moe_native.py +14 -25
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
sglang/srt/layers/moe/moe_runner/base.py +13 -0
sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
sglang/srt/layers/moe/router.py +15 -9
sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
sglang/srt/layers/moe/topk.py +167 -83
sglang/srt/layers/moe/utils.py +159 -18
sglang/srt/layers/multimodal.py +156 -40
sglang/srt/layers/quantization/__init__.py +18 -46
sglang/srt/layers/quantization/awq.py +22 -23
sglang/srt/layers/quantization/base_config.py +2 -6
sglang/srt/layers/quantization/blockwise_int8.py +4 -12
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
sglang/srt/layers/quantization/fp8.py +127 -119
sglang/srt/layers/quantization/fp8_kernel.py +195 -24
sglang/srt/layers/quantization/fp8_utils.py +34 -9
sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
sglang/srt/layers/quantization/gptq.py +17 -21
sglang/srt/layers/quantization/marlin_utils.py +26 -8
sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
sglang/srt/layers/quantization/modelopt_quant.py +217 -98
sglang/srt/layers/quantization/moe_wna16.py +10 -15
sglang/srt/layers/quantization/mxfp4.py +222 -39
sglang/srt/layers/quantization/quark/quark.py +390 -0
sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
sglang/srt/layers/quantization/unquant.py +34 -70
sglang/srt/layers/quantization/utils.py +77 -2
sglang/srt/layers/quantization/w4afp8.py +7 -8
sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
sglang/srt/layers/quantization/w8a8_int8.py +5 -13
sglang/srt/layers/radix_attention.py +6 -0
sglang/srt/layers/rotary_embedding.py +1 -0
sglang/srt/layers/sampler.py +5 -2
sglang/srt/lora/layers.py +6 -2
sglang/srt/lora/lora_manager.py +21 -22
sglang/srt/lora/lora_registry.py +3 -3
sglang/srt/lora/mem_pool.py +26 -24
sglang/srt/lora/utils.py +10 -12
sglang/srt/managers/cache_controller.py +80 -19
sglang/srt/managers/detokenizer_manager.py +10 -2
sglang/srt/managers/io_struct.py +23 -0
sglang/srt/managers/mm_utils.py +1 -1
sglang/srt/managers/schedule_batch.py +22 -48
sglang/srt/managers/scheduler.py +28 -20
sglang/srt/managers/session_controller.py +1 -1
sglang/srt/managers/template_manager.py +7 -5
sglang/srt/managers/tokenizer_manager.py +88 -39
sglang/srt/managers/tp_worker.py +1 -0
sglang/srt/managers/utils.py +59 -1
sglang/srt/mem_cache/allocator.py +10 -157
sglang/srt/mem_cache/allocator_ascend.py +147 -0
sglang/srt/mem_cache/chunk_cache.py +1 -1
sglang/srt/mem_cache/hicache_storage.py +14 -4
sglang/srt/mem_cache/memory_pool.py +3 -3
sglang/srt/mem_cache/memory_pool_host.py +35 -2
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
sglang/srt/model_executor/cuda_graph_runner.py +33 -33
sglang/srt/model_executor/forward_batch_info.py +11 -10
sglang/srt/model_executor/model_runner.py +93 -78
sglang/srt/model_executor/npu_graph_runner.py +94 -0
sglang/srt/model_loader/loader.py +24 -6
sglang/srt/models/dbrx.py +12 -6
sglang/srt/models/deepseek.py +2 -1
sglang/srt/models/deepseek_nextn.py +5 -2
sglang/srt/models/deepseek_v2.py +226 -223
sglang/srt/models/ernie4.py +2 -2
sglang/srt/models/glm4_moe.py +27 -65
sglang/srt/models/glm4_moe_nextn.py +2 -1
sglang/srt/models/glm4v.py +52 -1
sglang/srt/models/glm4v_moe.py +8 -11
sglang/srt/models/gpt_oss.py +41 -76
sglang/srt/models/granitemoe.py +0 -1
sglang/srt/models/grok.py +376 -48
sglang/srt/models/interns1.py +12 -47
sglang/srt/models/internvl.py +6 -51
sglang/srt/models/llama.py +10 -2
sglang/srt/models/llama4.py +18 -7
sglang/srt/models/minicpm3.py +0 -1
sglang/srt/models/mixtral.py +0 -2
sglang/srt/models/nemotron_nas.py +435 -0
sglang/srt/models/olmoe.py +0 -1
sglang/srt/models/phi4mm.py +3 -21
sglang/srt/models/qwen2.py +2 -2
sglang/srt/models/qwen2_5_vl.py +2 -0
sglang/srt/models/qwen2_moe.py +23 -23
sglang/srt/models/qwen3.py +2 -2
sglang/srt/models/qwen3_classification.py +84 -0
sglang/srt/models/qwen3_moe.py +27 -43
sglang/srt/models/step3_vl.py +8 -3
sglang/srt/models/xverse_moe.py +11 -5
sglang/srt/multimodal/processors/base_processor.py +3 -3
sglang/srt/multimodal/processors/internvl.py +7 -2
sglang/srt/multimodal/processors/llava.py +11 -7
sglang/srt/offloader.py +433 -0
sglang/srt/operations.py +22 -2
sglang/srt/reasoning_parser.py +4 -3
sglang/srt/sampling/sampling_batch_info.py +7 -4
sglang/srt/server_args.py +264 -105
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
sglang/srt/speculative/eagle_utils.py +36 -13
sglang/srt/speculative/eagle_worker.py +56 -3
sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
sglang/srt/two_batch_overlap.py +20 -19
sglang/srt/utils.py +68 -70
sglang/test/runners.py +8 -5
sglang/test/test_block_fp8.py +5 -6
sglang/test/test_block_fp8_ep.py +13 -19
sglang/test/test_cutlass_moe.py +4 -6
sglang/test/test_cutlass_w4a8_moe.py +4 -3
sglang/test/test_fp4_moe.py +4 -3
sglang/test/test_marlin_moe.py +1 -1
sglang/test/test_marlin_utils.py +1 -1
sglang/test/test_utils.py +7 -0
sglang/utils.py +0 -1
sglang/version.py +1 -1
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
sglang/srt/layers/quantization/fp4.py +0 -557
sglang/srt/layers/quantization/scalar_type.py +0 -352
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0

sglang/srt/lora/mem_pool.py CHANGED Viewed

@@ -13,9 +13,9 @@ from sglang.srt.lora.utils import (
     ROW_PARALLELISM_LINEAR_LORA_NAMES,
     LoRAType,
     get_hidden_dim,
-    get_normalized_lora_weight_names,
+    get_normalized_target_modules,
     get_stacked_multiply,
-    get_weight_name,
+    get_target_module_name,
 )
 logger = logging.getLogger(__name__)
@@ -52,7 +52,7 @@ class LoRAMemoryPool:
         tp_size: int,
         tp_rank: int,
         max_lora_rank: int,
-        lora_weight_names: Set[str],
+        target_modules: Set[str],
         base_model: torch.nn.Module,
     ):
         self.base_hf_config: AutoConfig = base_hf_config
@@ -62,7 +62,7 @@ class LoRAMemoryPool:
         self.tp_size: int = tp_size
         self.tp_rank: int = tp_rank
         self.max_lora_rank: int = max_lora_rank
-        self.lora_weight_names: Set[str] = lora_weight_names
+        self.target_modules: Set[str] = target_modules
         # Both A_buffer and B_buffer maps lora weight names to its buffer space.
         # A_buffer contains num_layer number of row-major tensors with shape
@@ -95,8 +95,8 @@ class LoRAMemoryPool:
             """
             if config.r > self.max_lora_rank:
                 return False
-            weights = get_normalized_lora_weight_names(config.target_modules)
-            return weights.issubset(self.lora_weight_names)
+            target_module_names = get_normalized_target_modules(config.target_modules)
+            return target_module_names.issubset(self.target_modules)
         if isinstance(config, LoRAConfig):
             return _can_support(config)
@@ -139,10 +139,10 @@ class LoRAMemoryPool:
         def init_buffer(
             buffer: Dict[str, List[torch.Tensor]],
-            lora_weight_names: Set[str],
+            target_modules: Set[str],
             get_lora_shape_fn: Callable[[str, torch.nn.Module, int], Tuple[int]],
         ):
-            for module_name in lora_weight_names:
+            for module_name in target_modules:
                 lora_shape = get_lora_shape_fn(
                     module_name, base_model, self.max_lora_rank
                 )
@@ -157,13 +157,13 @@ class LoRAMemoryPool:
         init_buffer(
             self.A_buffer,
-            self.lora_weight_names,
+            self.target_modules,
             self.get_lora_A_shape,
         )
         init_buffer(
             self.B_buffer,
-            self.lora_weight_names,
+            self.target_modules,
             self.get_lora_B_shape,
         )
@@ -242,32 +242,34 @@ class LoRAMemoryPool:
         for layer_id in range(self.num_layer):
             layer_weights = lora_adapter.layers[layer_id].weights
             temp_A_buffer: Dict[str, Optional[torch.Tensor]] = {
-                weight_name: None for weight_name in self.A_buffer
+                target_module: None for target_module in self.A_buffer
             }
             temp_B_buffer: Dict[str, Optional[torch.Tensor]] = {
-                weight_name: None for weight_name in self.B_buffer
+                target_module: None for target_module in self.B_buffer
             }
             for name, weights in layer_weights.items():
-                lora_weight_name = get_weight_name(name, self.lora_weight_names)
+                target_module = get_target_module_name(name, self.target_modules)
                 if "lora_A" in name:
-                    temp_A_buffer[lora_weight_name] = weights
+                    temp_A_buffer[target_module] = weights
                 else:
-                    temp_B_buffer[lora_weight_name] = weights
+                    temp_B_buffer[target_module] = weights
             if self.tp_size > 1:
                 cur_layer_modules = lora_modules[layer_id]
                 for module_name, module in cur_layer_modules.items():
-                    weight_name = get_weight_name(module_name, self.lora_weight_names)
+                    target_module = get_target_module_name(
+                        module_name, self.target_modules
+                    )
-                    if temp_A_buffer[weight_name] is None:
+                    if temp_A_buffer[target_module] is None:
                         # Skip weight slicing if the weight is not present in the adapter
                         continue
-                    temp_A_buffer[weight_name] = module.slice_lora_a_weights(
-                        temp_A_buffer[weight_name], self.tp_rank
+                    temp_A_buffer[target_module] = module.slice_lora_a_weights(
+                        temp_A_buffer[target_module], self.tp_rank
                     )
-                    temp_B_buffer[weight_name] = module.slice_lora_b_weights(
-                        temp_B_buffer[weight_name], self.tp_rank
+                    temp_B_buffer[target_module] = module.slice_lora_b_weights(
+                        temp_B_buffer[target_module], self.tp_rank
                     )
             for name, weights in temp_A_buffer.items():
@@ -282,12 +284,12 @@ class LoRAMemoryPool:
                 load_lora_weight_tensor(buffer_view, weights)
     def get_tensor(
-        self, weight_name: str, layer_id: int, lora_type: LoRAType
+        self, target_module: str, layer_id: int, lora_type: LoRAType
     ) -> torch.Tensor:
         if lora_type == LoRAType.LORA_A:
-            return self.A_buffer[weight_name][layer_id]
+            return self.A_buffer[target_module][layer_id]
-        return self.B_buffer[weight_name][layer_id]
+        return self.B_buffer[target_module][layer_id]
     def get_buffer_id(self, lora_uid: str):
         return self.uid_to_buffer_id[lora_uid]

sglang/srt/lora/utils.py CHANGED Viewed

@@ -84,7 +84,7 @@ def get_hidden_dim(
             raise NotImplementedError()
-def get_normalized_lora_weight_names(
+def get_normalized_target_modules(
     target_modules: Iterable[str],
 ) -> set[str]:
     """
@@ -100,8 +100,8 @@ def get_normalized_lora_weight_names(
     result = set()
     for name in target_modules:
-        weight_name = params_mapping.get(name, name)
-        result.add(weight_name)
+        normalized_name = params_mapping.get(name, name)
+        result.add(normalized_name)
     return result
@@ -116,20 +116,18 @@ def get_stacked_multiply(module_name: str) -> int:
     return stacked_rank[module_name] if module_name in stacked_rank else 1
-def get_weight_name(
-    target_name: str, lora_weight_names: Tuple[Set[str]]
-) -> Optional[str]:
+def get_target_module_name(full_module_name: str, target_modules: Set[str]) -> str:
     """
-    Get the weight name in lora_weight_names that can match target_name.
+    Get the target module name in target_modules that can match full_module_name.
-    If there is a weight name in lora_weight_names that can match target_name, return this name
+    If there is a target module name in target_modules that can match full_module_name, return this name
     Else raise ValueError.
     """
-    for weight_name in lora_weight_names:
-        if weight_name in target_name:
-            return weight_name
+    for target_module in target_modules:
+        if target_module in full_module_name:
+            return target_module
     raise ValueError(
-        f"Cannot find weight name for {target_name} in {lora_weight_names}"
+        f"Cannot find target module name for {full_module_name} in {target_modules}"
     )

sglang/srt/managers/cache_controller.py CHANGED Viewed

@@ -26,6 +26,8 @@ if TYPE_CHECKING:
     from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
     from sglang.srt.mem_cache.memory_pool_host import HostKVCache
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.mem_cache.memory_pool_host import MLATokenToKVPoolHost
 logger = logging.getLogger(__name__)
@@ -238,13 +240,14 @@ class HiCacheController:
         self.io_backend = io_backend
         self.enable_storage = False
+        self.is_mla = isinstance(self.mem_pool_host, MLATokenToKVPoolHost)
         # todo: move backend initialization to storage backend module
         if storage_backend is not None:
             self.storage_backend_type = storage_backend
             from sglang.srt.mem_cache.hicache_storage import HiCacheFile, get_hash_str
             if storage_backend == "file":
-                self.storage_backend = HiCacheFile()
+                self.storage_backend = HiCacheFile(is_mla=self.is_mla)
                 self.get_hash_str = get_hash_str
             elif storage_backend == "nixl":
                 from sglang.srt.mem_cache.storage.nixl.hicache_nixl import HiCacheNixl
@@ -257,23 +260,26 @@ class HiCacheController:
                     get_hash_str_mooncake,
                 )
-                self.storage_backend = MooncakeStore()
+                self.storage_backend = MooncakeStore(is_mla=self.is_mla)
                 self.get_hash_str = get_hash_str_mooncake
                 self.storage_backend.register_buffer(self.mem_pool_host.kv_buffer)
                 assert self.mem_pool_host.layout == "page_first"
             elif storage_backend == "hf3fs":
-                from sglang.srt.distributed import get_tensor_model_parallel_rank
                 from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import (
                     HiCacheHF3FS,
                 )
-                rank = get_tensor_model_parallel_rank()
-                bytes_per_page = (
-                    mem_pool_host.get_size_per_token() * mem_pool_host.page_size
-                )
+                if self.mem_pool_host.layout == "page_first":
+                    bytes_per_page = (
+                        mem_pool_host.get_ksize_per_token() * mem_pool_host.page_size
+                    )
+                elif self.mem_pool_host.layout == "layer_first":
+                    bytes_per_page = (
+                        mem_pool_host.get_size_per_token() * mem_pool_host.page_size
+                    )
                 dtype = mem_pool_host.dtype
                 self.storage_backend = HiCacheHF3FS.from_env_config(
-                    rank, bytes_per_page, dtype
+                    bytes_per_page, dtype
                 )
                 self.get_hash_str = get_hash_str
             else:
@@ -296,6 +302,9 @@ class HiCacheController:
                 self.prefetch_tp_group = torch.distributed.new_group(
                     group_ranks, backend="gloo"
                 )
+                self.prefetch_io_tp_group = torch.distributed.new_group(
+                    group_ranks, backend="gloo"
+                )
                 self.backup_tp_group = torch.distributed.new_group(
                     group_ranks, backend="gloo"
                 )
@@ -391,6 +400,15 @@ class HiCacheController:
             self.prefetch_thread.start()
             self.backup_thread.start()
+    @property
+    def backup_skip(self):
+        return (
+            self.is_mla
+            and get_tensor_model_parallel_rank() != 0
+            # todo: only support file and mooncake
+            and self.storage_backend_type in ["file", "mooncake"]
+        )
     def write(
         self,
         device_indices: torch.Tensor,
@@ -552,13 +570,34 @@ class HiCacheController:
         operation.mark_done()
         return operation.completed_tokens, operation.hash_value
+    def zerocopy_page_transfer(self, operation, batch_size=8):
+        hashes, dsts = self.mem_pool_host.get_buffer_with_hash(
+            operation.hash_value, operation.host_indices
+        )
+        for i in range(0, len(hashes), batch_size):
+            page_hashes = hashes[i : i + batch_size]
+            page_dsts = dsts[i : i + batch_size]
+            page_data = self.storage_backend.batch_get(page_hashes, page_dsts)
+            if page_data is None:
+                logger.warning(
+                    f"Prefetch operation {operation.request_id} failed to retrieve page {page_hashes}."
+                )
+                break
+            completed_tokens = operation.completed_tokens
+            if operation.increment(self.page_size * len(page_hashes)):
+                for i in range(len(page_hashes)):
+                    completed_tokens += self.page_size
+            else:
+                break
     def generic_page_transfer(self, operation, batch_size=8):
         for i in range(0, len(operation.hash_value), batch_size):
             page_hashes = operation.hash_value[i : i + batch_size]
             # todo: zero copy
-            dummy_page_dst = [self.mem_pool_host.get_dummy_flat_data_page()] * len(
-                page_hashes
-            )
+            dummy_page_dst = [
+                self.mem_pool_host.get_dummy_flat_data_page()
+                for _ in range(len(page_hashes))
+            ]
             page_data = self.storage_backend.batch_get(page_hashes, dummy_page_dst)
             if page_data is None:
                 logger.warning(
@@ -596,13 +635,16 @@ class HiCacheController:
                 if self.is_mooncake_backend():
                     self.mooncake_page_transfer(operation)
                 elif self.storage_backend_type == "hf3fs":
-                    self.generic_page_transfer(operation, batch_size=128)
+                    if self.mem_pool_host.layout == "page_first":
+                        self.zerocopy_page_transfer(operation, batch_size=128)
+                    elif self.mem_pool_host.layout == "layer_first":
+                        self.generic_page_transfer(operation, batch_size=128)
                 else:
                     self.generic_page_transfer(operation)
                 if self.tp_world_size > 1:
                     # to ensure all TP workers release the host memory at the same time
-                    torch.distributed.barrier(group=self.prefetch_tp_group)
+                    torch.distributed.barrier(group=self.prefetch_io_tp_group)
                 # operation terminated by controller, release pre-allocated memory
                 self.mem_pool_host.free(
                     operation.host_indices[operation.completed_tokens :]
@@ -713,6 +755,19 @@ class HiCacheController:
         self.backup_queue.put(operation)
         return operation.id
+    def zerocopy_page_backup(self, operation, batch_size=8):
+        hashes, dsts = self.mem_pool_host.get_buffer_with_hash(
+            operation.hash_value, operation.host_indices
+        )
+        for i in range(0, len(hashes), batch_size):
+            page_hashes = hashes[i : i + batch_size]
+            page_data = dsts[i : i + batch_size]
+            success = self.storage_backend.batch_set(page_hashes, page_data)
+            if not success:
+                logger.warning(f"Failed to write page {page_hashes} to storage.")
+                break
+            operation.completed_tokens += self.page_size * len(page_hashes)
     def generic_page_backup(self, operation, batch_size=8):
         for i in range(0, len(operation.hash_value), batch_size):
             page_hashes = operation.hash_value[i : i + batch_size]
@@ -764,14 +819,20 @@ class HiCacheController:
                 if operation is None:
                     continue
-                if self.is_mooncake_backend():
-                    self.mooncake_page_backup(operation)
-                elif self.storage_backend_type == "hf3fs":
-                    self.generic_page_backup(operation, batch_size=128)
+                if not self.backup_skip:
+                    if self.is_mooncake_backend():
+                        self.mooncake_page_backup(operation)
+                    elif self.storage_backend_type == "hf3fs":
+                        if self.mem_pool_host.layout == "page_first":
+                            self.zerocopy_page_backup(operation, batch_size=128)
+                        elif self.mem_pool_host.layout == "layer_first":
+                            self.generic_page_backup(operation, batch_size=128)
+                    else:
+                        self.generic_page_backup(operation)
+                    min_completed_tokens = operation.completed_tokens
                 else:
-                    self.generic_page_backup(operation)
+                    min_completed_tokens = len(operation.token_ids)
-                min_completed_tokens = operation.completed_tokens
                 if self.tp_world_size > 1:
                     completed_tokens_tensor = torch.tensor(
                         min_completed_tokens, dtype=torch.int

sglang/srt/managers/detokenizer_manager.py CHANGED Viewed

@@ -31,10 +31,12 @@ from sglang.srt.managers.io_struct import (
     BatchMultimodalOut,
     BatchStrOut,
     BatchTokenIDOut,
+    FreezeGCReq,
 )
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     configure_logger,
+    freeze_gc,
     get_zmq_socket,
     kill_itself_when_parent_died,
 )
@@ -100,6 +102,7 @@ class DetokenizerManager:
                 (BatchEmbeddingOut, self.handle_batch_embedding_out),
                 (BatchTokenIDOut, self.handle_batch_token_id_out),
                 (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
+                (FreezeGCReq, self.handle_freeze_gc_req),
             ]
         )
@@ -108,7 +111,8 @@ class DetokenizerManager:
         while True:
             recv_obj = self.recv_from_scheduler.recv_pyobj()
             output = self._request_dispatcher(recv_obj)
-            self.send_to_tokenizer.send_pyobj(output)
+            if output is not None:
+                self.send_to_tokenizer.send_pyobj(output)
     def trim_matched_stop(
         self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
@@ -216,7 +220,7 @@ class DetokenizerManager:
             rids=recv_obj.rids,
             finished_reasons=recv_obj.finished_reasons,
             output_strs=output_strs,
-            output_ids=recv_obj.output_ids,
+            output_ids=recv_obj.decode_ids,
             prompt_tokens=recv_obj.prompt_tokens,
             completion_tokens=recv_obj.completion_tokens,
             cached_tokens=recv_obj.cached_tokens,
@@ -247,6 +251,10 @@ class DetokenizerManager:
             cached_tokens=recv_obj.cached_tokens,
         )
+    def handle_freeze_gc_req(self, recv_req: FreezeGCReq):
+        freeze_gc("Detokenizer Manager")
+        return None
 class LimitedCapacityDict(OrderedDict):
     def __init__(self, capacity: int, *args, **kwargs):

sglang/srt/managers/io_struct.py CHANGED Viewed

@@ -612,6 +612,8 @@ class EmbeddingReqInput:
             if self.sampling_params is None:
                 self.sampling_params = [{}] * self.batch_size
+            elif isinstance(self.sampling_params, dict):
+                self.sampling_params = [self.sampling_params] * self.batch_size
             for i in range(self.batch_size):
                 self.sampling_params[i]["max_new_tokens"] = 0
@@ -660,6 +662,8 @@ class TokenizedEmbeddingReqInput:
     token_type_ids: List[int]
     # Dummy sampling params for compatibility
     sampling_params: SamplingParams
+    # For data parallel rank routing
+    data_parallel_rank: Optional[int] = None
     # For dp balance
     dp_balance_id: int = -1
@@ -798,6 +802,8 @@ class UpdateWeightFromDiskReqInput:
     load_format: Optional[str] = None
     # Whether to abort all requests before updating weights
     abort_all_requests: bool = False
+    # Optional: Update weight version along with weights
+    weight_version: Optional[str] = None
 @dataclass
@@ -819,6 +825,8 @@ class UpdateWeightsFromDistributedReqInput:
     flush_cache: bool = True
     # Whether to abort all requests before updating weights
     abort_all_requests: bool = False
+    # Optional: Update weight version along with weights
+    weight_version: Optional[str] = None
 @dataclass
@@ -842,6 +850,8 @@ class UpdateWeightsFromTensorReqInput:
     flush_cache: bool = True
     # Whether to abort all requests before updating weights
     abort_all_requests: bool = False
+    # Optional: Update weight version along with weights
+    weight_version: Optional[str] = None
 @dataclass
@@ -872,6 +882,14 @@ class InitWeightsUpdateGroupReqOutput:
     message: str
+@dataclass
+class UpdateWeightVersionReqInput:
+    # The new weight version
+    new_version: str
+    # Whether to abort all running requests before updating
+    abort_all_requests: bool = True
 @dataclass
 class GetWeightsByNameReqInput:
     name: str
@@ -987,6 +1005,11 @@ class ProfileReqOutput:
     message: str
+@dataclass
+class FreezeGCReq:
+    pass
 @dataclass
 class ConfigureLoggingReq:
     log_requests: Optional[bool] = None

sglang/srt/managers/mm_utils.py CHANGED Viewed

@@ -560,7 +560,7 @@ def embed_mm_inputs(
                 ]
                 items_size[i + 1] = len(mm_items)
                 items_offsets.append(
-                    flatten_nested_list([item.offsets for item in mm_inputs.mm_items])
+                    flatten_nested_list([item.offsets for item in mm_items])
                 )
             items_size = torch.cumsum(items_size, dim=0).tolist()

sglang/srt/managers/schedule_batch.py CHANGED Viewed

@@ -52,6 +52,7 @@ from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
     ScheduleBatchDisaggregationDecodeMixin,
 )
 from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
+from sglang.srt.layers.moe import is_tbo_enabled
 from sglang.srt.mem_cache.allocator import (
     BaseTokenToKVPoolAllocator,
     SWATokenToKVPoolAllocator,
@@ -83,19 +84,13 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "chunked_prefill_size",
     "device",
     "disable_chunked_prefix_cache",
+    "disable_flashinfer_cutlass_moe_fp4_allgather",
     "disable_radix_cache",
-    "enable_dp_attention",
-    "enable_two_batch_overlap",
-    "tbo_token_distribution_threshold",
     "enable_dp_lm_head",
-    "moe_a2a_backend",
-    "deepep_mode",
-    "enable_flashinfer_cutlass_moe",
-    "enable_flashinfer_trtllm_moe",
+    "flashinfer_mxfp4_moe_precision",
     "enable_flashinfer_allreduce_fusion",
     "moe_dense_tp_size",
     "ep_dispatch_algorithm",
-    "deepep_config",
     "ep_num_redundant_experts",
     "enable_nan_detection",
     "flashinfer_mla_disable_ragged",
@@ -108,11 +103,11 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "triton_attention_reduce_in_fp32",
     "num_reserved_decode_tokens",
     "weight_loader_disable_mmap",
-    "enable_triton_kernel_moe",
-    "enable_flashinfer_mxfp4_moe",
     "enable_multimodal",
     "enable_symm_mem",
     "quantization",
+    "enable_custom_logit_processor",
+    "disaggregation_mode",
 ]
 # Put some global args for easy access
@@ -909,12 +904,12 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     spec_algorithm: SpeculativeAlgorithm = None
     spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None
-    # Enable custom logit processor
-    enable_custom_logit_processor: bool = False
     # Whether to return hidden states
     return_hidden_states: bool = False
+    # Whether this batch is prefill-only (no token generation needed)
+    is_prefill_only: bool = False
     # hicache pointer for synchronizing data loading from CPU to GPU
     hicache_consumer_index: int = 0
@@ -928,7 +923,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         model_config: ModelConfig,
         enable_overlap: bool,
         spec_algorithm: SpeculativeAlgorithm,
-        enable_custom_logit_processor: bool,
         chunked_req: Optional[Req] = None,
     ):
         return_logprob = any(req.return_logprob for req in reqs)
@@ -955,8 +949,10 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             has_grammar=any(req.grammar for req in reqs),
             device=req_to_token_pool.device,
             spec_algorithm=spec_algorithm,
-            enable_custom_logit_processor=enable_custom_logit_processor,
             return_hidden_states=any(req.return_hidden_states for req in reqs),
+            is_prefill_only=all(
+                req.sampling_params.max_new_tokens == 0 for req in reqs
+            ),
             chunked_req=chunked_req,
         )
@@ -1009,6 +1005,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         extend_num_tokens: int,
         backup_state: bool = False,
     ):
+        # Over estimate the number of tokens: assume each request needs a new page.
         num_tokens = (
             extend_num_tokens
             + len(seq_lens) * self.token_to_kv_pool_allocator.page_size
@@ -1041,8 +1038,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         last_loc: torch.Tensor,
         backup_state: bool = False,
     ):
+        # Over estimate the number of tokens: assume each request needs a new page.
         num_tokens = len(seq_lens) * self.token_to_kv_pool_allocator.page_size
         self._evict_tree_cache_if_needed(num_tokens)
         if backup_state:
@@ -1721,38 +1718,18 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             extend_prefix_lens = self.prefix_lens
             extend_logprob_start_lens = self.extend_logprob_start_lens
-        if self.forward_mode.is_decode_or_idle():
-            attention_backend_str = global_server_args_dict["decode_attention_backend"]
-        else:
-            attention_backend_str = global_server_args_dict["prefill_attention_backend"]
-        # Create seq_lens_cpu when needed
-        if (
-            attention_backend_str
-            in [
-                "fa3",
-                "flashinfer",
-                "flashmla",
-                "cutlass_mla",
-                "ascend",
-                "trtllm_mha",
-                "aiter",
-            ]
-            or global_server_args_dict["enable_two_batch_overlap"]
-        ):
-            seq_lens_cpu = (
-                seq_lens_cpu_cache
-                if seq_lens_cpu_cache is not None
-                else self.seq_lens.cpu()
-            )
-        else:
-            seq_lens_cpu = None
         if self.sampling_info:
             if self.has_grammar:
                 self.sampling_info.grammars = [req.grammar for req in self.reqs]
             else:
                 self.sampling_info.grammars = None
+        seq_lens_cpu = (
+            seq_lens_cpu_cache
+            if seq_lens_cpu_cache is not None
+            else self.seq_lens.cpu()
+        )
         global bid
         bid += 1
         return ModelWorkerBatch(
@@ -1815,18 +1792,15 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             return_logprob=self.return_logprob,
             decoding_reqs=self.decoding_reqs,
             spec_algorithm=self.spec_algorithm,
-            enable_custom_logit_processor=self.enable_custom_logit_processor,
             global_num_tokens=self.global_num_tokens,
             global_num_tokens_for_logprob=self.global_num_tokens_for_logprob,
             can_run_dp_cuda_graph=self.can_run_dp_cuda_graph,
             is_extend_in_batch=self.is_extend_in_batch,
+            is_prefill_only=self.is_prefill_only,
         )
-    def _evict_tree_cache_if_needed(
-        self,
-        num_tokens: int,
-    ) -> None:
-        if isinstance(self.tree_cache, SWAChunkCache):
+    def _evict_tree_cache_if_needed(self, num_tokens: int):
+        if isinstance(self.tree_cache, (SWAChunkCache, ChunkCache)):
             return
         if self.is_hybrid:

sglang 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl

sglang 0.5.0rc1py3-none-any.whl → 0.5.1py3-none-any.whl