sglang 0.5.0rc2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -6
- sglang/bench_one_batch_server.py +7 -2
- sglang/bench_serving.py +3 -3
- sglang/eval/llama3_eval.py +0 -1
- sglang/srt/configs/model_config.py +24 -9
- sglang/srt/configs/update_config.py +40 -5
- sglang/srt/constrained/xgrammar_backend.py +23 -11
- sglang/srt/conversation.py +2 -15
- sglang/srt/disaggregation/ascend/conn.py +1 -3
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +1 -1
- sglang/srt/disaggregation/launch_lb.py +7 -1
- sglang/srt/disaggregation/mini_lb.py +11 -5
- sglang/srt/disaggregation/mooncake/conn.py +141 -47
- sglang/srt/disaggregation/prefill.py +261 -5
- sglang/srt/disaggregation/utils.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/device_communicators/pynccl.py +68 -18
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
- sglang/srt/distributed/naive_distributed.py +112 -0
- sglang/srt/distributed/parallel_state.py +90 -4
- sglang/srt/entrypoints/context.py +20 -1
- sglang/srt/entrypoints/engine.py +27 -2
- sglang/srt/entrypoints/http_server.py +12 -0
- sglang/srt/entrypoints/openai/protocol.py +2 -2
- sglang/srt/entrypoints/openai/serving_chat.py +22 -6
- sglang/srt/entrypoints/openai/serving_completions.py +9 -1
- sglang/srt/entrypoints/openai/serving_responses.py +2 -2
- sglang/srt/eplb/expert_distribution.py +2 -3
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +24 -0
- sglang/srt/host_shared_memory.py +83 -0
- sglang/srt/layers/attention/ascend_backend.py +132 -22
- sglang/srt/layers/attention/flashattention_backend.py +24 -17
- sglang/srt/layers/attention/flashinfer_backend.py +11 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
- sglang/srt/layers/attention/triton_backend.py +85 -46
- sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
- sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
- sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
- sglang/srt/layers/attention/utils.py +94 -15
- sglang/srt/layers/attention/vision.py +40 -13
- sglang/srt/layers/attention/vision_utils.py +65 -0
- sglang/srt/layers/communicator.py +51 -3
- sglang/srt/layers/dp_attention.py +23 -4
- sglang/srt/layers/elementwise.py +94 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
- sglang/srt/layers/layernorm.py +8 -1
- sglang/srt/layers/linear.py +24 -0
- sglang/srt/layers/logits_processor.py +5 -1
- sglang/srt/layers/moe/__init__.py +31 -0
- sglang/srt/layers/moe/ep_moe/layer.py +37 -33
- sglang/srt/layers/moe/fused_moe_native.py +14 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
- sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
- sglang/srt/layers/moe/moe_runner/base.py +13 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
- sglang/srt/layers/moe/router.py +15 -9
- sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
- sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +167 -83
- sglang/srt/layers/moe/utils.py +159 -18
- sglang/srt/layers/quantization/__init__.py +13 -14
- sglang/srt/layers/quantization/awq.py +7 -7
- sglang/srt/layers/quantization/base_config.py +2 -6
- sglang/srt/layers/quantization/blockwise_int8.py +4 -12
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
- sglang/srt/layers/quantization/fp8.py +127 -119
- sglang/srt/layers/quantization/fp8_kernel.py +195 -24
- sglang/srt/layers/quantization/fp8_utils.py +34 -9
- sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
- sglang/srt/layers/quantization/gptq.py +5 -4
- sglang/srt/layers/quantization/marlin_utils.py +11 -3
- sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
- sglang/srt/layers/quantization/modelopt_quant.py +165 -68
- sglang/srt/layers/quantization/moe_wna16.py +10 -15
- sglang/srt/layers/quantization/mxfp4.py +206 -37
- sglang/srt/layers/quantization/quark/quark.py +390 -0
- sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
- sglang/srt/layers/quantization/unquant.py +34 -70
- sglang/srt/layers/quantization/utils.py +25 -0
- sglang/srt/layers/quantization/w4afp8.py +7 -8
- sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
- sglang/srt/layers/quantization/w8a8_int8.py +5 -13
- sglang/srt/layers/radix_attention.py +6 -0
- sglang/srt/layers/rotary_embedding.py +1 -0
- sglang/srt/lora/lora_manager.py +21 -22
- sglang/srt/lora/lora_registry.py +3 -3
- sglang/srt/lora/mem_pool.py +26 -24
- sglang/srt/lora/utils.py +10 -12
- sglang/srt/managers/cache_controller.py +76 -18
- sglang/srt/managers/detokenizer_manager.py +10 -2
- sglang/srt/managers/io_struct.py +9 -0
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/schedule_batch.py +4 -9
- sglang/srt/managers/scheduler.py +25 -16
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/template_manager.py +7 -5
- sglang/srt/managers/tokenizer_manager.py +60 -21
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/managers/utils.py +59 -1
- sglang/srt/mem_cache/allocator.py +7 -5
- sglang/srt/mem_cache/allocator_ascend.py +0 -11
- sglang/srt/mem_cache/hicache_storage.py +14 -4
- sglang/srt/mem_cache/memory_pool.py +3 -3
- sglang/srt/mem_cache/memory_pool_host.py +35 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
- sglang/srt/model_executor/cuda_graph_runner.py +25 -12
- sglang/srt/model_executor/forward_batch_info.py +4 -1
- sglang/srt/model_executor/model_runner.py +43 -32
- sglang/srt/model_executor/npu_graph_runner.py +94 -0
- sglang/srt/model_loader/loader.py +24 -6
- sglang/srt/models/dbrx.py +12 -6
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +3 -1
- sglang/srt/models/deepseek_v2.py +224 -223
- sglang/srt/models/ernie4.py +2 -2
- sglang/srt/models/glm4_moe.py +25 -63
- sglang/srt/models/glm4v.py +52 -1
- sglang/srt/models/glm4v_moe.py +8 -11
- sglang/srt/models/gpt_oss.py +34 -74
- sglang/srt/models/granitemoe.py +0 -1
- sglang/srt/models/grok.py +376 -48
- sglang/srt/models/interns1.py +12 -47
- sglang/srt/models/internvl.py +6 -51
- sglang/srt/models/llama4.py +0 -2
- sglang/srt/models/minicpm3.py +0 -1
- sglang/srt/models/mixtral.py +0 -2
- sglang/srt/models/nemotron_nas.py +435 -0
- sglang/srt/models/olmoe.py +0 -1
- sglang/srt/models/phi4mm.py +3 -21
- sglang/srt/models/qwen2_5_vl.py +2 -0
- sglang/srt/models/qwen2_moe.py +3 -18
- sglang/srt/models/qwen3.py +2 -2
- sglang/srt/models/qwen3_classification.py +7 -1
- sglang/srt/models/qwen3_moe.py +9 -38
- sglang/srt/models/step3_vl.py +2 -1
- sglang/srt/models/xverse_moe.py +11 -5
- sglang/srt/multimodal/processors/base_processor.py +3 -3
- sglang/srt/multimodal/processors/internvl.py +7 -2
- sglang/srt/multimodal/processors/llava.py +11 -7
- sglang/srt/offloader.py +433 -0
- sglang/srt/operations.py +6 -1
- sglang/srt/reasoning_parser.py +4 -3
- sglang/srt/server_args.py +237 -104
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
- sglang/srt/speculative/eagle_utils.py +36 -13
- sglang/srt/speculative/eagle_worker.py +56 -3
- sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
- sglang/srt/two_batch_overlap.py +16 -11
- sglang/srt/utils.py +68 -70
- sglang/test/runners.py +8 -5
- sglang/test/test_block_fp8.py +5 -6
- sglang/test/test_block_fp8_ep.py +13 -19
- sglang/test/test_cutlass_moe.py +4 -6
- sglang/test/test_cutlass_w4a8_moe.py +4 -3
- sglang/test/test_fp4_moe.py +4 -3
- sglang/test/test_utils.py +7 -0
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/METADATA +7 -7
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/RECORD +179 -161
- sglang/srt/layers/quantization/fp4.py +0 -557
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/lora/mem_pool.py
CHANGED
```diff
@@ -13,9 +13,9 @@ from sglang.srt.lora.utils import (
     ROW_PARALLELISM_LINEAR_LORA_NAMES,
     LoRAType,
     get_hidden_dim,
-    get_normalized_lora_weight_names,
+    get_normalized_target_modules,
     get_stacked_multiply,
-    …
+    get_target_module_name,
 )

 logger = logging.getLogger(__name__)
@@ -52,7 +52,7 @@ class LoRAMemoryPool:
         tp_size: int,
         tp_rank: int,
         max_lora_rank: int,
-        …
+        target_modules: Set[str],
         base_model: torch.nn.Module,
     ):
         self.base_hf_config: AutoConfig = base_hf_config
@@ -62,7 +62,7 @@ class LoRAMemoryPool:
         self.tp_size: int = tp_size
         self.tp_rank: int = tp_rank
         self.max_lora_rank: int = max_lora_rank
-        self.…
+        self.target_modules: Set[str] = target_modules

         # Both A_buffer and B_buffer maps lora weight names to its buffer space.
         # A_buffer contains num_layer number of row-major tensors with shape
@@ -95,8 +95,8 @@ class LoRAMemoryPool:
             """
             if config.r > self.max_lora_rank:
                 return False
-            …
-            return …
+            target_module_names = get_normalized_target_modules(config.target_modules)
+            return target_module_names.issubset(self.target_modules)

         if isinstance(config, LoRAConfig):
             return _can_support(config)
@@ -139,10 +139,10 @@ class LoRAMemoryPool:

         def init_buffer(
             buffer: Dict[str, List[torch.Tensor]],
-            …
+            target_modules: Set[str],
             get_lora_shape_fn: Callable[[str, torch.nn.Module, int], Tuple[int]],
         ):
-            for module_name in …
+            for module_name in target_modules:
                 lora_shape = get_lora_shape_fn(
                     module_name, base_model, self.max_lora_rank
                 )
@@ -157,13 +157,13 @@ class LoRAMemoryPool:

         init_buffer(
             self.A_buffer,
-            self.…
+            self.target_modules,
             self.get_lora_A_shape,
         )

         init_buffer(
             self.B_buffer,
-            self.…
+            self.target_modules,
             self.get_lora_B_shape,
         )

@@ -242,32 +242,34 @@ class LoRAMemoryPool:
         for layer_id in range(self.num_layer):
             layer_weights = lora_adapter.layers[layer_id].weights
             temp_A_buffer: Dict[str, Optional[torch.Tensor]] = {
-                …
+                target_module: None for target_module in self.A_buffer
             }
             temp_B_buffer: Dict[str, Optional[torch.Tensor]] = {
-                …
+                target_module: None for target_module in self.B_buffer
             }
             for name, weights in layer_weights.items():
-                …
+                target_module = get_target_module_name(name, self.target_modules)
                 if "lora_A" in name:
-                    temp_A_buffer[…
+                    temp_A_buffer[target_module] = weights
                 else:
-                    temp_B_buffer[…
+                    temp_B_buffer[target_module] = weights

             if self.tp_size > 1:
                 cur_layer_modules = lora_modules[layer_id]
                 for module_name, module in cur_layer_modules.items():
-                    …
+                    target_module = get_target_module_name(
+                        module_name, self.target_modules
+                    )

-                    if temp_A_buffer[…
+                    if temp_A_buffer[target_module] is None:
                         # Skip weight slicing if the weight is not present in the adapter
                         continue

-                    temp_A_buffer[…
-                        temp_A_buffer[…
+                    temp_A_buffer[target_module] = module.slice_lora_a_weights(
+                        temp_A_buffer[target_module], self.tp_rank
                     )
-                    temp_B_buffer[…
-                        temp_B_buffer[…
+                    temp_B_buffer[target_module] = module.slice_lora_b_weights(
+                        temp_B_buffer[target_module], self.tp_rank
                     )

             for name, weights in temp_A_buffer.items():
@@ -282,12 +284,12 @@ class LoRAMemoryPool:
                 load_lora_weight_tensor(buffer_view, weights)

     def get_tensor(
-        self, …
+        self, target_module: str, layer_id: int, lora_type: LoRAType
     ) -> torch.Tensor:
         if lora_type == LoRAType.LORA_A:
-            return self.A_buffer[…
+            return self.A_buffer[target_module][layer_id]

-        return self.B_buffer[…
+        return self.B_buffer[target_module][layer_id]

     def get_buffer_id(self, lora_uid: str):
         return self.uid_to_buffer_id[lora_uid]
```
sglang/srt/lora/utils.py
CHANGED
```diff
@@ -84,7 +84,7 @@ def get_hidden_dim(
     raise NotImplementedError()


-def get_normalized_lora_weight_names(
+def get_normalized_target_modules(
     target_modules: Iterable[str],
 ) -> set[str]:
     """
@@ -100,8 +100,8 @@ def get_normalized_lora_weight_names(

     result = set()
     for name in target_modules:
-        …
-        result.add(…
+        normalized_name = params_mapping.get(name, name)
+        result.add(normalized_name)
     return result


@@ -116,20 +116,18 @@ def get_stacked_multiply(module_name: str) -> int:
     return stacked_rank[module_name] if module_name in stacked_rank else 1


-def …
-    target_name: str, lora_weight_names: Tuple[Set[str]]…
-) -> Optional[str]:
+def get_target_module_name(full_module_name: str, target_modules: Set[str]) -> str:
     """
-    Get the …
+    Get the target module name in target_modules that can match full_module_name.

-    If there is a …
+    If there is a target module name in target_modules that can match full_module_name, return this name
     Else raise ValueError.
     """
-    for …
-        if …
-            return …
+    for target_module in target_modules:
+        if target_module in full_module_name:
+            return target_module
     raise ValueError(
-        f"Cannot find …
+        f"Cannot find target module name for {full_module_name} in {target_modules}"
    )


```
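The renamed helper above resolves a fully qualified weight name to one of the configured target modules by substring matching. A minimal, self-contained sketch of that rule (the module path and target names below are illustrative, not taken from this diff):

```python
from typing import Set


def get_target_module_name(full_module_name: str, target_modules: Set[str]) -> str:
    # Same rule as the new sglang.srt.lora.utils helper: return a target
    # module whose name occurs as a substring of the full module path.
    for target_module in target_modules:
        if target_module in full_module_name:
            return target_module
    raise ValueError(
        f"Cannot find target module name for {full_module_name} in {target_modules}"
    )


targets = {"qkv_proj", "o_proj", "gate_up_proj"}
name = "layers.0.self_attn.qkv_proj.lora_A.weight"  # illustrative weight name
assert get_target_module_name(name, targets) == "qkv_proj"
```

This is the lookup `LoRAMemoryPool` now uses to key its A/B buffers, in place of the old per-weight-name mapping.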
sglang/srt/managers/cache_controller.py
CHANGED
```diff
@@ -26,6 +26,8 @@ if TYPE_CHECKING:
     from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
     from sglang.srt.mem_cache.memory_pool_host import HostKVCache

+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.mem_cache.memory_pool_host import MLATokenToKVPoolHost

 logger = logging.getLogger(__name__)

@@ -238,13 +240,14 @@ class HiCacheController:
         self.io_backend = io_backend

         self.enable_storage = False
+        self.is_mla = isinstance(self.mem_pool_host, MLATokenToKVPoolHost)
         # todo: move backend initialization to storage backend module
         if storage_backend is not None:
             self.storage_backend_type = storage_backend
             from sglang.srt.mem_cache.hicache_storage import HiCacheFile, get_hash_str

             if storage_backend == "file":
-                self.storage_backend = HiCacheFile()
+                self.storage_backend = HiCacheFile(is_mla=self.is_mla)
                 self.get_hash_str = get_hash_str
             elif storage_backend == "nixl":
                 from sglang.srt.mem_cache.storage.nixl.hicache_nixl import HiCacheNixl
@@ -257,23 +260,26 @@ class HiCacheController:
                     get_hash_str_mooncake,
                 )

-                self.storage_backend = MooncakeStore()
+                self.storage_backend = MooncakeStore(is_mla=self.is_mla)
                 self.get_hash_str = get_hash_str_mooncake
                 self.storage_backend.register_buffer(self.mem_pool_host.kv_buffer)
                 assert self.mem_pool_host.layout == "page_first"
             elif storage_backend == "hf3fs":
-                from sglang.srt.distributed import get_tensor_model_parallel_rank
                 from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import (
                     HiCacheHF3FS,
                 )

-                …
-                …
-                …
-                …
+                if self.mem_pool_host.layout == "page_first":
+                    bytes_per_page = (
+                        mem_pool_host.get_ksize_per_token() * mem_pool_host.page_size
+                    )
+                elif self.mem_pool_host.layout == "layer_first":
+                    bytes_per_page = (
+                        mem_pool_host.get_size_per_token() * mem_pool_host.page_size
+                    )
                 dtype = mem_pool_host.dtype
                 self.storage_backend = HiCacheHF3FS.from_env_config(
-                    …
+                    bytes_per_page, dtype
                 )
                 self.get_hash_str = get_hash_str
             else:
@@ -394,6 +400,15 @@ class HiCacheController:
         self.prefetch_thread.start()
         self.backup_thread.start()

+    @property
+    def backup_skip(self):
+        return (
+            self.is_mla
+            and get_tensor_model_parallel_rank() != 0
+            # todo: only support file and mooncake
+            and self.storage_backend_type in ["file", "mooncake"]
+        )
+
     def write(
         self,
         device_indices: torch.Tensor,
@@ -555,13 +570,34 @@ class HiCacheController:
             operation.mark_done()
             return operation.completed_tokens, operation.hash_value

+    def zerocopy_page_transfer(self, operation, batch_size=8):
+        hashes, dsts = self.mem_pool_host.get_buffer_with_hash(
+            operation.hash_value, operation.host_indices
+        )
+        for i in range(0, len(hashes), batch_size):
+            page_hashes = hashes[i : i + batch_size]
+            page_dsts = dsts[i : i + batch_size]
+            page_data = self.storage_backend.batch_get(page_hashes, page_dsts)
+            if page_data is None:
+                logger.warning(
+                    f"Prefetch operation {operation.request_id} failed to retrieve page {page_hashes}."
+                )
+                break
+            completed_tokens = operation.completed_tokens
+            if operation.increment(self.page_size * len(page_hashes)):
+                for i in range(len(page_hashes)):
+                    completed_tokens += self.page_size
+            else:
+                break
+
     def generic_page_transfer(self, operation, batch_size=8):
         for i in range(0, len(operation.hash_value), batch_size):
             page_hashes = operation.hash_value[i : i + batch_size]
             # todo: zero copy
-            dummy_page_dst = […
-                …
-            …
+            dummy_page_dst = [
+                self.mem_pool_host.get_dummy_flat_data_page()
+                for _ in range(len(page_hashes))
+            ]
             page_data = self.storage_backend.batch_get(page_hashes, dummy_page_dst)
             if page_data is None:
                 logger.warning(
@@ -599,7 +635,10 @@ class HiCacheController:
         if self.is_mooncake_backend():
             self.mooncake_page_transfer(operation)
         elif self.storage_backend_type == "hf3fs":
-            self.…
+            if self.mem_pool_host.layout == "page_first":
+                self.zerocopy_page_transfer(operation, batch_size=128)
+            elif self.mem_pool_host.layout == "layer_first":
+                self.generic_page_transfer(operation, batch_size=128)
         else:
             self.generic_page_transfer(operation)

@@ -716,6 +755,19 @@ class HiCacheController:
         self.backup_queue.put(operation)
         return operation.id

+    def zerocopy_page_backup(self, operation, batch_size=8):
+        hashes, dsts = self.mem_pool_host.get_buffer_with_hash(
+            operation.hash_value, operation.host_indices
+        )
+        for i in range(0, len(hashes), batch_size):
+            page_hashes = hashes[i : i + batch_size]
+            page_data = dsts[i : i + batch_size]
+            success = self.storage_backend.batch_set(page_hashes, page_data)
+            if not success:
+                logger.warning(f"Failed to write page {page_hashes} to storage.")
+                break
+            operation.completed_tokens += self.page_size * len(page_hashes)
+
     def generic_page_backup(self, operation, batch_size=8):
         for i in range(0, len(operation.hash_value), batch_size):
             page_hashes = operation.hash_value[i : i + batch_size]
@@ -767,14 +819,20 @@ class HiCacheController:
             if operation is None:
                 continue

-            if self.is_mooncake_backend():
-                self.mooncake_page_backup(operation)
-            elif self.storage_backend_type == "hf3fs":
-                self.generic_page_backup(operation, batch_size=128)
+            if not self.backup_skip:
+                if self.is_mooncake_backend():
+                    self.mooncake_page_backup(operation)
+                elif self.storage_backend_type == "hf3fs":
+                    if self.mem_pool_host.layout == "page_first":
+                        self.zerocopy_page_backup(operation, batch_size=128)
+                    elif self.mem_pool_host.layout == "layer_first":
+                        self.generic_page_backup(operation, batch_size=128)
+                else:
+                    self.generic_page_backup(operation)
+                min_completed_tokens = operation.completed_tokens
             else:
-                self.generic_page_backup(operation)
+                min_completed_tokens = len(operation.token_ids)

-            min_completed_tokens = operation.completed_tokens
             if self.tp_world_size > 1:
                 completed_tokens_tensor = torch.tensor(
                     min_completed_tokens, dtype=torch.int
```
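The new `backup_skip` property encodes the rule that motivates the `is_mla` plumbing above: for MLA models the host KV pages are identical across tensor-parallel ranks, so only rank 0 needs to write them to a shared `file` or `mooncake` store. A standalone sketch of the predicate, with `tp_rank` standing in for `get_tensor_model_parallel_rank()`:

```python
def backup_skip(is_mla: bool, tp_rank: int, storage_backend_type: str) -> bool:
    # Mirrors the property added to HiCacheController: non-zero TP ranks skip
    # the redundant backup of MLA pages; only the "file" and "mooncake"
    # backends opt in for now (per the inline todo).
    return is_mla and tp_rank != 0 and storage_backend_type in ["file", "mooncake"]


assert backup_skip(True, 1, "mooncake")       # replica rank: skip the write
assert not backup_skip(True, 0, "mooncake")   # rank 0 still backs up
assert not backup_skip(False, 1, "file")      # non-MLA: every rank writes
```

When the skip fires, the backup loop reports `len(operation.token_ids)` as its completed-token count, so the cross-rank reduction over `min_completed_tokens` is not dragged down by ranks that intentionally wrote nothing.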
sglang/srt/managers/detokenizer_manager.py
CHANGED
```diff
@@ -31,10 +31,12 @@ from sglang.srt.managers.io_struct import (
     BatchMultimodalOut,
     BatchStrOut,
     BatchTokenIDOut,
+    FreezeGCReq,
 )
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     configure_logger,
+    freeze_gc,
     get_zmq_socket,
     kill_itself_when_parent_died,
 )
@@ -100,6 +102,7 @@ class DetokenizerManager:
                 (BatchEmbeddingOut, self.handle_batch_embedding_out),
                 (BatchTokenIDOut, self.handle_batch_token_id_out),
                 (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
+                (FreezeGCReq, self.handle_freeze_gc_req),
             ]
         )

@@ -108,7 +111,8 @@ class DetokenizerManager:
         while True:
             recv_obj = self.recv_from_scheduler.recv_pyobj()
             output = self._request_dispatcher(recv_obj)
-            self.send_to_tokenizer.send_pyobj(output)
+            if output is not None:
+                self.send_to_tokenizer.send_pyobj(output)

     def trim_matched_stop(
         self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
@@ -216,7 +220,7 @@ class DetokenizerManager:
             rids=recv_obj.rids,
             finished_reasons=recv_obj.finished_reasons,
             output_strs=output_strs,
-            output_ids=recv_obj.…
+            output_ids=recv_obj.decode_ids,
             prompt_tokens=recv_obj.prompt_tokens,
             completion_tokens=recv_obj.completion_tokens,
             cached_tokens=recv_obj.cached_tokens,
@@ -247,6 +251,10 @@ class DetokenizerManager:
             cached_tokens=recv_obj.cached_tokens,
         )

+    def handle_freeze_gc_req(self, recv_req: FreezeGCReq):
+        freeze_gc("Detokenizer Manager")
+        return None
+

 class LimitedCapacityDict(OrderedDict):
     def __init__(self, capacity: int, *args, **kwargs):
```
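Together with the scheduler change further down, this completes a `FreezeGCReq` fan-out: the scheduler freezes its own GC and forwards the request, and the detokenizer freezes last, returning `None` so nothing is sent back to the tokenizer (hence the new `if output is not None` guard). `sglang.srt.utils.freeze_gc` itself is outside this diff; a plausible sketch, assuming it wraps CPython's `gc.freeze()`:

```python
import gc
import logging

logger = logging.getLogger(__name__)


def freeze_gc(context: str) -> None:
    # Assumed implementation: promote every object that survived startup into
    # the permanent generation so steady-state collections stop scanning them.
    gc.collect()
    gc.freeze()
    logger.info("Froze %d objects in %s", gc.get_freeze_count(), context)
```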
sglang/srt/managers/io_struct.py
CHANGED
```diff
@@ -612,6 +612,8 @@ class EmbeddingReqInput:

         if self.sampling_params is None:
             self.sampling_params = [{}] * self.batch_size
+        elif isinstance(self.sampling_params, dict):
+            self.sampling_params = [self.sampling_params] * self.batch_size
         for i in range(self.batch_size):
             self.sampling_params[i]["max_new_tokens"] = 0

@@ -660,6 +662,8 @@ class TokenizedEmbeddingReqInput:
     token_type_ids: List[int]
     # Dummy sampling params for compatibility
     sampling_params: SamplingParams
+    # For data parallel rank routing
+    data_parallel_rank: Optional[int] = None
     # For dp balance
     dp_balance_id: int = -1

@@ -1001,6 +1005,11 @@ class ProfileReqOutput:
     message: str


+@dataclass
+class FreezeGCReq:
+    pass
+
+
 @dataclass
 class ConfigureLoggingReq:
     log_requests: Optional[bool] = None
```
sglang/srt/managers/mm_utils.py
CHANGED
```diff
@@ -560,7 +560,7 @@ def embed_mm_inputs(
             ]
             items_size[i + 1] = len(mm_items)
             items_offsets.append(
-                flatten_nested_list([item.offsets for item in …
+                flatten_nested_list([item.offsets for item in mm_items])
             )
         items_size = torch.cumsum(items_size, dim=0).tolist()

```
sglang/srt/managers/schedule_batch.py
CHANGED
```diff
@@ -52,6 +52,7 @@ from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
     ScheduleBatchDisaggregationDecodeMixin,
 )
 from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
+from sglang.srt.layers.moe import is_tbo_enabled
 from sglang.srt.mem_cache.allocator import (
     BaseTokenToKVPoolAllocator,
     SWATokenToKVPoolAllocator,
@@ -83,18 +84,13 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "chunked_prefill_size",
     "device",
     "disable_chunked_prefix_cache",
+    "disable_flashinfer_cutlass_moe_fp4_allgather",
     "disable_radix_cache",
-    "enable_two_batch_overlap",
-    "tbo_token_distribution_threshold",
     "enable_dp_lm_head",
-    "…
-    "deepep_mode",
-    "enable_flashinfer_cutlass_moe",
-    "enable_flashinfer_trtllm_moe",
+    "flashinfer_mxfp4_moe_precision",
     "enable_flashinfer_allreduce_fusion",
     "moe_dense_tp_size",
     "ep_dispatch_algorithm",
-    "deepep_config",
     "ep_num_redundant_experts",
     "enable_nan_detection",
     "flashinfer_mla_disable_ragged",
@@ -107,12 +103,11 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "triton_attention_reduce_in_fp32",
     "num_reserved_decode_tokens",
     "weight_loader_disable_mmap",
-    "enable_triton_kernel_moe",
-    "enable_flashinfer_mxfp4_moe",
     "enable_multimodal",
     "enable_symm_mem",
     "quantization",
     "enable_custom_logit_processor",
+    "disaggregation_mode",
 ]

 # Put some global args for easy access
```
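The keys removed from `GLOBAL_SERVER_ARGS_KEYS` (two-batch overlap, DeepEP, and several FlashInfer MoE toggles) track state that has moved into the new `sglang.srt.layers.moe` config module, which the scheduler below initializes at startup; this file now imports `is_tbo_enabled` from there. A consumer-side sketch under that assumption (the surrounding function is hypothetical):

```python
from sglang.srt.layers.moe import is_tbo_enabled


def maybe_prepare_two_batch_overlap(batch):
    # Previously read from global_server_args_dict["enable_two_batch_overlap"]
    if is_tbo_enabled():
        ...  # split the batch for the two-batch-overlap path
```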
sglang/srt/managers/scheduler.py
CHANGED
```diff
@@ -64,7 +64,7 @@ from sglang.srt.hf_transformers_utils import (
 )
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
-from sglang.srt.layers.moe…
+from sglang.srt.layers.moe import initialize_moe_config
 from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
@@ -72,6 +72,7 @@ from sglang.srt.managers.io_struct import (
     ExpertDistributionReqOutput,
     FlushCacheReqInput,
     FlushCacheReqOutput,
+    FreezeGCReq,
     GetInternalStateReq,
     GetInternalStateReqOutput,
     GetWeightsByNameReqInput,
@@ -145,6 +146,7 @@ from sglang.srt.utils import (
     configure_gc_logger,
     configure_logger,
     disable_request_logging,
+    freeze_gc,
     get_available_gpu_memory,
     get_bool_env_var,
     get_zmq_socket,
@@ -245,6 +247,9 @@ class Scheduler(
             )
         )

+        # Init model config
+        self.model_config = ModelConfig.from_server_args(server_args)
+
         # Init inter-process communication
         context = zmq.Context(2)
         self.idle_sleeper = None
@@ -292,6 +297,9 @@ class Scheduler(
         # Init tokenizer
         self.init_tokenizer()

+        # Init moe config
+        self.init_moe_config()
+
         # Set reasoning_parser and think_end_id if --reasoning_parser is enabled
         if self.server_args.reasoning_parser and self.tokenizer:
             reasoning_parser = ReasoningParser(
@@ -518,6 +526,7 @@ class Scheduler(
                 (ResumeMemoryOccupationReqInput, self.resume_memory_occupation),
                 (SlowDownReqInput, self.slow_down),
                 (ProfileReq, self.profile),
+                (FreezeGCReq, self.handle_freeze_gc),
                 (GetInternalStateReq, self.get_internal_state),
                 (SetInternalStateReq, self.set_internal_state),
                 (RpcReqInput, self.handle_rpc_request),
@@ -538,8 +547,6 @@ class Scheduler(

     def init_tokenizer(self):
         server_args = self.server_args
-
-        self.model_config = ModelConfig.from_server_args(server_args)
         self.is_generation = self.model_config.is_generation

         if server_args.skip_tokenizer_init:
@@ -761,6 +768,10 @@ class Scheduler(
         # The prefill requests that are in the middle of kv sending
         self.disagg_prefill_inflight_queue: List[Req] = []

+    def init_moe_config(self):
+        if hasattr(self.model_config.hf_config, "num_experts_per_tok"):
+            initialize_moe_config(self.server_args)
+
     @DynamicGradMode()
     def event_loop_normal(self):
         """A normal scheduler loop."""
@@ -1133,7 +1144,7 @@ class Scheduler(
                 f"boostrap room id. {req.rid=}"
             )
             logger.error(error_msg)
-            prepare_abort(req, error_msg)
+            prepare_abort(req, error_msg, status_code=HTTPStatus.BAD_REQUEST)
             self.stream_output([req], req.return_logprob)
             return

@@ -1823,11 +1834,6 @@ class Scheduler(
             disable_cuda_graph=self.server_args.disable_cuda_graph,
             spec_algorithm=self.spec_algorithm,
             speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
-            enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
-            enable_deepep_moe=MoeA2ABackend(
-                self.server_args.moe_a2a_backend
-            ).is_deepep(),
-            deepep_mode=DeepEPMode(self.server_args.deepep_mode),
             require_mlp_tp_gather=require_mlp_tp_gather(self.server_args),
             disable_overlap_schedule=self.server_args.disable_overlap_schedule,
         )
@@ -1922,9 +1928,6 @@ class Scheduler(
         disable_cuda_graph: bool,
         spec_algorithm,
         speculative_num_draft_tokens,
-        enable_two_batch_overlap: bool,
-        enable_deepep_moe: bool,
-        deepep_mode: DeepEPMode,
         require_mlp_tp_gather: bool,
         disable_overlap_schedule: bool,
     ):
@@ -1972,9 +1975,6 @@ class Scheduler(
                     is_extend_in_batch,
                     *tbo_preparer.prepare_all_gather(
                         local_batch,
-                        deepep_mode,
-                        enable_deepep_moe,
-                        enable_two_batch_overlap,
                     ),
                 ],
                 dtype=torch.int64,
@@ -2472,6 +2472,12 @@ class Scheduler(
         if self.idle_sleeper is not None:
             self.idle_sleeper.maybe_sleep()

+    def handle_freeze_gc(self, recv_req: FreezeGCReq):
+        """Handle freeze_gc request: freeze scheduler's GC and forward to detokenizer."""
+        freeze_gc("Scheduler")
+        self.send_to_detokenizer.send_pyobj(recv_req)
+        return None
+

 class IdleSleeper:
     """
@@ -2582,7 +2588,10 @@ def run_scheduler_process(
         if scheduler.enable_overlap:
             scheduler.event_loop_overlap_disagg_prefill()
         else:
-            scheduler.event_loop_normal_disagg_prefill()
+            if server_args.pp_size > 1:
+                scheduler.event_loop_pp_disagg_prefill()
+            else:
+                scheduler.event_loop_normal_disagg_prefill()

     elif disaggregation_mode == DisaggregationMode.DECODE:
         if scheduler.enable_overlap:
```
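The `(FreezeGCReq, self.handle_freeze_gc)` entry extends the scheduler's type-dispatched request table; `handle_freeze_gc` freezes the scheduler's GC, forwards the request to the detokenizer, and returns `None`. The dispatcher class itself is outside this diff; a minimal stand-in with the same contract:

```python
from dataclasses import dataclass


@dataclass
class FreezeGCReq:
    pass


class TypeBasedDispatcher:
    # Stand-in for the dispatcher used by Scheduler and DetokenizerManager:
    # route an incoming request object to the handler registered for its type.
    def __init__(self, mapping):
        self._mapping = mapping

    def __call__(self, obj):
        for ty, handler in self._mapping:
            if isinstance(obj, ty):
                return handler(obj)
        raise ValueError(f"Invalid request: {obj}")


dispatcher = TypeBasedDispatcher([(FreezeGCReq, lambda req: None)])
assert dispatcher(FreezeGCReq()) is None  # None results are simply dropped
```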
sglang/srt/managers/session_controller.py
CHANGED
```diff
@@ -54,7 +54,7 @@ class SessionReqNode:
             prefix += " -- " + self.childs[0].req.rid
             ret = self.childs[0]._str_helper(prefix)
         for child in self.childs[1:]:
-            prefix = " " * len(origin_prefix) + "…
+            prefix = " " * len(origin_prefix) + " \\- " + child.req.rid
             ret += child._str_helper(prefix)
         return ret

```
sglang/srt/managers/template_manager.py
CHANGED
```diff
@@ -89,6 +89,7 @@ class TemplateManager:
         if template is None:
             return False

+        # TODO: remove this hard code the reasoning pattern
         force_reasoning_pattern = r"<\|im_start\|>assistant\\n<think>\\n"
         has_reasoning = re.search(force_reasoning_pattern, template) is not None

@@ -128,11 +129,12 @@ class TemplateManager:
             logger.info(
                 f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}"
             )
-        …
-        …
-        …
-        …
-        …
+        else:
+            # Default to string content format if no template was found
+            self._jinja_template_content_format = "string"
+            logger.info(
+                "No chat template found, defaulting to 'string' content format"
+            )

         # Detect reasoning pattern from chat template
         if tokenizer_manager.tokenizer:
```