sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. sglang/bench_serving.py +56 -12
  2. sglang/launch_server.py +2 -0
  3. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +101 -4
  4. sglang/srt/compilation/backend.py +1 -1
  5. sglang/srt/configs/model_config.py +5 -5
  6. sglang/srt/distributed/parallel_state.py +0 -7
  7. sglang/srt/entrypoints/engine.py +18 -15
  8. sglang/srt/entrypoints/grpc_server.py +0 -1
  9. sglang/srt/entrypoints/http_server.py +75 -94
  10. sglang/srt/environ.py +16 -2
  11. sglang/srt/eplb/expert_distribution.py +30 -0
  12. sglang/srt/function_call/function_call_parser.py +2 -0
  13. sglang/srt/function_call/minimax_m2.py +367 -0
  14. sglang/srt/layers/activation.py +6 -0
  15. sglang/srt/layers/attention/flashattention_backend.py +12 -2
  16. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  17. sglang/srt/layers/attention/flashinfer_mla_backend.py +18 -10
  18. sglang/srt/layers/attention/trtllm_mla_backend.py +1 -13
  19. sglang/srt/layers/attention/utils.py +78 -0
  20. sglang/srt/layers/communicator.py +1 -0
  21. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  22. sglang/srt/layers/layernorm.py +19 -4
  23. sglang/srt/layers/logits_processor.py +5 -0
  24. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  25. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  26. sglang/srt/layers/moe/ep_moe/layer.py +79 -272
  27. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  28. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  29. sglang/srt/layers/moe/moe_runner/deep_gemm.py +287 -22
  30. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  31. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  32. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  33. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  34. sglang/srt/layers/moe/token_dispatcher/deepep.py +18 -14
  35. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  36. sglang/srt/layers/moe/topk.py +4 -4
  37. sglang/srt/layers/moe/utils.py +3 -4
  38. sglang/srt/layers/quantization/__init__.py +3 -5
  39. sglang/srt/layers/quantization/awq.py +0 -3
  40. sglang/srt/layers/quantization/base_config.py +7 -0
  41. sglang/srt/layers/quantization/fp8.py +68 -63
  42. sglang/srt/layers/quantization/gguf.py +566 -0
  43. sglang/srt/layers/quantization/mxfp4.py +30 -38
  44. sglang/srt/layers/quantization/unquant.py +23 -45
  45. sglang/srt/layers/quantization/w4afp8.py +38 -2
  46. sglang/srt/layers/radix_attention.py +5 -2
  47. sglang/srt/layers/rotary_embedding.py +13 -1
  48. sglang/srt/layers/sampler.py +12 -1
  49. sglang/srt/managers/io_struct.py +3 -0
  50. sglang/srt/managers/multi_tokenizer_mixin.py +17 -1
  51. sglang/srt/managers/scheduler.py +21 -15
  52. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  53. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  54. sglang/srt/managers/tokenizer_manager.py +11 -19
  55. sglang/srt/mem_cache/hicache_storage.py +7 -1
  56. sglang/srt/mem_cache/memory_pool.py +82 -0
  57. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  58. sglang/srt/model_executor/forward_batch_info.py +44 -3
  59. sglang/srt/model_executor/model_runner.py +1 -149
  60. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  61. sglang/srt/models/deepseek_v2.py +147 -44
  62. sglang/srt/models/glm4_moe.py +322 -354
  63. sglang/srt/models/glm4_moe_nextn.py +4 -14
  64. sglang/srt/models/glm4v_moe.py +29 -196
  65. sglang/srt/models/minimax_m2.py +922 -0
  66. sglang/srt/models/nvila.py +355 -0
  67. sglang/srt/models/nvila_lite.py +184 -0
  68. sglang/srt/models/qwen2.py +22 -1
  69. sglang/srt/models/qwen3.py +34 -4
  70. sglang/srt/models/qwen3_moe.py +2 -4
  71. sglang/srt/multimodal/processors/base_processor.py +1 -0
  72. sglang/srt/multimodal/processors/glm4v.py +1 -1
  73. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  74. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  75. sglang/srt/parser/reasoning_parser.py +28 -1
  76. sglang/srt/server_args.py +365 -186
  77. sglang/srt/single_batch_overlap.py +2 -7
  78. sglang/srt/utils/common.py +87 -42
  79. sglang/srt/utils/hf_transformers_utils.py +7 -3
  80. sglang/test/test_deterministic.py +235 -12
  81. sglang/test/test_deterministic_utils.py +2 -1
  82. sglang/version.py +1 -1
  83. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +7 -6
  84. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +87 -82
  85. sglang/srt/models/vila.py +0 -306
  86. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  87. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  88. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/mem_cache/memory_pool.py
@@ -1213,6 +1213,65 @@ def set_mla_kv_buffer_triton(
     )


+@triton.jit
+def get_mla_kv_buffer_kernel(
+    kv_buffer_ptr,
+    cache_k_nope_ptr,
+    cache_k_rope_ptr,
+    loc_ptr,
+    buffer_stride: tl.constexpr,
+    nope_stride: tl.constexpr,
+    rope_stride: tl.constexpr,
+    nope_dim: tl.constexpr,
+    rope_dim: tl.constexpr,
+):
+    pid_loc = tl.program_id(0)
+    loc = tl.load(loc_ptr + pid_loc)
+    loc_src_ptr = kv_buffer_ptr + loc * buffer_stride
+
+    nope_offs = tl.arange(0, nope_dim)
+    nope_src_ptr = loc_src_ptr + nope_offs
+    nope_src = tl.load(nope_src_ptr)
+
+    tl.store(
+        cache_k_nope_ptr + pid_loc * nope_stride + nope_offs,
+        nope_src,
+    )
+
+    rope_offs = tl.arange(0, rope_dim)
+    rope_src_ptr = loc_src_ptr + nope_dim + rope_offs
+    rope_src = tl.load(rope_src_ptr)
+    tl.store(
+        cache_k_rope_ptr + pid_loc * rope_stride + rope_offs,
+        rope_src,
+    )
+
+
+def get_mla_kv_buffer_triton(
+    kv_buffer: torch.Tensor,
+    loc: torch.Tensor,
+    cache_k_nope: torch.Tensor,
+    cache_k_rope: torch.Tensor,
+):
+    # The source data type will be implicitly converted to the target data type.
+    nope_dim = cache_k_nope.shape[-1]  # 512
+    rope_dim = cache_k_rope.shape[-1]  # 64
+    n_loc = loc.numel()
+    grid = (n_loc,)
+
+    get_mla_kv_buffer_kernel[grid](
+        kv_buffer,
+        cache_k_nope,
+        cache_k_rope,
+        loc,
+        kv_buffer.stride(0),
+        cache_k_nope.stride(0),
+        cache_k_rope.stride(0),
+        nope_dim,
+        rope_dim,
+    )
+
+
 class MLATokenToKVPool(KVCache):
     def __init__(
         self,
@@ -1363,6 +1422,29 @@ class MLATokenToKVPool(KVCache):
             cache_k_rope,
         )

+    def get_mla_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        dst_dtype: Optional[torch.dtype] = None,
+    ):
+        # get k nope and k rope from the kv buffer, and optionally cast them to dst_dtype.
+        layer_id = layer.layer_id
+        kv_buffer = self.get_key_buffer(layer_id)
+        dst_dtype = dst_dtype or self.dtype
+        cache_k_nope = torch.empty(
+            (loc.shape[0], 1, self.kv_lora_rank),
+            dtype=dst_dtype,
+            device=kv_buffer.device,
+        )
+        cache_k_rope = torch.empty(
+            (loc.shape[0], 1, self.qk_rope_head_dim),
+            dtype=dst_dtype,
+            device=kv_buffer.device,
+        )
+        get_mla_kv_buffer_triton(kv_buffer, loc, cache_k_nope, cache_k_rope)
+        return cache_k_nope, cache_k_rope
+
     def get_cpu_copy(self, indices):
         torch.cuda.synchronize()
         kv_cache_cpu = []
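
The new pair of functions gathers MLA KV-cache entries at the slots listed in loc and splits each row into its nope and rope halves, casting to the destination dtype on store. A minimal PyTorch sketch of the same computation, not part of the diff, assuming kv_buffer is laid out as (num_slots, 1, nope_dim + rope_dim) so that stride(0) matches the row stride passed above:

import torch

def get_mla_kv_buffer_reference(
    kv_buffer: torch.Tensor,     # (num_slots, 1, nope_dim + rope_dim)
    loc: torch.Tensor,           # (n_loc,) integer slot indices into kv_buffer
    cache_k_nope: torch.Tensor,  # (n_loc, 1, nope_dim)
    cache_k_rope: torch.Tensor,  # (n_loc, 1, rope_dim)
):
    nope_dim = cache_k_nope.shape[-1]
    # Gather the selected rows, then split the last dimension; copy_() performs
    # the implicit dtype conversion mentioned in the kernel's comment.
    gathered = kv_buffer[loc].view(loc.numel(), -1)
    cache_k_nope.view(loc.numel(), -1).copy_(gathered[:, :nope_dim])
    cache_k_rope.view(loc.numel(), -1).copy_(gathered[:, nope_dim:])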
sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py
@@ -3,8 +3,9 @@ import atexit
 import json
 import logging
 import threading
+from collections import OrderedDict
 from pathlib import Path
-from typing import Dict, List, Optional, OrderedDict, Tuple
+from typing import Dict, List, Optional, Tuple

 import orjson
 import requests
@@ -136,7 +137,7 @@ class GlobalMetadataState:
                 num_pages = data["num_pages"]
                 rank_meta = RankMetadata(num_pages)
                 rank_meta.free_pages = data["free_pages"]
-                rank_meta.key_to_index = dict(data["key_to_index"])
+                rank_meta.key_to_index = OrderedDict(data["key_to_index"])
                 self.ranks[rank_id] = rank_meta
             logging.info(
                 f"Successfully loaded metadata for {len(self.ranks)} ranks."
sglang/srt/model_executor/forward_batch_info.py
@@ -39,6 +39,7 @@ import triton
 import triton.language as tl

 from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.dp_attention import (
     DpPaddingMode,
     get_attention_dp_rank,
@@ -250,6 +251,8 @@ class ForwardBatch:
     # For MLA chunked prefix cache used in chunked prefill
     # Tell attention backend whether lse needs to be returned
     mha_return_lse: Optional[bool] = None
+    mha_one_shot_kv_indices: Optional[torch.Tensor] = None
+    mha_one_shot: Optional[bool] = None

     # For multimodal
     mm_inputs: Optional[List[MultimodalInputs]] = None
@@ -572,9 +575,15 @@
                     device=model_runner.device,
                 )
             else:
-                mrope_position_deltas = mm_input.mrope_position_delta.flatten().to(
-                    model_runner.device, non_blocking=True
-                )
+                if mm_input.mrope_position_delta.device.type != model_runner.device:
+                    # Move mrope_position_delta to the device on the first run,
+                    # avoiding repeated host-to-device transfers on later runs.
+                    mm_input.mrope_position_delta = (
+                        mm_input.mrope_position_delta.to(
+                            model_runner.device, non_blocking=True
+                        )
+                    )
+                mrope_position_deltas = mm_input.mrope_position_delta.flatten()
             mrope_positions_list[batch_idx] = (
                 (mrope_position_deltas + self.seq_lens[batch_idx] - 1)
                 .unsqueeze(0)
@@ -863,6 +872,10 @@
             self.token_to_kv_pool, MLATokenToKVPool
         ), "Currently chunked prefix cache can only be used by Deepseek models"

+        if not any(self.extend_prefix_lens_cpu):
+            self.num_prefix_chunks = 0
+            return
+
         if self.prefix_chunk_len is not None:
             # Chunked kv cache info already prepared by prior modules
             return
@@ -917,6 +930,34 @@
     def can_run_tbo(self):
         return self.tbo_split_seq_index is not None

+    def fetch_mha_one_shot_kv_indices(self):
+        if self.mha_one_shot_kv_indices is not None:
+            return self.mha_one_shot_kv_indices
+        batch_size = self.batch_size
+        paged_kernel_lens_sum = sum(self.seq_lens_cpu)
+        kv_indices = torch.empty(
+            paged_kernel_lens_sum,
+            dtype=torch.int32,
+            device=self.req_pool_indices.device,
+        )
+        kv_indptr = torch.zeros(
+            batch_size + 1,
+            dtype=torch.int32,
+            device=self.req_pool_indices.device,
+        )
+        kv_indptr[1:] = torch.cumsum(self.seq_lens, dim=0)
+        create_flashinfer_kv_indices_triton[(self.batch_size,)](
+            self.req_to_token_pool.req_to_token,
+            self.req_pool_indices,
+            self.seq_lens,
+            kv_indptr,
+            None,
+            kv_indices,
+            self.req_to_token_pool.req_to_token.shape[1],
+        )
+        self.mha_one_shot_kv_indices = kv_indices
+        return kv_indices
+

 def enable_num_token_non_padded(server_args):
     return get_moe_expert_parallel_world_size() > 1
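
fetch_mha_one_shot_kv_indices caches a flat tensor of KV-cache slot indices covering every token of every request in the batch. Illustrative only, not from the diff: a plain-PyTorch equivalent of what create_flashinfer_kv_indices_triton writes into kv_indices here, assuming req_to_token maps (request pool slot, token position) to KV-cache slots as in the call above:

import torch

def build_one_shot_kv_indices(
    req_to_token: torch.Tensor,      # (max_requests, max_context_len)
    req_pool_indices: torch.Tensor,  # (batch_size,)
    seq_lens: torch.Tensor,          # (batch_size,)
) -> torch.Tensor:
    # Concatenate each request's first seq_len slot indices in batch order;
    # the Triton kernel produces the same flat layout, delimited by kv_indptr.
    parts = [
        req_to_token[int(req_idx), : int(seq_len)]
        for req_idx, seq_len in zip(req_pool_indices.tolist(), seq_lens.tolist())
    ]
    return torch.cat(parts).to(torch.int32)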
sglang/srt/model_executor/model_runner.py
@@ -131,16 +131,10 @@ from sglang.srt.utils import (
     get_bool_env_var,
     get_cpu_ids_by_node,
     init_custom_process_group,
-    is_fa3_default_architecture,
-    is_flashinfer_available,
     is_hip,
-    is_hopper_with_cuda_12_3,
-    is_no_spec_infer_or_topk_one,
     is_npu,
-    is_sm100_supported,
     log_info_on_rank0,
     monkey_patch_p2p_access_check,
-    monkey_patch_vllm_gguf_config,
     set_cuda_arch,
     slow_rank_detector,
     xpu_has_xmx_support,
@@ -503,121 +497,6 @@ class ModelRunner:
     def model_specific_adjustment(self):
         server_args = self.server_args

-        if (
-            server_args.attention_backend == "intel_amx"
-            and server_args.device == "cpu"
-            and not _is_cpu_amx_available
-        ):
-            logger.info(
-                "The current platform does not support Intel AMX, will fallback to torch_native backend."
-            )
-            server_args.attention_backend = "torch_native"
-
-        if (
-            server_args.attention_backend == "intel_xpu"
-            and server_args.device == "xpu"
-            and not _is_xpu_xmx_available
-        ):
-            logger.info(
-                "The current platform does not support Intel XMX, will fallback to triton backend."
-            )
-            server_args.attention_backend = "triton"
-
-        if server_args.prefill_attention_backend is not None and (
-            server_args.prefill_attention_backend
-            == server_args.decode_attention_backend
-        ):  # override the default attention backend
-            server_args.attention_backend = server_args.prefill_attention_backend
-
-        if (
-            getattr(self.model_config.hf_config, "dual_chunk_attention_config", None)
-            is not None
-        ):
-            if server_args.attention_backend is None:
-                server_args.attention_backend = "dual_chunk_flash_attn"
-                logger.info("Dual chunk attention is turned on by default.")
-            elif server_args.attention_backend != "dual_chunk_flash_attn":
-                raise ValueError(
-                    "Dual chunk attention is enabled, but attention backend is set to "
-                    f"{server_args.attention_backend}. Please set it to 'dual_chunk_flash_attn'."
-                )
-
-        if server_args.attention_backend is None:
-            """
-            Auto select the fastest attention backend.
-
-            1. Models with MHA Architecture (e.g: Llama, QWen)
-                1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
-                1.2 In other cases, we will use flashinfer if available, otherwise use triton.
-            2. Models with MLA Architecture and using FA3
-                2.1 We will use FA3 backend on hopper.
-                2.2 We will use Flashinfer backend on blackwell.
-                2.3 Otherwise, we will use triton backend.
-            """
-
-            if not self.use_mla_backend:
-                # MHA architecture
-                if (
-                    is_hopper_with_cuda_12_3()
-                    and is_no_spec_infer_or_topk_one(server_args)
-                    and is_fa3_default_architecture(self.model_config.hf_config)
-                ):
-                    server_args.attention_backend = "fa3"
-                elif _is_hip:
-                    server_args.attention_backend = "aiter"
-                elif _is_npu:
-                    server_args.attention_backend = "ascend"
-                else:
-                    server_args.attention_backend = (
-                        "flashinfer" if is_flashinfer_available() else "triton"
-                    )
-            else:
-                # MLA architecture
-                if is_hopper_with_cuda_12_3():
-                    server_args.attention_backend = "fa3"
-                elif is_sm100_supported():
-                    server_args.attention_backend = "flashinfer"
-                elif _is_hip:
-                    head_num = self.model_config.get_num_kv_heads(self.tp_size)
-                    # TODO current aiter only support head number 16 or 128 head number
-                    if head_num == 128 or head_num == 16:
-                        server_args.attention_backend = "aiter"
-                    else:
-                        server_args.attention_backend = "triton"
-                elif _is_npu:
-                    server_args.attention_backend = "ascend"
-                else:
-                    server_args.attention_backend = "triton"
-            log_info_on_rank0(
-                logger,
-                f"Attention backend not explicitly specified. Use {server_args.attention_backend} backend by default.",
-            )
-        elif self.use_mla_backend:
-            if server_args.device != "cpu":
-                if server_args.attention_backend in MLA_ATTENTION_BACKENDS:
-                    logger.info(
-                        f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
-                    )
-                else:
-                    raise ValueError(
-                        f"Invalid attention backend for MLA: {server_args.attention_backend}"
-                    )
-            else:
-                if server_args.attention_backend != "intel_amx":
-                    raise ValueError(
-                        "MLA optimization not supported on CPU except for intel_amx backend."
-                    )
-
-        if (
-            server_args.attention_backend == "fa3"
-            and server_args.kv_cache_dtype == "fp8_e5m2"
-        ):
-            logger.warning(
-                "FlashAttention3 only supports fp8_e4m3 if using FP8; "
-                "Setting attention backend to triton."
-            )
-            server_args.attention_backend = "triton"
-
         if server_args.enable_double_sparsity:
             logger.info(
                 "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
@@ -643,37 +522,12 @@ class ModelRunner:
         if not server_args.disable_chunked_prefix_cache:
             log_info_on_rank0(logger, "Chunked prefix cache is turned on.")

-        if server_args.attention_backend == "aiter":
-            if self.model_config.context_len > 8192:
-                self.mem_fraction_static *= 0.85
-
-        if (
-            server_args.enable_hierarchical_cache
-            and server_args.hicache_io_backend == "kernel"
-        ):
-            # fix for the compatibility issue with FlashAttention3 decoding and HiCache kernel backend
-            if server_args.decode_attention_backend is None:
-                if not self.use_mla_backend:
-                    server_args.decode_attention_backend = (
-                        "flashinfer" if is_flashinfer_available() else "triton"
-                    )
-                else:
-                    server_args.decode_attention_backend = (
-                        "flashinfer" if is_sm100_supported() else "triton"
-                    )
-            elif server_args.decode_attention_backend == "fa3":
-                server_args.hicache_io_backend = "direct"
-                logger.warning(
-                    "FlashAttention3 decode backend is not compatible with hierarchical cache. "
-                    "Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
-                )
-

         if self.model_config.hf_config.model_type == "qwen3_vl_moe":
             if (
                 quantization_config := getattr(
                     self.model_config.hf_config, "quantization_config", None
                 )
-            ) is not None:
+            ) is not None and "weight_block_size" in quantization_config:
                 weight_block_size_n = quantization_config["weight_block_size"][0]

                 if self.tp_size % self.moe_ep_size != 0:
@@ -858,8 +712,6 @@ class ModelRunner:
         self.model_config = adjust_config_with_unaligned_cpu_tp(
             self.model_config, self.load_config, self.tp_size
         )
-        if self.server_args.load_format == "gguf":
-            monkey_patch_vllm_gguf_config()

         if self.server_args.load_format == LoadFormat.REMOTE_INSTANCE:
             if self.tp_rank == 0:
sglang/srt/model_executor/piecewise_cuda_graph_runner.py
@@ -32,7 +32,6 @@ from sglang.srt.distributed import get_tensor_model_parallel_rank
 from sglang.srt.distributed.device_communicators.pynccl_allocator import (
     set_graph_pool_id,
 )
-from sglang.srt.distributed.parallel_state import graph_capture
 from sglang.srt.layers.dp_attention import (
     DpPaddingMode,
     get_attention_tp_rank,
@@ -250,6 +249,9 @@ class PiecewiseCudaGraphRunner:
             lora_ids=None,
         )

+        # Attention backend
+        self.model_runner.attn_backend.init_forward_metadata(forward_batch)
+
         with set_forward_context(forward_batch, self.attention_layers):
             _ = self.model_runner.model.forward(
                 forward_batch.input_ids,
@@ -262,9 +264,14 @@

     def can_run(self, forward_batch: ForwardBatch):
         num_tokens = len(forward_batch.input_ids)
-        # TODO(yuwei): support return logprob
+        # TODO(yuwei): support return input_ids' logprob
         if forward_batch.return_logprob:
-            return False
+            for start_len, seq_len in zip(
+                forward_batch.extend_logprob_start_lens_cpu,
+                forward_batch.extend_seq_lens_cpu,
+            ):
+                if start_len is not None and start_len < seq_len:
+                    return False
         if num_tokens <= self.max_num_tokens:
             return True
         return False
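
The relaxed can_run check above now admits batches with return_logprob as long as no prompt-token logprobs are requested, i.e. every request's logprob start is at or past the end of its extend range. A standalone sketch of that predicate (a hypothetical helper; only the field names mirror the code above):

from typing import Optional, Sequence

def only_sampled_token_logprobs(
    extend_logprob_start_lens: Sequence[Optional[int]],
    extend_seq_lens: Sequence[int],
) -> bool:
    # True when no request needs logprobs for tokens inside its extend range,
    # which is the condition under which the piecewise CUDA graph path stays usable.
    return all(
        start_len is None or start_len >= seq_len
        for start_len, seq_len in zip(extend_logprob_start_lens, extend_seq_lens)
    )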
@@ -273,10 +280,10 @@
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
         # can reuse the memory pool allocated for the large shapes.
-        with freeze_gc(
-            self.model_runner.server_args.enable_cudagraph_gc
-        ), graph_capture() as graph_capture_context:
-            self.stream = graph_capture_context.stream
+        with freeze_gc(self.model_runner.server_args.enable_cudagraph_gc):
+            if self.model_runner.tp_group.ca_comm is not None:
+                old_ca_disable = self.model_runner.tp_group.ca_comm.disabled
+                self.model_runner.tp_group.ca_comm.disabled = True
             avail_mem = get_available_gpu_memory(
                 self.model_runner.device,
                 self.model_runner.gpu_id,
@@ -304,9 +311,10 @@

                 # Save gemlite cache after each capture
                 save_gemlite_cache()
+            if self.model_runner.tp_group.ca_comm is not None:
+                self.model_runner.tp_group.ca_comm.disabled = old_ca_disable

     def capture_one_batch_size(self, num_tokens: int):
-        stream = self.stream
         bs = 1

         # Graph inputs
@@ -370,9 +378,6 @@
         if lora_ids is not None:
             self.model_runner.lora_manager.prepare_lora_batch(forward_batch)

-        # # Attention backend
-        self.model_runner.attn_backend.init_forward_metadata(forward_batch)
-
         # Run and capture
         def run_once():
             # Clean intermediate result cache for DP attention
@@ -438,7 +443,7 @@
             out_cache_loc=out_cache_loc,
             seq_lens_sum=forward_batch.seq_lens_sum,
             encoder_lens=forward_batch.encoder_lens,
-            return_logprob=forward_batch.return_logprob,
+            return_logprob=False,
             extend_seq_lens=forward_batch.extend_seq_lens,
             extend_prefix_lens=forward_batch.extend_prefix_lens,
             extend_start_loc=forward_batch.extend_start_loc,
@@ -474,6 +479,9 @@
         forward_batch: ForwardBatch,
         **kwargs,
     ) -> Union[LogitsProcessorOutput, PPProxyTensors]:
+        if self.model_runner.tp_group.ca_comm is not None:
+            old_ca_disable = self.model_runner.tp_group.ca_comm.disabled
+            self.model_runner.tp_group.ca_comm.disabled = True
         static_forward_batch = self.replay_prepare(forward_batch, **kwargs)
         # Replay
         with set_forward_context(static_forward_batch, self.attention_layers):
@@ -499,6 +507,8 @@
             raise NotImplementedError(
                 "PPProxyTensors is not supported in PiecewiseCudaGraphRunner yet."
             )
+        if self.model_runner.tp_group.ca_comm is not None:
+            self.model_runner.tp_group.ca_comm.disabled = old_ca_disable

     def get_spec_info(self, num_tokens: int):
         spec_info = None
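
The same save/disable/restore handling of tp_group.ca_comm.disabled appears in both capture() and replay() above. Illustrative only, not part of the diff: the pattern written as a small context manager, assuming ca_comm is either None or exposes a mutable boolean disabled attribute:

from contextlib import contextmanager

@contextmanager
def ca_comm_disabled(ca_comm):
    # Temporarily disable custom allreduce, restoring the previous flag on exit.
    if ca_comm is None:
        yield
        return
    old_disabled = ca_comm.disabled
    ca_comm.disabled = True
    try:
        yield
    finally:
        ca_comm.disabled = old_disabled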