sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. sglang/bench_serving.py +72 -10
  2. sglang/srt/_custom_ops.py +59 -92
  3. sglang/srt/configs/deepseekvl2.py +10 -1
  4. sglang/srt/configs/model_config.py +6 -16
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/custom_op.py +5 -0
  7. sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
  8. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  9. sglang/srt/distributed/parallel_state.py +32 -5
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/entrypoints/http_server.py +7 -1
  12. sglang/srt/entrypoints/verl_engine.py +2 -0
  13. sglang/srt/function_call_parser.py +0 -1
  14. sglang/srt/layers/attention/flashattention_backend.py +582 -125
  15. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  17. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  18. sglang/srt/layers/dp_attention.py +12 -1
  19. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  20. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  21. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
  26. sglang/srt/layers/moe/topk.py +79 -6
  27. sglang/srt/layers/quantization/__init__.py +137 -165
  28. sglang/srt/layers/quantization/awq.py +200 -0
  29. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  30. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  31. sglang/srt/layers/quantization/fp8_kernel.py +2 -1
  32. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  33. sglang/srt/layers/quantization/gptq.py +30 -40
  34. sglang/srt/layers/quantization/moe_wna16.py +501 -0
  35. sglang/srt/layers/quantization/utils.py +1 -1
  36. sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
  37. sglang/srt/lora/backend/base_backend.py +4 -4
  38. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  39. sglang/srt/lora/backend/triton_backend.py +5 -8
  40. sglang/srt/lora/layers.py +19 -33
  41. sglang/srt/lora/lora_manager.py +20 -7
  42. sglang/srt/lora/mem_pool.py +12 -6
  43. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  44. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  45. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  46. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  47. sglang/srt/lora/utils.py +6 -0
  48. sglang/srt/managers/cache_controller.py +34 -11
  49. sglang/srt/managers/io_struct.py +4 -2
  50. sglang/srt/managers/mm_utils.py +202 -156
  51. sglang/srt/managers/multimodal_processor.py +0 -2
  52. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  53. sglang/srt/managers/multimodal_processors/clip.py +44 -0
  54. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  55. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  56. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  57. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  58. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  59. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  60. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  61. sglang/srt/managers/schedule_batch.py +185 -127
  62. sglang/srt/managers/scheduler.py +29 -23
  63. sglang/srt/managers/tokenizer_manager.py +1 -2
  64. sglang/srt/managers/tp_worker.py +3 -0
  65. sglang/srt/managers/utils.py +1 -6
  66. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  67. sglang/srt/mem_cache/memory_pool.py +72 -6
  68. sglang/srt/mem_cache/paged_allocator.py +39 -0
  69. sglang/srt/metrics/collector.py +23 -53
  70. sglang/srt/model_executor/cuda_graph_runner.py +16 -13
  71. sglang/srt/model_executor/forward_batch_info.py +10 -10
  72. sglang/srt/model_executor/model_runner.py +64 -59
  73. sglang/srt/model_loader/loader.py +19 -1
  74. sglang/srt/model_loader/weight_utils.py +6 -3
  75. sglang/srt/models/clip.py +568 -0
  76. sglang/srt/models/deepseek_janus_pro.py +12 -17
  77. sglang/srt/models/deepseek_v2.py +339 -123
  78. sglang/srt/models/deepseek_vl2.py +105 -104
  79. sglang/srt/models/gemma3_causal.py +12 -2
  80. sglang/srt/models/gemma3_mm.py +20 -80
  81. sglang/srt/models/llama.py +4 -1
  82. sglang/srt/models/llava.py +31 -19
  83. sglang/srt/models/llavavid.py +16 -7
  84. sglang/srt/models/minicpmo.py +63 -147
  85. sglang/srt/models/minicpmv.py +17 -27
  86. sglang/srt/models/mllama.py +29 -14
  87. sglang/srt/models/qwen2.py +9 -6
  88. sglang/srt/models/qwen2_5_vl.py +21 -31
  89. sglang/srt/models/qwen2_vl.py +20 -21
  90. sglang/srt/openai_api/adapter.py +106 -93
  91. sglang/srt/openai_api/protocol.py +10 -5
  92. sglang/srt/patch_torch.py +71 -0
  93. sglang/srt/platforms/interface.py +371 -0
  94. sglang/srt/server_args.py +120 -25
  95. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  96. sglang/srt/speculative/eagle_utils.py +140 -28
  97. sglang/srt/speculative/eagle_worker.py +94 -25
  98. sglang/srt/utils.py +137 -51
  99. sglang/test/runners.py +27 -2
  100. sglang/test/test_custom_ops.py +55 -0
  101. sglang/test/test_utils.py +14 -27
  102. sglang/utils.py +2 -2
  103. sglang/version.py +1 -1
  104. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
  105. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
  106. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
  107. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
  108. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from enum import Enum, auto
+
 # Copyright 2023-2024 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -51,7 +53,7 @@ from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, Forw
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_compiler_backend
+from sglang.srt.utils import flatten_nested_list, get_compiler_backend
 
 if TYPE_CHECKING:
     from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
@@ -70,14 +72,16 @@ global_server_args_dict = {
     "enable_dp_attention": ServerArgs.enable_dp_attention,
     "enable_ep_moe": ServerArgs.enable_ep_moe,
     "enable_deepep_moe": ServerArgs.enable_deepep_moe,
+    "deepep_mode": ServerArgs.deepep_mode,
     "device": ServerArgs.device,
     "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
     "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
-    "enable_flashinfer_mla": ServerArgs.enable_flashinfer_mla,
     "enable_flashmla": ServerArgs.enable_flashmla,
     "disable_radix_cache": ServerArgs.disable_radix_cache,
     "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
     "chunked_prefill_size": ServerArgs.chunked_prefill_size,
+    "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
+    "disable_shared_experts_fusion": ServerArgs.disable_shared_experts_fusion,
 }
 
 logger = logging.getLogger(__name__)
@@ -143,165 +147,185 @@ class FINISH_ABORT(BaseFinishReason):
         }
 
 
+class Modality(Enum):
+    IMAGE = auto()
+    MULTI_IMAGES = auto()
+    VIDEO = auto()
+    AUDIO = auto()
+
+
 @dataclasses.dataclass
-class MultimodalInputs:
-    """The image related inputs."""
+class MultimodalDataItem:
+    """
+    A single multimodal data, from a single image/video/audio or other
+    """
+
+    modality: Modality
+
+    hash: int = None
+    pad_value: int = None
 
-    pixel_values: Union[torch.Tensor, np.array]
-    data_hashes: Optional[list] = None
-    image_sizes: Optional[list] = None
+    aspect_ratio_id: Optional[List[torch.Tensor]] = None
+    aspect_ratio_mask: Optional[List[torch.Tensor]] = None
+
+    image_sizes: Tuple[int, int] = None
     image_offsets: Optional[list] = None
+
+    # the real data, pixel_values or audio_features
+    # data: Union[List[torch.Tensor], List[np.array]]
+    pixel_values: Union[torch.Tensor, np.array] = None
+    image_grid_thws: Union[torch.Tensor, np.array] = None
+    video_grid_thws: Union[torch.Tensor, np.array] = None
+
+    image_emb_mask: Optional[torch.Tensor] = None
+    image_spatial_crop: Optional[torch.Tensor] = None
+    second_per_grid_ts: Optional[List[torch.Tensor]] = None
+
+    # [num_images, (n, w, h)]
+    tgt_size: Tuple[int, int] = None
+
+    audio_features: Union[torch.Tensor, np.array] = None
+    audio_feature_lens: Optional[List[torch.Tensor]] = None
+
+    @staticmethod
+    def is_empty_list(l):
+        if l is None:
+            return True
+        return len([item for item in flatten_nested_list(l) if item is not None]) == 0
+
+    def set_pad_value(self):
+        """
+        Set the pad value after first hashign the data
+        """
+
+        def hash_feature(f):
+            if isinstance(f, list):
+                return hash(tuple(flatten_nested_list(f)))
+            elif isinstance(f, np.ndarray):
+                arr = np.ascontiguousarray(f)
+                arr_bytes = arr.tobytes()
+                return hash(arr_bytes)
+            return hash(f)
+
+        if self.is_audio():
+            self.hash = hash_feature(self.audio_features)
+        else:
+            self.hash = hash_feature(self.pixel_values)
+
+        assert self.hash is not None
+        self.pad_value = self.hash % (1 << 30)
+
+    def is_audio(self):
+        return (
+            self.modality == Modality.AUDIO
+        ) and not MultimodalDataItem.is_empty_list(self.audio_features)
+
+    def is_image(self):
+        return (
+            self.modality == Modality.IMAGE or self.modality == Modality.MULTI_IMAGES
+        ) and not MultimodalDataItem.is_empty_list(self.pixel_values)
+
+    def is_video(self):
+        return (
+            self.modality == Modality.VIDEO
+        ) and not MultimodalDataItem.is_empty_list(self.pixel_values)
+
+    def validate(self):
+        ...
+        # TODO
+
+
+@dataclasses.dataclass
+class MultimodalInputs:
+    """The multimodal data related inputs."""
+
+    # items of data
+    mm_items: List[MultimodalDataItem]
     image_pad_len: Optional[list] = None
-    pad_values: Optional[list] = None
-    modalities: Optional[list] = None
     num_image_tokens: Optional[int] = None
 
-    # Llava related
-    aspect_ratio_ids: Optional[List[torch.Tensor]] = None
-    aspect_ratio_mask: Optional[List[torch.Tensor]] = None
-
     # QWen2-VL related
-    # [num_of_images, t, h, w]
-    image_grid_thws: torch.Tensor = None
     mrope_position_delta: Optional[torch.Tensor] = None
-    # Qwen2-VL video related
-    video_token_id: Optional[int] = None
-    video_grid_thws: List[Tuple[int, int, int]] = None
-    second_per_grid_ts: Optional[List[torch.Tensor]] = None
 
-    # deepseek vl2 related
-    images_emb_mask: Optional[List[torch.Tensor]] = None
-    image_spatial_crop: Optional[List[torch.Tensor]] = None
-
-    # The id of the single-image placeholder token
+    # image
     im_token_id: Optional[torch.Tensor] = None
-
-    # All the images in the batch should share the same special image
-    # bound token ids.
     im_start_id: Optional[int] = None
     im_end_id: Optional[int] = None
     slice_start_id: Optional[int] = None
     slice_end_id: Optional[int] = None
-    # [num_images, 2 (w, h)]
-    tgt_sizes: Optional[list] = None
+
+    # video
+    video_token_id: Optional[int] = None
 
     # audio
    audio_start_id: Optional[torch.Tensor] = None
     audio_end_id: Optional[torch.Tensor] = None
-    audio_features: Optional[List[torch.Tensor]] = None
-    audio_feature_lens: Optional[List[torch.Tensor]] = None
 
     @staticmethod
     def from_dict(obj: dict):
         ret = MultimodalInputs(
-            pixel_values=obj["pixel_values"],
-            data_hashes=obj["data_hashes"],
+            mm_items=obj["mm_items"],
         )
 
+        assert isinstance(ret.mm_items, list)
+        ret.mm_items = [
+            item
+            for item in ret.mm_items
+            if item.is_audio() or item.is_image() or item.is_video()
+        ]
+
+        assert len(ret.mm_items) != 0
+
         # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
         # Please note that if the `input_ids` is later used in the model forward,
         # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
         # errors in cuda kernels. See also llava.py for example.
-        ret.pad_values = [x % (1 << 30) for x in ret.data_hashes]
+        for item in ret.mm_items:
+            item.set_pad_value()
 
         optional_args = [
-            "image_sizes",
             "modalities",
-            "aspect_ratio_ids",
-            "aspect_ratio_mask",
-            "image_grid_thws",
-            "images_emb_mask",
-            "image_spatial_crop",
             "im_token_id",
             "im_start_id",
             "im_end_id",
             "slice_start_id",
             "slice_end_id",
-            "tgt_sizes",
             "audio_start_id",
             "audio_end_id",
-            "audio_features",
-            "audio_feature_lens",
         ]
         for arg in optional_args:
             if arg in obj:
                 setattr(ret, arg, obj[arg])
 
-        # validate
-        assert (
-            isinstance(ret.pixel_values, torch.Tensor)
-            or isinstance(ret.pixel_values, np.ndarray)
-            or isinstance(ret.pixel_values, list)
-        )
-
-        assert ret.audio_features is None or isinstance(ret.audio_features, list)
-
         return ret
 
     def contains_image_inputs(self) -> bool:
         """ """
-        return self.pixel_values is not None and self.pixel_values != []
+        return any(item.is_image() for item in self.mm_items)
 
     def contains_audio_inputs(self) -> bool:
         """ """
-        return self.audio_features is not None and self.audio_features != []
+        return any(item.is_audio() for item in self.mm_items)
+
+    def collect_image_inputs(self) -> List[torch.Tensor]:
+        return [item.pixel_values for item in self.mm_items if item.is_image()]
 
     def merge(self, other: MultimodalInputs):
         """
         merge image inputs when requests are being merged
         """
-        if isinstance(self.pixel_values, list):
-            # in some rare cases, pixel values are list of patches with different shapes
-            # e.g. minicpm
-            self.pixel_values += other.pixel_values
-        else:
-            assert (
-                self.pixel_values.shape[1:] == other.pixel_values.shape[1:]
-            ), f"{self.pixel_values.shape[1:]} vs {other.pixel_values.shape[1:]}"
-            self.pixel_values = np.concatenate([self.pixel_values, other.pixel_values])
-
-        # args would be stacked along first dim
-        # usually these are already tensors
-        stack_args = [
-            # TODO: merge with image_grid_thws, basically the same thing
-            "tgt_sizes",
-            "image_spatial_crop",
-        ]
-        for arg in stack_args:
-            if getattr(self, arg, None) is None:
-                setattr(self, arg, getattr(other, arg, None))
-            elif getattr(other, arg, None) is not None:
-                # self and other both not None
-                setattr(
-                    self,
-                    arg,
-                    torch.cat([getattr(self, arg), getattr(other, arg)], dim=0),
-                )
-
-        if self.image_grid_thws is None:
-            self.image_grid_thws = other.image_grid_thws
-        elif other.image_grid_thws is not None:
-            self.image_grid_thws = torch.concat(
-                [self.image_grid_thws, other.image_grid_thws]
-            )
 
         # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
         # Please note that if the `input_ids` is later used in the model forward,
         # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
         # errors in cuda kernels. See also llava.py for example.
-        self.data_hashes += other.data_hashes
-        self.pad_values = [x % (1 << 30) for x in self.data_hashes]
 
         # args needed to be merged
         optional_args = [
-            "audio_features",
-            "image_sizes",
+            "items",
             "image_offsets",
             "image_pad_len",
             # "modalities", # modalities should be ["multi-images"] (one entry) even for multiple images
-            "aspect_ratio_ids",
-            "aspect_ratio_mask",
-            "images_emb_mask",
         ]
         for arg in optional_args:
             self_arg = getattr(self, arg, None)
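
The hunk above replaces the batch-wide parallel lists in MultimodalInputs with a list of per-item MultimodalDataItem objects, each carrying its own modality, raw features, hash, and pad_value. A minimal usage sketch follows; it assumes this hunk belongs to sglang/srt/managers/schedule_batch.py (item 61 in the file list above), and the dummy numpy image and assertions are purely illustrative, not part of the package:

import numpy as np

from sglang.srt.managers.schedule_batch import (
    Modality,
    MultimodalDataItem,
    MultimodalInputs,
)

# One item per image/audio clip instead of batch-wide parallel lists.
item = MultimodalDataItem(
    modality=Modality.IMAGE,
    pixel_values=np.zeros((3, 224, 224), dtype=np.float32),  # illustrative data
)
item.set_pad_value()  # hashes pixel_values; pad_value = hash % (1 << 30)

mm_inputs = MultimodalInputs.from_dict({"mm_items": [item]})
assert mm_inputs.contains_image_inputs()
assert not mm_inputs.contains_audio_inputs()

Keeping the hash and pad_value on each item mirrors the old pad_values list computed from data_hashes, but lets the radix cache key its prefix matching on individual images or audio clips rather than on one batch-wide list.
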
@@ -599,6 +623,7 @@ class Req:
         self.extend_logprob_start_len = 0
         self.is_chunked = 0
         self.req_pool_idx = None
+        self.already_computed = 0
 
     def __repr__(self):
         return (
@@ -740,11 +765,14 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             )
         return req_pool_indices
 
-    def alloc_token_slots(self, num_tokens: int):
+    def alloc_token_slots(self, num_tokens: int, backup_state: bool = False):
         if self.token_to_kv_pool_allocator.available_size() < num_tokens:
             if self.tree_cache is not None:
                 self.tree_cache.evict(num_tokens)
 
+        if backup_state:
+            state = self.token_to_kv_pool_allocator.backup_state()
+
         out_cache_loc = self.token_to_kv_pool_allocator.alloc(num_tokens)
         if out_cache_loc is None:
             phase_str = "Prefill" if self.forward_mode.is_extend() else "Decode"
@@ -758,7 +786,10 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
                 self.tree_cache.pretty_print()
             raise RuntimeError(error_msg)
 
-        return out_cache_loc
+        if backup_state:
+            return out_cache_loc, state
+        else:
+            return out_cache_loc
 
     def alloc_paged_token_slots_extend(
         self,
@@ -766,6 +797,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         seq_lens: torch.Tensor,
         last_loc: torch.Tensor,
         extend_num_tokens: int,
+        backup_state: bool = False,
     ):
         if (
             self.token_to_kv_pool_allocator.available_size()
@@ -778,6 +810,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
                     + len(seq_lens) * self.token_to_kv_pool_allocator.page_size,
                 )
 
+        if backup_state:
+            state = self.token_to_kv_pool_allocator.backup_state()
+
         out_cache_loc = self.token_to_kv_pool_allocator.alloc_extend(
             prefix_lens, seq_lens, last_loc, extend_num_tokens
         )
@@ -791,23 +826,31 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             )
             logger.error(error_msg)
             raise RuntimeError(error_msg)
-        return out_cache_loc
+
+        if backup_state:
+            return out_cache_loc, state
+        else:
+            return out_cache_loc
 
     def alloc_paged_token_slots_decode(
         self,
         seq_lens: torch.Tensor,
         last_loc: torch.Tensor,
+        backup_state: bool = False,
     ):
-        if (
-            self.token_to_kv_pool_allocator.available_size()
-            < len(seq_lens) * self.token_to_kv_pool_allocator.page_size
-        ):
-            if self.tree_cache is not None:
+        if self.tree_cache is not None:
+            if (
+                self.token_to_kv_pool_allocator.available_size()
+                < len(seq_lens) * self.token_to_kv_pool_allocator.page_size
+            ):
                 self.tree_cache.evict(
                     len(seq_lens) * self.token_to_kv_pool_allocator.page_size,
                 )
-        out_cache_loc = self.token_to_kv_pool_allocator.alloc_decode(seq_lens, last_loc)
 
+        if backup_state:
+            state = self.token_to_kv_pool_allocator.backup_state()
+
+        out_cache_loc = self.token_to_kv_pool_allocator.alloc_decode(seq_lens, last_loc)
         if out_cache_loc is None:
             error_msg = (
                 f"Decode out of memory. Try to lower your batch size.\n"
@@ -818,7 +861,11 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             )
             logger.error(error_msg)
             raise RuntimeError(error_msg)
-        return out_cache_loc
+
+        if backup_state:
+            return out_cache_loc, state
+        else:
+            return out_cache_loc
 
     def prepare_encoder_info_extend(self, input_ids: List[int], seq_lens: List[int]):
         self.encoder_lens_cpu = []
@@ -938,8 +985,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
                 # If req.input_embeds is already a list, append its content directly
                 input_embeds.extend(req.input_embeds)  # Use extend to avoid nesting
 
-            if req.is_retracted:
-                req.already_computed = 0
             req.cached_tokens += pre_len - req.already_computed
             req.already_computed = seq_len
             req.is_retracted = False
@@ -1095,17 +1140,25 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         # TODO (lianmin): Revisit this. It should be seq_len - 1
         self.extend_logprob_start_lens.extend([0] * running_bs)
 
-    def check_decode_mem(self, buf_multiplier=1):
-        bs = len(self.reqs) * buf_multiplier
-        if self.token_to_kv_pool_allocator.available_size() >= bs:
-            return True
+    def new_page_count_next_decode(self):
+        page_size = self.token_to_kv_pool_allocator.page_size
+        if page_size == 1:
+            return len(self.reqs)
+        return sum(1 for req in self.reqs if req.seqlen % page_size == 0)
 
-        self.tree_cache.evict(bs)
+    def check_decode_mem(self, buf_multiplier=1):
+        tokens_required = (
+            self.new_page_count_next_decode()
+            * buf_multiplier
+            * self.token_to_kv_pool_allocator.page_size
+        )
 
-        if self.token_to_kv_pool_allocator.available_size() >= bs:
+        if self.token_to_kv_pool_allocator.available_size() >= tokens_required:
             return True
 
-        return False
+        self.tree_cache.evict(tokens_required)
+
+        return self.token_to_kv_pool_allocator.available_size() >= tokens_required
 
     def retract_decode(self, server_args: ServerArgs):
         """Retract the decoding requests when there is not enough memory."""
@@ -1167,7 +1220,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
                 self.req_to_token_pool.free(req.req_pool_idx)
             else:
                 # TODO: apply more fine-grained retraction
-                last_uncached_pos = len(req.prefix_indices)
+                last_uncached_pos = (
+                    len(req.prefix_indices) // server_args.page_size
+                ) * server_args.page_size
                 token_indices = self.req_to_token_pool.req_to_token[
                     req.req_pool_idx, last_uncached_pos : seq_lens_cpu[idx]
                 ]
@@ -1373,20 +1428,25 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 
     def get_model_worker_batch(self) -> ModelWorkerBatch:
         if self.forward_mode.is_decode_or_idle():
-            if (
-                global_server_args_dict["enable_flashinfer_mla"]
-                or global_server_args_dict["enable_flashmla"]
-            ):
-                decode_seq_lens = self.seq_lens.cpu()
-            else:
-                decode_seq_lens = None
             extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens = None
         else:
-            decode_seq_lens = None
             extend_seq_lens = self.extend_lens
             extend_prefix_lens = self.prefix_lens
             extend_logprob_start_lens = self.extend_logprob_start_lens
 
+        # Create seq_lens_cpu when needed
+        if (
+            (
+                global_server_args_dict["use_mla_backend"]
+                and global_server_args_dict["attention_backend"] == "flashinfer"
+            )
+            or global_server_args_dict["enable_flashmla"]
+            or global_server_args_dict["attention_backend"] == "fa3"
+        ):
+            seq_lens_cpu = self.seq_lens.cpu()
+        else:
+            seq_lens_cpu = None
+
         if self.sampling_info:
             if self.has_grammar:
                 self.sampling_info.grammars = [req.grammar for req in self.reqs]
@@ -1409,7 +1469,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             global_num_tokens=self.global_num_tokens,
             global_num_tokens_for_logprob=self.global_num_tokens_for_logprob,
             can_run_dp_cuda_graph=self.can_run_dp_cuda_graph,
-            decode_seq_lens=decode_seq_lens,
+            seq_lens_cpu=seq_lens_cpu,
             extend_num_tokens=self.extend_num_tokens,
             extend_seq_lens=extend_seq_lens,
             extend_prefix_lens=extend_prefix_lens,
@@ -1470,6 +1530,7 @@ class ModelWorkerBatch:
     req_pool_indices: torch.Tensor
     # The sequence length
     seq_lens: torch.Tensor
+    seq_lens_cpu: Optional[torch.Tensor]
    # The indices of output tokens in the token_to_kv_pool_allocator
     out_cache_loc: torch.Tensor
 
@@ -1486,9 +1547,6 @@ class ModelWorkerBatch:
     global_num_tokens_for_logprob: Optional[List[int]]
     can_run_dp_cuda_graph: bool
 
-    # For decode
-    decode_seq_lens: Optional[torch.Tensor]
-
     # For extend
     extend_num_tokens: Optional[int]
     extend_seq_lens: Optional[List[int]]
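
The check_decode_mem rewrite above (hunk @@ -1095,17 +1140,25 @@) switches from reserving one token slot per running request to reserving whole pages: a request only needs a fresh page for its next decoded token when its current length sits exactly on a page boundary. A standalone sketch of that arithmetic, using made-up sequence lengths (plain Python, not sglang code):

# Mirrors new_page_count_next_decode()/check_decode_mem() from the hunk above.
def new_page_count_next_decode(seq_lens, page_size):
    if page_size == 1:
        return len(seq_lens)
    # Only sequences that exactly fill their last page need a new page next step.
    return sum(1 for seqlen in seq_lens if seqlen % page_size == 0)

seq_lens = [16, 17, 31, 32]   # hypothetical current lengths of four running requests
page_size = 16
pages_needed = new_page_count_next_decode(seq_lens, page_size)   # 2 (lengths 16 and 32)
tokens_required = pages_needed * page_size                       # 32 free slots must exist
print(pages_needed, tokens_required)

With page_size == 1 this degenerates to the old behavior of one slot per request; with larger pages it avoids over-reserving for requests that still have room in their current page.
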