sglang 0.4.7.post1__py3-none-any.whl → 0.4.8.post1__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (123)
  1. sglang/bench_one_batch.py +8 -6
  2. sglang/srt/_custom_ops.py +2 -2
  3. sglang/srt/code_completion_parser.py +2 -44
  4. sglang/srt/configs/model_config.py +1 -0
  5. sglang/srt/constants.py +3 -0
  6. sglang/srt/conversation.py +14 -3
  7. sglang/srt/custom_op.py +11 -1
  8. sglang/srt/disaggregation/base/conn.py +2 -0
  9. sglang/srt/disaggregation/decode.py +22 -28
  10. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  11. sglang/srt/disaggregation/mini_lb.py +34 -4
  12. sglang/srt/disaggregation/mooncake/conn.py +301 -64
  13. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
  14. sglang/srt/disaggregation/nixl/conn.py +94 -46
  15. sglang/srt/disaggregation/prefill.py +20 -15
  16. sglang/srt/disaggregation/utils.py +47 -18
  17. sglang/srt/distributed/parallel_state.py +12 -4
  18. sglang/srt/entrypoints/engine.py +27 -31
  19. sglang/srt/entrypoints/http_server.py +149 -79
  20. sglang/srt/entrypoints/http_server_engine.py +0 -3
  21. sglang/srt/entrypoints/openai/__init__.py +0 -0
  22. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +115 -34
  23. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  24. sglang/srt/entrypoints/openai/serving_chat.py +897 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +425 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +170 -0
  27. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  28. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  29. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  30. sglang/srt/entrypoints/openai/utils.py +72 -0
  31. sglang/srt/function_call/base_format_detector.py +7 -4
  32. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  33. sglang/srt/function_call/ebnf_composer.py +64 -10
  34. sglang/srt/function_call/function_call_parser.py +6 -6
  35. sglang/srt/function_call/llama32_detector.py +1 -1
  36. sglang/srt/function_call/mistral_detector.py +1 -1
  37. sglang/srt/function_call/pythonic_detector.py +1 -1
  38. sglang/srt/function_call/qwen25_detector.py +1 -1
  39. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  40. sglang/srt/layers/activation.py +28 -3
  41. sglang/srt/layers/attention/aiter_backend.py +5 -2
  42. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  43. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
  44. sglang/srt/layers/attention/flashattention_backend.py +43 -23
  45. sglang/srt/layers/attention/flashinfer_backend.py +9 -6
  46. sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
  47. sglang/srt/layers/attention/flashmla_backend.py +5 -2
  48. sglang/srt/layers/attention/tbo_backend.py +3 -3
  49. sglang/srt/layers/attention/triton_backend.py +19 -11
  50. sglang/srt/layers/communicator.py +5 -5
  51. sglang/srt/layers/dp_attention.py +11 -2
  52. sglang/srt/layers/layernorm.py +44 -2
  53. sglang/srt/layers/linear.py +18 -1
  54. sglang/srt/layers/logits_processor.py +14 -5
  55. sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
  56. sglang/srt/layers/moe/ep_moe/layer.py +286 -13
  57. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +19 -2
  58. sglang/srt/layers/moe/fused_moe_native.py +7 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +13 -2
  61. sglang/srt/layers/moe/fused_moe_triton/layer.py +148 -26
  62. sglang/srt/layers/moe/topk.py +117 -4
  63. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  64. sglang/srt/layers/quantization/fp8.py +25 -17
  65. sglang/srt/layers/quantization/fp8_utils.py +5 -4
  66. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  67. sglang/srt/layers/quantization/utils.py +5 -2
  68. sglang/srt/layers/rotary_embedding.py +144 -12
  69. sglang/srt/layers/sampler.py +1 -1
  70. sglang/srt/layers/vocab_parallel_embedding.py +14 -1
  71. sglang/srt/lora/lora_manager.py +173 -74
  72. sglang/srt/lora/mem_pool.py +49 -45
  73. sglang/srt/lora/utils.py +1 -1
  74. sglang/srt/managers/cache_controller.py +33 -15
  75. sglang/srt/managers/expert_distribution.py +21 -0
  76. sglang/srt/managers/io_struct.py +19 -14
  77. sglang/srt/managers/multimodal_processors/base_processor.py +44 -9
  78. sglang/srt/managers/multimodal_processors/gemma3n.py +97 -0
  79. sglang/srt/managers/schedule_batch.py +49 -32
  80. sglang/srt/managers/schedule_policy.py +70 -56
  81. sglang/srt/managers/scheduler.py +189 -68
  82. sglang/srt/managers/template_manager.py +226 -0
  83. sglang/srt/managers/tokenizer_manager.py +11 -8
  84. sglang/srt/managers/tp_worker.py +12 -2
  85. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  86. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  87. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  88. sglang/srt/mem_cache/chunk_cache.py +11 -16
  89. sglang/srt/mem_cache/hiradix_cache.py +34 -23
  90. sglang/srt/mem_cache/memory_pool.py +118 -114
  91. sglang/srt/mem_cache/radix_cache.py +20 -16
  92. sglang/srt/model_executor/cuda_graph_runner.py +77 -46
  93. sglang/srt/model_executor/forward_batch_info.py +18 -5
  94. sglang/srt/model_executor/model_runner.py +27 -8
  95. sglang/srt/model_loader/loader.py +50 -8
  96. sglang/srt/model_loader/weight_utils.py +100 -2
  97. sglang/srt/models/deepseek_nextn.py +35 -30
  98. sglang/srt/models/deepseek_v2.py +255 -30
  99. sglang/srt/models/gemma3n_audio.py +949 -0
  100. sglang/srt/models/gemma3n_causal.py +1009 -0
  101. sglang/srt/models/gemma3n_mm.py +511 -0
  102. sglang/srt/models/glm4.py +312 -0
  103. sglang/srt/models/hunyuan.py +771 -0
  104. sglang/srt/models/mimo_mtp.py +2 -18
  105. sglang/srt/reasoning_parser.py +21 -11
  106. sglang/srt/server_args.py +51 -9
  107. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
  108. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
  109. sglang/srt/speculative/eagle_utils.py +80 -8
  110. sglang/srt/speculative/eagle_worker.py +124 -41
  111. sglang/srt/torch_memory_saver_adapter.py +19 -15
  112. sglang/srt/two_batch_overlap.py +4 -1
  113. sglang/srt/utils.py +248 -11
  114. sglang/test/test_block_fp8_ep.py +1 -0
  115. sglang/test/test_utils.py +1 -0
  116. sglang/version.py +1 -1
  117. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/METADATA +4 -10
  118. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/RECORD +121 -105
  119. sglang/srt/entrypoints/verl_engine.py +0 -179
  120. sglang/srt/openai_api/adapter.py +0 -2148
  121. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/WHEEL +0 -0
  122. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/licenses/LICENSE +0 -0
  123. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/top_level.txt +0 -0
--- a/sglang/srt/managers/io_struct.py
+++ b/sglang/srt/managers/io_struct.py
@@ -226,11 +226,11 @@ class GenerateReqInput:

         # Expand input based on type
         self._expand_inputs(num)
+        self._normalize_rid(num)
         self._normalize_lora_paths(num)
         self._normalize_image_data(num)
         self._normalize_audio_data(num)
         self._normalize_sampling_params(num)
-        self._normalize_rid(num)
         self._normalize_logprob_params(num)
         self._normalize_custom_logit_processor(num)

@@ -319,8 +319,16 @@ class GenerateReqInput:
         """Normalize request IDs for batch processing."""
         if self.rid is None:
             self.rid = [uuid.uuid4().hex for _ in range(num)]
-        elif not isinstance(self.rid, list):
-            raise ValueError("The rid should be a list for batch processing.")
+        elif isinstance(self.rid, str):
+            new_rids = [f"{self.rid}_{i}" for i in range(num)]
+            self.rid = new_rids
+        elif isinstance(self.rid, list):
+            if len(self.rid) != num:
+                raise ValueError(
+                    "The specified rids length mismatch with the batch_size for batch processing."
+                )
+        else:
+            raise ValueError("The rid should be a string or a list of strings.")

     def _normalize_logprob_params(self, num):
         """Normalize logprob-related parameters for batch processing."""
@@ -530,6 +538,7 @@ class EmbeddingReqInput:
         if self.text is not None:
             if isinstance(self.text, list):
                 self.batch_size += len(self.text)
+                self.is_single = False
             else:
                 self.batch_size += 1

@@ -537,12 +546,10 @@ class EmbeddingReqInput:
         if self.input_ids is not None:
             if isinstance(self.input_ids[0], list):
                 self.batch_size += len(self.input_ids)
+                self.is_single = False
             else:
                 self.batch_size += 1

-        if self.batch_size > 1:
-            self.is_single = False
-
         # Fill in default arguments
         if self.is_single:
             if self.rid is None:
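With this change, is_single is derived from the input's shape rather than the final batch_size, so a one-element list is now treated as a batch. A quick illustration of the new rule (the helper name is hypothetical, and it assumes is_single defaults to True as the old batch_size > 1 test implies):

    def is_single_after_normalize(text):
        # Any list input is a batch, even a list of length 1.
        return not isinstance(text, list)

    print(is_single_after_normalize("hello"))    # True
    print(is_single_after_normalize(["hello"]))  # False; the old batch_size > 1 test kept this True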
@@ -812,7 +819,9 @@ class GetWeightsByNameReqOutput:

 @dataclass
 class ReleaseMemoryOccupationReqInput:
-    pass
+    # Optional tags to identify the memory region, which is primarily used for RL
+    # Currently we only support `weights` and `kv_cache`
+    tags: Optional[List[str]] = None


 @dataclass
@@ -822,7 +831,9 @@ class ReleaseMemoryOccupationReqOutput:

 @dataclass
 class ResumeMemoryOccupationReqInput:
-    pass
+    # Optional tags to identify the memory region, which is primarily used for RL
+    # Currently we only support `weights` and `kv_cache`
+    tags: Optional[List[str]] = None


 @dataclass
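Both structs now let an RL trainer name which memory region to release or restore instead of toggling everything at once. A hedged construction sketch (the dataclass is re-declared locally so the snippet runs standalone; the usage scenario is an example, not taken from this diff):

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class ReleaseMemoryOccupationReqInput:
        # Per the diff, only `weights` and `kv_cache` are supported tags.
        tags: Optional[List[str]] = None

    # e.g. drop only the KV cache around a weight sync, keeping weights resident:
    req = ReleaseMemoryOccupationReqInput(tags=["kv_cache"])
    print(req.tags)  # ['kv_cache']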
@@ -861,12 +872,6 @@ class SetInternalStateReq:
     server_args: Dict[str, Any]


-@dataclass
-class V1RerankReqInput:
-    query: str
-    documents: List[str]
-
-
 @dataclass
 class SetInternalStateReqOutput:
     updated: bool
--- a/sglang/srt/managers/multimodal_processors/base_processor.py
+++ b/sglang/srt/managers/multimodal_processors/base_processor.py
@@ -23,6 +23,7 @@ class MultimodalInputFormat(Enum):
     RAW_IMAGES = "raw_images"
     PRECOMPUTED_FEATURES = "precomputed_features"
     PIXEL_VALUES = "pixel_values"
+    AUDIO = "audio"


 @dataclasses.dataclass
@@ -441,10 +442,13 @@ class BaseMultimodalProcessor(ABC):
             has_image = False
             has_pixel_values = False
             has_precomputed_features = False
+            has_audio = False

             for mm_input in mm_inputs:
                 if isinstance(mm_input, Image.Image):
                     has_image = True
+                elif isinstance(mm_input, np.ndarray):
+                    has_audio = True
                 elif isinstance(mm_input, dict):
                     if mm_input.get("precomputed_features", None) is not None:
                         has_precomputed_features = True
@@ -461,13 +465,13 @@ class BaseMultimodalProcessor(ABC):

             # Validate format consistency
             format_count = sum(
-                [has_image, has_pixel_values, has_precomputed_features]
+                [has_image, has_pixel_values, has_precomputed_features, has_audio]
             )
             if format_count > 1:
                 raise ValueError(
                     "Unsupported: mixture of multimodal input formats. "
                     f"Found formats: image={has_image}, pixel_values={has_pixel_values}, "
-                    f"precomputed_features={has_precomputed_features}"
+                    f"precomputed_features={has_precomputed_features}, audio={has_audio}"
                 )

             if has_image:
@@ -476,6 +480,8 @@ class BaseMultimodalProcessor(ABC):
                 return MultimodalInputFormat.PRECOMPUTED_FEATURES
             elif has_pixel_values:
                 return MultimodalInputFormat.PIXEL_VALUES
+            elif has_audio:
+                return MultimodalInputFormat.AUDIO
             else:
                 raise ValueError("No valid multimodal input format found")
         except Exception as e:
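The categorization above treats any np.ndarray item as raw audio. A self-contained simplification of that detection rule (assuming, as the method does, that only one format may appear per request):

    import numpy as np
    from PIL import Image

    def categorize(mm_inputs):
        # Simplified mirror of the detection above: exactly one format allowed.
        has_image = any(isinstance(x, Image.Image) for x in mm_inputs)
        has_audio = any(isinstance(x, np.ndarray) for x in mm_inputs)
        if has_image and has_audio:
            raise ValueError("Unsupported: mixture of multimodal input formats.")
        if has_image:
            return "raw_images"
        if has_audio:
            return "audio"
        raise ValueError("No valid multimodal input format found")

    print(categorize([np.zeros(16000, dtype=np.float32)]))  # audio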
@@ -521,20 +527,47 @@ class BaseMultimodalProcessor(ABC):
             input_ids = tokenize_text(base_output.input_text)
             return combined_mm_item, input_ids

+        def process_audio(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process inputs with audio."""
+            ret = self.process_mm_data(
+                input_text=base_output.input_text,
+                audio=base_output.audios,  # Note: "audio" is for gemma3n only
+            )
+            combined_mm_item = MultimodalDataItem(modality=Modality.AUDIO)
+            for key, value in ret.items():
+                if key != "input_ids" and hasattr(combined_mm_item, key):
+                    setattr(combined_mm_item, key, value)
+            input_ids = ret["input_ids"].flatten()
+            return combined_mm_item, input_ids
+
         def finalize_mm_item(
             combined_mm_item: MultimodalDataItem, input_ids: torch.Tensor
         ) -> MultimodalDataItem:
             """Apply common post-processing to the multimodal item."""
-            combined_mm_item.image_offsets = self.get_mm_items_offset(
-                input_ids=input_ids,
-                mm_token_id=self.IM_TOKEN_ID,
-            )
+            if combined_mm_item.modality in [Modality.IMAGE, Modality.MULTI_IMAGES]:
+                combined_mm_item.image_offsets = self.get_mm_items_offset(
+                    input_ids=input_ids,
+                    mm_token_id=self.IM_TOKEN_ID,
+                )
+            elif combined_mm_item.modality == Modality.AUDIO:
+                combined_mm_item.audio_offsets = self.get_mm_items_offset(
+                    input_ids=input_ids,
+                    mm_token_id=self.AUDIO_TOKEN_ID,
+                )
+            elif combined_mm_item.modality == Modality.VIDEO:
+                combined_mm_item.video_offsets = self.get_mm_items_offset(
+                    input_ids=input_ids,
+                    mm_token_id=self.VIDEO_TOKEN_ID,
+                )
+            else:
+                raise ValueError(f"Unknown modality: {combined_mm_item.modality}")
             return combined_mm_item

-        # Main logic
-        mm_inputs = base_output.images
+        # Main logic - determine input type and handle text-only case
+        mm_inputs = base_output.images or base_output.audios
         if not mm_inputs:
-            # Return text-only case
             input_ids = tokenize_text(base_output.input_text)
             return None, input_ids

@@ -548,6 +581,8 @@ class BaseMultimodalProcessor(ABC):
             combined_mm_item, input_ids = process_precomputed_features(base_output)
         elif input_format == MultimodalInputFormat.PIXEL_VALUES:
             combined_mm_item, input_ids = process_pixel_values(base_output)
+        elif input_format == MultimodalInputFormat.AUDIO:
+            combined_mm_item, input_ids = process_audio(base_output)
         else:
             raise ValueError(f"Unknown input format: {input_format}")

--- /dev/null
+++ b/sglang/srt/managers/multimodal_processors/gemma3n.py
@@ -0,0 +1,97 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import re
+from typing import Dict, List, Optional, Union
+
+from sglang.srt.managers.multimodal_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    MultimodalSpecialTokens,
+)
+from sglang.srt.models.gemma3n_mm import Gemma3nForConditionalGeneration
+
+
+class Gemma3nSGLangProcessor(SGLangBaseProcessor):
+    """Multimodal processor for Gemma3n supporting image and audio inputs."""
+
+    models = [Gemma3nForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+
+        self.IMAGE_TOKEN = "<image_soft_token>"
+        self.IMAGE_TOKEN_REGEX = re.compile(
+            r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+        )
+
+        self.AUDIO_TOKEN = "<audio_soft_token>"
+        self.AUDIO_TOKEN_REGEX = re.compile(
+            r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
+        )
+
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.IM_START_TOKEN_ID = hf_config.boi_token_id
+        self.IM_END_TOKEN_ID = hf_config.eoi_token_id
+
+        self.AUDIO_TOKEN_ID = hf_config.audio_token_id
+        self.AUDIO_START_TOKEN_ID = hf_config.boa_token_id
+        self.AUDIO_END_TOKEN_ID = hf_config.eoa_token_id
+
+    async def process_mm_data_async(
+        self,
+        image_data: Optional[List[Union[str, bytes, Dict]]] = None,
+        audio_data: Optional[List[Union[str, bytes, Dict]]] = None,
+        input_text: str = "",
+        request_obj=None,
+        max_req_input_len: int = 0,
+        *args,
+        **kwargs,
+    ):
+        """Process multimodal data including images and audio."""
+
+        audio_data = request_obj.audio_data
+        if not image_data and not audio_data:
+            return None
+
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
+        if isinstance(audio_data, str):
+            audio_data = [audio_data]
+
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            audio_data=audio_data,
+            max_req_input_len=max_req_input_len,
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN,
+                image_token_regex=self.IMAGE_TOKEN_REGEX,
+                audio_token=self.AUDIO_TOKEN,
+                audio_token_regex=self.AUDIO_TOKEN_REGEX,
+            ),
+        )
+
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
+            "im_start_id": self.IM_START_TOKEN_ID,
+            "im_end_id": self.IM_END_TOKEN_ID,
+            "audio_start_id": self.AUDIO_START_TOKEN_ID,
+            "audio_end_id": self.AUDIO_END_TOKEN_ID,
+        }
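The two token regexes in this file accept either a bare start marker or a fully expanded placeholder run. A runnable check of the audio pattern taken verbatim from the file above:

    import re

    AUDIO_TOKEN_REGEX = re.compile(
        r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
    )

    # Matches the bare start marker as well as an expanded placeholder run.
    print(bool(AUDIO_TOKEN_REGEX.search("hi <start_of_audio>")))  # True
    print(bool(AUDIO_TOKEN_REGEX.search(
        "<start_of_audio><audio_soft_token><audio_soft_token><end_of_audio>"
    )))  # True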
--- a/sglang/srt/managers/schedule_batch.py
+++ b/sglang/srt/managers/schedule_batch.py
@@ -38,7 +38,7 @@ import logging
 import threading
 from enum import Enum, auto
 from http import HTTPStatus
-from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple, Union

 import numpy as np
 import torch
@@ -54,9 +54,10 @@ from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
 )
 from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
 from sglang.srt.layers.multimodal import gpu_tensor_hash
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
-from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
 from sglang.srt.metrics.collector import TimeStats
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
@@ -85,6 +86,7 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "enable_deepep_moe",
     "deepep_mode",
     "enable_ep_moe",
+    "enable_flashinfer_moe",
     "moe_dense_tp_size",
     "ep_dispatch_algorithm",
     "deepep_config",
@@ -99,6 +101,7 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "torchao_config",
     "triton_attention_reduce_in_fp32",
     "num_reserved_decode_tokens",
+    "weight_loader_disable_mmap",
 ]

 # Put some global args for easy access
@@ -211,6 +214,10 @@ class MultimodalDataItem:
     audio_feature_lens: Optional[List[torch.Tensor]] = None
     audio_offsets: Optional[List[Tuple[int, int]]] = None

+    # gemma3n related
+    input_features: Optional[torch.Tensor] = None
+    input_features_mask: Optional[torch.Tensor] = None
+
     precomputed_features: Optional[Union[torch.Tensor, np.ndarray]] = None

     @staticmethod
@@ -274,7 +281,10 @@ class MultimodalDataItem:
         if self.precomputed_features is not None:
             self.hash = hash_feature(self.precomputed_features)
         elif self.is_audio():
-            self.hash = hash_feature(self.audio_features)
+            if self.audio_features is not None:
+                self.hash = hash_feature(self.audio_features)
+            elif self.input_features is not None:
+                self.hash = hash_feature(self.input_features)
         else:
             self.hash = hash_feature(self.pixel_values)

@@ -285,6 +295,7 @@ class MultimodalDataItem:
         return (self.modality == Modality.AUDIO) and (
             self.precomputed_features is not None
             or not MultimodalDataItem.is_empty_list(self.audio_features)
+            or not MultimodalDataItem.is_empty_list(self.input_features)
         )

     def is_image(self):
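The hashing hunk above implies a priority order among feature sources. A simplified, runnable sketch of that order (the Item stand-in is hypothetical and drops the modality gating of the real method):

    from dataclasses import dataclass
    from typing import Any, Optional

    @dataclass
    class Item:  # hypothetical stand-in for MultimodalDataItem
        precomputed_features: Optional[Any] = None
        audio_features: Optional[Any] = None
        input_features: Optional[Any] = None
        pixel_values: Optional[Any] = None

    def pick_hash_source(item: Item):
        # Precomputed features win, then classic audio features, then the
        # gemma3n input_features, falling back to pixel values.
        if item.precomputed_features is not None:
            return item.precomputed_features
        if item.audio_features is not None:
            return item.audio_features
        if item.input_features is not None:
            return item.input_features
        return item.pixel_values

    print(pick_hash_source(Item(input_features="gemma3n-mel-spectrogram")))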
@@ -436,7 +447,7 @@ class Req:
         self,
         rid: str,
         origin_input_text: str,
-        origin_input_ids: Tuple[int],
+        origin_input_ids: List[int],
         sampling_params: SamplingParams,
         return_logprob: bool = False,
         top_logprobs_num: int = 0,
@@ -467,7 +478,7 @@ class Req:
         # Each decode stage's output ids
         self.output_ids = []
         # fill_ids = origin_input_ids + output_ids. Updated if chunked.
-        self.fill_ids = None
+        self.fill_ids = []
         self.session_id = session_id
         self.input_embeds = input_embeds

@@ -519,13 +530,14 @@ class Req:

         # Prefix info
         # The indices to kv cache for the shared prefix.
-        self.prefix_indices = []
+        self.prefix_indices: torch.Tensor = []
         # Number of tokens to run prefill.
         self.extend_input_len = 0
         # The relative logprob_start_len in an extend batch
         self.extend_logprob_start_len = 0
-        self.last_node = None
-        self.last_node_global = None
+        self.last_node: Any = None
+        self.last_host_node: Any = None
+        self.host_hit_length = 0

         # Whether or not if it is chunked. It increments whenever
         # it is chunked, and decrement whenever chunked request is
@@ -583,6 +595,7 @@ class Req:
             self.output_token_ids_logprobs_idx
         ) = None
         self.hidden_states: List[List[float]] = []
+        self.hidden_states_tensor = None  # Note: use tensor instead of list to transfer hidden_states when PD + MTP

         # Embedding (return values)
         self.embedding = None
@@ -644,29 +657,17 @@ class Req:
     def init_next_round_input(
         self,
         tree_cache: Optional[BasePrefixCache] = None,
-        enable_hierarchical_cache=False,
     ):
         self.fill_ids = self.origin_input_ids + self.output_ids
         if tree_cache is not None:
-            # tree cache is None if the prefix is not computed with tree cache.
-            if enable_hierarchical_cache:
-                self.prefix_indices, self.last_node, self.last_node_global = (
-                    tree_cache.match_prefix(
-                        key=self.adjust_max_prefix_ids(), include_evicted=True
-                    )
-                )
-            else:
-                self.prefix_indices, self.last_node = tree_cache.match_prefix(
-                    rid=self.rid, key=self.adjust_max_prefix_ids()
-                )
-        elif enable_hierarchical_cache:
-            # in case last_node is evicted during scheduling, we need to update the prefix_indices
-            while self.last_node.evicted:
-                self.prefix_indices = self.prefix_indices[
-                    : -len(self.last_node.host_value)
-                ]
-                self.last_node = self.last_node.parent
-
+            (
+                self.prefix_indices,
+                self.last_node,
+                self.last_host_node,
+                self.host_hit_length,
+            ) = tree_cache.match_prefix(
+                key=self.adjust_max_prefix_ids(),
+            )
         self.extend_input_len = len(self.fill_ids) - len(self.prefix_indices)

     def adjust_max_prefix_ids(self):
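As the hunk shows, match_prefix now returns a single four-tuple whether or not hierarchical caching is enabled, folding the host-tier result into (prefix_indices, last_node, last_host_node, host_hit_length). A hedged sketch of the contract for a cache with no host (CPU) tier (class and values are hypothetical):

    from typing import Any, List, Tuple

    class FlatPrefixCache:
        """Hypothetical minimal cache honoring the new 4-tuple contract."""

        def match_prefix(self, key: List[int]) -> Tuple[List[int], Any, Any, int]:
            # No device match, no tree nodes, and zero host-hit length.
            return [], None, None, 0

    cache = FlatPrefixCache()
    prefix_indices, last_node, last_host_node, host_hit_length = cache.match_prefix(
        key=[1, 2, 3]
    )
    print(len(prefix_indices), host_hit_length)  # 0 0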
@@ -796,6 +797,7 @@ class Req:
         self.multimodal_inputs = None
         self.grammar = None
         self.origin_input_ids = [0]  # set it to one token to skip the long prefill
+        self.return_logprob = False
         self.finished_reason = FINISH_ABORT(
             error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
         )
@@ -820,7 +822,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # Request, memory pool, and cache
     reqs: List[Req]
     req_to_token_pool: ReqToTokenPool = None
-    token_to_kv_pool_allocator: TokenToKVPoolAllocator = None
+    token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator = None
     tree_cache: BasePrefixCache = None

     # Batch configs
@@ -862,6 +864,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     global_num_tokens: Optional[List[int]] = None
     global_num_tokens_for_logprob: Optional[List[int]] = None
     can_run_dp_cuda_graph: bool = False
+    is_extend_in_batch: bool = False
     tbo_split_seq_index: Optional[int] = None
     global_forward_mode: Optional[ForwardMode] = None
@@ -908,12 +911,15 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # Whether to return hidden states
     return_hidden_states: bool = False

+    # hicache pointer for synchronizing data loading from CPU to GPU
+    hicache_consumer_index: int = 0
+
    @classmethod
     def init_new(
         cls,
         reqs: List[Req],
         req_to_token_pool: ReqToTokenPool,
-        token_to_kv_pool_allocator: TokenToKVPoolAllocator,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
         tree_cache: BasePrefixCache,
         model_config: ModelConfig,
         enable_overlap: bool,
@@ -1365,7 +1371,11 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             return len(self.reqs)
         # In the decoding phase, the length of a request's KV cache should be
         # the total length of the request minus 1
-        return sum(1 for req in self.reqs if (req.seqlen - 1) % page_size == 0)
+        return (
+            sum(1 for req in self.reqs if req.seqlen % page_size == 0)
+            if self.enable_overlap
+            else sum(1 for req in self.reqs if (req.seqlen - 1) % page_size == 0)
+        )

     def check_decode_mem(self, buf_multiplier=1):
         tokens_required = (
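The overlap scheduler observes seqlen one step ahead, so the page-boundary test shifts by one. A worked check of when a decode step needs a fresh page (page_size = 4 is an arbitrary illustration):

    page_size = 4

    def needs_new_page(seqlen, enable_overlap):
        # Overlap mode tracks seqlen one token ahead, hence the shifted test.
        if enable_overlap:
            return seqlen % page_size == 0
        return (seqlen - 1) % page_size == 0

    # Non-overlap: seqlen=9 means 8 KV tokens fill two pages; token 9 opens page 3.
    print(needs_new_page(9, enable_overlap=False))  # True
    # Overlap: the same request is observed with seqlen=8.
    print(needs_new_page(8, enable_overlap=True))   # True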
@@ -1734,6 +1744,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             token_type_ids=self.token_type_ids,
             spec_algorithm=self.spec_algorithm,
             spec_info=self.spec_info,
+            hicache_consumer_index=self.hicache_consumer_index,
             capture_hidden_mode=(
                 CaptureHiddenMode.FULL
                 if self.return_hidden_states
@@ -1760,11 +1771,15 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             decoding_reqs=self.decoding_reqs,
             spec_algorithm=self.spec_algorithm,
             enable_custom_logit_processor=self.enable_custom_logit_processor,
+            global_num_tokens=self.global_num_tokens,
+            global_num_tokens_for_logprob=self.global_num_tokens_for_logprob,
+            can_run_dp_cuda_graph=self.can_run_dp_cuda_graph,
+            is_extend_in_batch=self.is_extend_in_batch,
         )

     def __str__(self):
         return (
-            f"ScheduleBatch(forward_mode={self.forward_mode.name}, "
+            f"ScheduleBatch(forward_mode={self.forward_mode.name if self.forward_mode else 'None'}, "
             f"#req={(len(self.reqs))})"
         )

@@ -1833,6 +1848,8 @@ class ModelWorkerBatch:
     spec_info: Optional[Union[EagleVerifyInput, EagleDraftInput]] = None
     # If set, the output of the batch contains the hidden states of the run.
     capture_hidden_mode: CaptureHiddenMode = None
+    spec_num_draft_tokens: Optional[int] = None
+    hicache_consumer_index: int = 0

     # Overlap event
     launch_done: Optional[threading.Event] = None