sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_serving.py +1 -1
  4. sglang/lang/interpreter.py +40 -1
  5. sglang/lang/ir.py +27 -0
  6. sglang/math_utils.py +8 -0
  7. sglang/srt/configs/model_config.py +6 -0
  8. sglang/srt/conversation.py +6 -0
  9. sglang/srt/disaggregation/base/__init__.py +1 -1
  10. sglang/srt/disaggregation/base/conn.py +25 -11
  11. sglang/srt/disaggregation/common/__init__.py +5 -1
  12. sglang/srt/disaggregation/common/utils.py +42 -0
  13. sglang/srt/disaggregation/decode.py +196 -51
  14. sglang/srt/disaggregation/fake/__init__.py +1 -1
  15. sglang/srt/disaggregation/fake/conn.py +15 -9
  16. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  17. sglang/srt/disaggregation/mooncake/conn.py +18 -13
  18. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  19. sglang/srt/disaggregation/nixl/conn.py +17 -12
  20. sglang/srt/disaggregation/prefill.py +128 -43
  21. sglang/srt/disaggregation/utils.py +127 -123
  22. sglang/srt/entrypoints/engine.py +15 -1
  23. sglang/srt/entrypoints/http_server.py +13 -2
  24. sglang/srt/eplb_simulator/__init__.py +1 -0
  25. sglang/srt/eplb_simulator/reader.py +51 -0
  26. sglang/srt/layers/activation.py +19 -0
  27. sglang/srt/layers/attention/aiter_backend.py +15 -2
  28. sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
  29. sglang/srt/layers/attention/flashattention_backend.py +53 -64
  30. sglang/srt/layers/attention/flashinfer_backend.py +1 -2
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
  32. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  33. sglang/srt/layers/attention/triton_backend.py +119 -119
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  35. sglang/srt/layers/attention/vision.py +51 -24
  36. sglang/srt/layers/communicator.py +23 -5
  37. sglang/srt/layers/linear.py +0 -4
  38. sglang/srt/layers/logits_processor.py +0 -12
  39. sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
  40. sglang/srt/layers/moe/ep_moe/layer.py +42 -32
  41. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
  42. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
  43. sglang/srt/layers/moe/topk.py +16 -8
  44. sglang/srt/layers/pooler.py +56 -0
  45. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  46. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
  47. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  49. sglang/srt/layers/quantization/fp8_kernel.py +44 -15
  50. sglang/srt/layers/quantization/fp8_utils.py +87 -22
  51. sglang/srt/layers/radix_attention.py +2 -3
  52. sglang/srt/lora/lora_manager.py +79 -34
  53. sglang/srt/lora/mem_pool.py +4 -5
  54. sglang/srt/managers/cache_controller.py +2 -1
  55. sglang/srt/managers/io_struct.py +28 -4
  56. sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
  57. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  58. sglang/srt/managers/schedule_batch.py +39 -6
  59. sglang/srt/managers/scheduler.py +73 -17
  60. sglang/srt/managers/tokenizer_manager.py +29 -2
  61. sglang/srt/mem_cache/chunk_cache.py +1 -0
  62. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  63. sglang/srt/mem_cache/memory_pool.py +111 -407
  64. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  65. sglang/srt/mem_cache/radix_cache.py +36 -12
  66. sglang/srt/model_executor/cuda_graph_runner.py +122 -55
  67. sglang/srt/model_executor/forward_batch_info.py +14 -5
  68. sglang/srt/model_executor/model_runner.py +6 -6
  69. sglang/srt/model_loader/loader.py +8 -1
  70. sglang/srt/models/bert.py +113 -13
  71. sglang/srt/models/deepseek_v2.py +113 -155
  72. sglang/srt/models/internvl.py +46 -102
  73. sglang/srt/models/roberta.py +117 -9
  74. sglang/srt/models/vila.py +305 -0
  75. sglang/srt/openai_api/adapter.py +162 -4
  76. sglang/srt/openai_api/protocol.py +37 -1
  77. sglang/srt/sampling/sampling_batch_info.py +24 -0
  78. sglang/srt/sampling/sampling_params.py +2 -0
  79. sglang/srt/server_args.py +318 -233
  80. sglang/srt/speculative/build_eagle_tree.py +1 -1
  81. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
  82. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
  83. sglang/srt/speculative/eagle_utils.py +389 -109
  84. sglang/srt/speculative/eagle_worker.py +134 -43
  85. sglang/srt/two_batch_overlap.py +4 -2
  86. sglang/srt/utils.py +58 -0
  87. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  88. sglang/test/runners.py +38 -3
  89. sglang/test/test_block_fp8.py +1 -0
  90. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  91. sglang/test/test_block_fp8_ep.py +1 -0
  92. sglang/test/test_utils.py +3 -1
  93. sglang/utils.py +9 -0
  94. sglang/version.py +1 -1
  95. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
  96. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
  97. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/radix_attention.py
@@ -18,7 +18,6 @@ from typing import Optional
 
 from torch import nn
 
-from sglang.srt.layers.linear import UnquantizedLinearMethod
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
@@ -52,9 +51,9 @@ class RadixAttention(nn.Module):
         sliding_window_size: int = -1,
         is_cross_attention: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
-        attn_type=AttentionType.DECODER,
-        prefix: str = "",
+        attn_type: AttentionType = AttentionType.DECODER,
         use_irope: bool = False,
+        prefix: str = "",
     ):
         super().__init__()
         self.tp_q_head_num = num_heads
sglang/srt/lora/lora_manager.py
@@ -81,7 +81,7 @@ class LoRAManager:
             seg_indptr=torch.zeros(
                 self.max_bs_in_cuda_graph + 1, dtype=torch.int32
             ),
-            max_len=0,
+            max_len=1,
             weight_indices=torch.zeros(
                 self.max_bs_in_cuda_graph, dtype=torch.int32
             ),
@@ -89,6 +89,17 @@ class LoRAManager:
             scalings=torch.zeros(self.max_loras_per_batch, dtype=torch.float),
         )
 
+        # Initialize seg_lens and seg_indptr for CUDA graph as they remain constant
+        # across batches.
+        self.cuda_graph_batch_info.seg_lens[: self.max_bs_in_cuda_graph].fill_(1)
+        torch.cumsum(
+            self.cuda_graph_batch_info.seg_lens[: self.max_bs_in_cuda_graph],
+            dim=0,
+            out=self.cuda_graph_batch_info.seg_indptr[
+                1 : self.max_bs_in_cuda_graph + 1
+            ],
+        )
+
     def init_loras(self):
         # Config of each LoRA adapter
         self.configs: Dict[str, LoRAConfig] = {}
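
Note: the initialization above relies on decode batches contributing exactly one token per request, so both buffers can be filled once at CUDA-graph capture time. A standalone sketch of the idea (the max_bs value is illustrative, not taken from sglang):

import torch

# Illustrative CUDA-graph capture size; sglang uses max_bs_in_cuda_graph here.
max_bs = 4

seg_lens = torch.zeros(max_bs, dtype=torch.int32)
seg_indptr = torch.zeros(max_bs + 1, dtype=torch.int32)

# Every decode request contributes one token, so segment lengths are all 1
# and the prefix sum is simply 0, 1, 2, ..., max_bs.
seg_lens.fill_(1)
torch.cumsum(seg_lens, dim=0, out=seg_indptr[1:])

print(seg_lens.tolist())    # [1, 1, 1, 1]
print(seg_indptr.tolist())  # [0, 1, 2, 3, 4]
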
@@ -159,6 +170,45 @@ class LoRAManager:
         # set up batch info shared by all lora modules
         bs = forward_batch.batch_size
 
+        def transfer_adapter_info(
+            weight_indices_out: torch.Tensor,
+            lora_ranks_out: torch.Tensor,
+            scalings_out: torch.Tensor,
+        ):
+            """
+            Transfer adapter metadata (weight indices, LoRA rank, scalings) from host
+            to device (CUDA) asynchronously.
+            """
+            weight_indices = [0] * len(forward_batch.lora_paths)
+            lora_ranks = [0] * self.max_loras_per_batch
+            scalings = [0] * self.max_loras_per_batch
+            for i, lora_path in enumerate(forward_batch.lora_paths):
+                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
+                if lora_path is not None:
+                    lora = self.loras[lora_path]
+                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
+                    scalings[weight_indices[i]] = lora.scaling
+
+            # Use pinned memory to avoid synchronizations during host-to-device transfer
+            weight_indices_tensor = torch.tensor(
+                weight_indices, dtype=torch.int32, pin_memory=True, device="cpu"
+            )
+            lora_ranks_tensor = torch.tensor(
+                lora_ranks, dtype=torch.int32, pin_memory=True, device="cpu"
+            )
+            scalings_tensor = torch.tensor(
+                scalings, dtype=torch.float, pin_memory=True, device="cpu"
+            )
+
+            # Copy to device tensors asynchronously
+            weight_indices_out[:bs].copy_(weight_indices_tensor, non_blocking=True)
+            lora_ranks_out[: self.max_loras_per_batch].copy_(
+                lora_ranks_tensor, non_blocking=True
+            )
+            scalings_out[: self.max_loras_per_batch].copy_(
+                scalings_tensor, non_blocking=True
+            )
+
         if (
             hasattr(self, "max_bs_in_cuda_graph")
             and bs <= self.max_bs_in_cuda_graph
@@ -166,51 +216,46 @@ class LoRAManager:
         ):
             # Do in-place updates when CUDA graph is enabled and the batch forward mode
             # could use CUDA graph.
-            self.cuda_graph_batch_info.bs = bs
-            self.cuda_graph_batch_info.seg_lens[:bs].fill_(1)
-            torch.cumsum(
-                self.cuda_graph_batch_info.seg_lens[:bs],
-                dim=0,
-                out=self.cuda_graph_batch_info.seg_indptr[1 : bs + 1],
+
+            transfer_adapter_info(
+                self.cuda_graph_batch_info.weight_indices,
+                self.cuda_graph_batch_info.lora_ranks,
+                self.cuda_graph_batch_info.scalings,
             )
-            self.cuda_graph_batch_info.max_len = 1
 
-            for i, lora_path in enumerate(forward_batch.lora_paths):
-                self.cuda_graph_batch_info.weight_indices[i] = (
-                    self.memory_pool.get_buffer_id(lora_path)
-                )
-                if lora_path is not None:
-                    lora = self.loras[lora_path]
-                    self.cuda_graph_batch_info.lora_ranks[
-                        self.cuda_graph_batch_info.weight_indices[i]
-                    ] = lora.config.hf_config["r"]
-                    self.cuda_graph_batch_info.scalings[
-                        self.cuda_graph_batch_info.weight_indices[i]
-                    ] = lora.scaling
+            self.cuda_graph_batch_info.bs = bs
+            self.cuda_graph_batch_info.max_len = 1
             batch_info = self.cuda_graph_batch_info
         else:
+            weight_indices = torch.empty((bs,), dtype=torch.int32, device=self.device)
+            lora_ranks = torch.zeros(
+                (self.max_loras_per_batch,), dtype=torch.int64, device=self.device
+            )
+            scalings = torch.zeros(
+                (self.max_loras_per_batch,), dtype=torch.float, device=self.device
+            )
+            transfer_adapter_info(
+                weight_indices,
+                lora_ranks,
+                scalings,
+            )
+
             seg_lens = (
                 forward_batch.extend_seq_lens
                 if forward_batch.forward_mode.is_extend()
                 else torch.ones(bs, device=self.device)
             )
+
+            max_len = (
+                # Calculate max_len from the CPU copy to avoid D2H transfer.
+                max(forward_batch.extend_seq_lens_cpu)
+                if forward_batch.forward_mode.is_extend()
+                else 1
+            )
+
             seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
             seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
-            max_len = int(torch.max(seg_lens))
-            weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
 
-            lora_ranks = torch.zeros(
-                (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
-            )
-            scalings = torch.zeros(
-                (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
-            )
-            for i, lora_path in enumerate(forward_batch.lora_paths):
-                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
-                if lora_path is not None:
-                    lora = self.loras[lora_path]
-                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
-                    scalings[weight_indices[i]] = lora.scaling
             batch_info = LoRABatchInfo(
                 bs=bs,
                 seg_lens=seg_lens,
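
Note: the new transfer_adapter_info path stages adapter metadata in pinned host memory and then issues non-blocking copies, so both the CUDA-graph branch and the regular branch avoid stalling on a host-device synchronization. A minimal standalone sketch of that pattern (tensor names and sizes are illustrative, not sglang code):

import torch

if torch.cuda.is_available():
    # Host-side staging buffer in pinned (page-locked) memory.
    host_vals = torch.tensor([0, 2, 1, 0], dtype=torch.int32, pin_memory=True)

    # Pre-allocated device destination, e.g. a CUDA-graph input buffer.
    device_buf = torch.zeros(8, dtype=torch.int32, device="cuda")

    # non_blocking=True only overlaps with compute when the source is pinned;
    # with pageable memory PyTorch falls back to a synchronous copy.
    device_buf[: host_vals.numel()].copy_(host_vals, non_blocking=True)
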
sglang/srt/lora/mem_pool.py
@@ -132,12 +132,13 @@ class LoRAMemoryPool:
             for buffer_id in range(self.max_loras_per_batch):
                 # Prioritize empty slots
                 if self.buffer_id_to_uid[buffer_id] == "":
-                    return buffer_id, ""
+                    return buffer_id
 
             for buffer_id in range(self.max_loras_per_batch):
                 # Evict unneeded lora
                 if self.buffer_id_to_uid[buffer_id] not in cur_uids:
-                    return buffer_id, self.buffer_id_to_uid[buffer_id]
+                    self.uid_to_buffer_id.pop(self.buffer_id_to_uid[buffer_id])
+                    return buffer_id
 
             raise ValueError(
                 "No available buffer slots found. Please ensure the number of active loras is less than max_loras_per_batch."
@@ -145,9 +146,7 @@ class LoRAMemoryPool:
 
         for uid in cur_uids:
             if uid not in self.uid_to_buffer_id:
-                buffer_id, evicted_lora_uid = get_available_buffer_slot()
-                if evicted_lora_uid != "":
-                    self.uid_to_buffer_id.pop(evicted_lora_uid)
+                buffer_id = get_available_buffer_slot()
                 self.load_lora_weight_to_buffer(
                     uid, buffer_id, lora_adapters.get(uid, None)
                 )
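
Note: after this refactor the slot-selection policy is self-contained: prefer an empty slot, otherwise evict an adapter the current batch does not need and drop its uid mapping in the same step. A small standalone model of that policy (plain list/dict stand-ins, not the sglang pool):

def get_available_buffer_slot(buffer_id_to_uid, uid_to_buffer_id, cur_uids):
    # Prioritize empty slots.
    for buffer_id, uid in enumerate(buffer_id_to_uid):
        if uid == "":
            return buffer_id

    # Otherwise evict a loaded adapter that the current batch does not need.
    for buffer_id, uid in enumerate(buffer_id_to_uid):
        if uid not in cur_uids:
            uid_to_buffer_id.pop(uid)
            return buffer_id

    raise ValueError("No available buffer slots found.")
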
sglang/srt/managers/cache_controller.py
@@ -22,7 +22,8 @@ from typing import List, Optional
 
 import torch
 
-from sglang.srt.mem_cache.memory_pool import HostKVCache, TokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache
 
 logger = logging.getLogger(__name__)
 
sglang/srt/managers/io_struct.py
@@ -87,7 +87,7 @@ class GenerateReqInput:
 
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
-    # LoRA related
+    # The path to the LoRA
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
     # Session info for continual prompting
@@ -99,7 +99,7 @@ class GenerateReqInput:
     custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
 
     # Whether to return hidden states
-    return_hidden_states: bool = False
+    return_hidden_states: Union[List[bool], bool] = False
 
     # For disaggregated inference
     bootstrap_host: Optional[Union[List[str], str]] = None
@@ -409,7 +409,11 @@ class GenerateReqInput:
                 if self.custom_logit_processor is not None
                 else None
             ),
-            return_hidden_states=self.return_hidden_states,
+            return_hidden_states=(
+                self.return_hidden_states[i]
+                if isinstance(self.return_hidden_states, list)
+                else self.return_hidden_states
+            ),
             # if `__getitem__` is called, the bootstrap_host, bootstrap_port, bootstrap_room must be a list
             bootstrap_host=(
                 self.bootstrap_host[i] if self.bootstrap_host is not None else None
@@ -477,7 +481,7 @@ class TokenizedGenerateReqInput:
 @dataclass
 class EmbeddingReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
-    text: Optional[Union[List[str], str]] = None
+    text: Optional[Union[List[List[str]], List[str], str]] = None
     # The image input. It can be an image instance, file name, URL, or base64 encoded string.
     # Can be formatted as:
     # - Single image for a single request
@@ -501,6 +505,8 @@ class EmbeddingReqInput:
     log_metrics: bool = True
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
+    # For cross-encoder requests
+    is_cross_encoder_request: bool = False
 
     def contains_mm_input(self) -> bool:
         return has_valid_data(self.image_data) or has_valid_data(self.audio_data)
@@ -560,6 +566,16 @@ class EmbeddingReqInput:
         return self.rid
 
     def __getitem__(self, i):
+        if self.is_cross_encoder_request:
+            return EmbeddingReqInput(
+                text=[self.text[i]] if self.text is not None else None,
+                input_ids=None,
+                image_data=None,
+                sampling_params=self.sampling_params[i],
+                rid=self.rid[i],
+                is_cross_encoder_request=True,
+            )
+
         return EmbeddingReqInput(
             text=self.text[i] if self.text is not None else None,
             input_ids=self.input_ids[i] if self.input_ids is not None else None,
@@ -579,6 +595,8 @@ class TokenizedEmbeddingReqInput:
     input_ids: List[int]
     # The image inputs
     image_inputs: dict
+    # The token type ids
+    token_type_ids: List[int]
     # Dummy sampling params for compatibility
     sampling_params: SamplingParams
 
@@ -843,6 +861,12 @@ class SetInternalStateReq:
     server_args: Dict[str, Any]
 
 
+@dataclass
+class V1RerankReqInput:
+    query: str
+    documents: List[str]
+
+
 @dataclass
 class SetInternalStateReqOutput:
     updated: bool
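
Note: V1RerankReqInput is a plain dataclass, so a rerank payload maps directly onto its two fields; a minimal illustration (field values are made up, and the endpoint wiring lives in the OpenAI-compatible adapter changed elsewhere in this release):

from dataclasses import asdict

from sglang.srt.managers.io_struct import V1RerankReqInput

req = V1RerankReqInput(
    query="what is sglang?",
    documents=[
        "SGLang is a fast serving framework for large language models.",
        "A wheel is a built-package format for Python.",
    ],
)

print(asdict(req))  # {'query': ..., 'documents': [...]}
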
sglang/srt/managers/multimodal_processors/base_processor.py
@@ -146,7 +146,7 @@ class BaseMultimodalProcessor(ABC):
         request_obj,
         max_req_input_len,
         **kwargs,
-    ):
+    ) -> Optional[Dict[str, Any]]:
         pass
 
     def get_estimated_frames_list(self, image_data):
@@ -261,7 +261,7 @@ class BaseMultimodalProcessor(ABC):
 
     def load_mm_data(
         self,
-        prompt: str,
+        prompt: str | List[int],
         multimodal_tokens: MultimodalSpecialTokens,
         max_req_input_len: int,
         image_data: Optional[list] = None,
sglang/srt/managers/multimodal_processors/vila.py (new file)
@@ -0,0 +1,85 @@
+from typing import Any, Dict, List, Optional, Type, cast
+
+import torch.nn as nn
+from transformers.configuration_utils import PretrainedConfig
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+from sglang.srt.managers.io_struct import (
+    EmbeddingReqInput,
+    GenerateReqInput,
+    ImageDataItem,
+)
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.server_args import ServerArgs
+
+
+class VILAProcessor(ProcessorMixin):
+    """A stub class for the VILA processor."""
+
+    tokenizer: PreTrainedTokenizerBase
+
+
+class VILAMultimodalProcessor(BaseMultimodalProcessor):
+    models: List[Type[nn.Module]] = [VILAForConditionalGeneration]
+
+    _processor: VILAProcessor
+
+    def __init__(
+        self,
+        hf_config: PretrainedConfig,
+        server_args: ServerArgs,
+        _processor: VILAProcessor,
+    ) -> None:
+        super().__init__(hf_config, server_args, _processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: Optional[ImageDataItem | List[ImageDataItem]],
+        input_text: str | List[int],
+        request_obj: GenerateReqInput | EmbeddingReqInput,
+        max_req_input_len: int,
+        **kwargs,
+    ) -> Optional[Dict[str, Any]]:
+        if not image_data:
+            return None
+
+        if not isinstance(image_data, list):
+            image_data = [image_data]
+
+        mm_data = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self._processor.tokenizer.image_token
+            ),
+            max_req_input_len=max_req_input_len,
+            image_data=image_data,
+        )
+
+        inputs = self.process_mm_data(
+            input_text=mm_data.input_text,
+            images=mm_data.images,
+        )
+
+        image_offsets = self.get_mm_items_offset(
+            input_ids=inputs.input_ids[0],
+            mm_token_id=cast(int, self._processor.tokenizer.image_token_id),
+        )
+
+        mm_items: List[MultimodalDataItem] = [
+            MultimodalDataItem(
+                modality=Modality.IMAGE,
+                image_offsets=image_offsets,
+                pixel_values=inputs.pixel_values,
+            )
+        ]
+
+        return dict(
+            input_ids=inputs.input_ids[0].tolist(),
+            mm_items=mm_items,
+        )
sglang/srt/managers/schedule_batch.py
@@ -72,32 +72,33 @@ INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 
 GLOBAL_SERVER_ARGS_KEYS = [
     "attention_backend",
+    "mm_attention_backend",
     "debug_tensor_dump_inject",
     "debug_tensor_dump_output_folder",
     "chunked_prefill_size",
-    "deepep_mode",
     "device",
     "disable_chunked_prefix_cache",
     "disable_radix_cache",
-    "enable_deepep_moe",
     "enable_dp_attention",
     "enable_two_batch_overlap",
     "enable_dp_lm_head",
+    "enable_deepep_moe",
+    "deepep_mode",
     "enable_ep_moe",
+    "moe_dense_tp_size",
+    "ep_dispatch_algorithm",
     "deepep_config",
+    "ep_num_redundant_experts",
     "enable_nan_detection",
     "flashinfer_mla_disable_ragged",
     "max_micro_batch_size",
-    "moe_dense_tp_size",
-    "ep_dispatch_algorithm",
     "disable_shared_experts_fusion",
     "sampling_backend",
     "speculative_accept_threshold_acc",
     "speculative_accept_threshold_single",
     "torchao_config",
     "triton_attention_reduce_in_fp32",
-    "ep_num_redundant_experts",
-    "mm_attention_backend",
+    "num_reserved_decode_tokens",
 ]
 
 # Put some global args for easy access
@@ -444,6 +445,7 @@ class Req:
         origin_input_ids_unpadded: Optional[Tuple[int]] = None,
         lora_path: Optional[str] = None,
         input_embeds: Optional[List[List[float]]] = None,
+        token_type_ids: List[int] = None,
         session_id: Optional[str] = None,
         custom_logit_processor: Optional[str] = None,
         return_hidden_states: bool = False,
@@ -469,6 +471,9 @@ class Req:
         self.session_id = session_id
         self.input_embeds = input_embeds
 
+        # for corss-endoder model
+        self.token_type_ids = token_type_ids
+
         # Sampling info
         if isinstance(sampling_params.custom_params, dict):
             sampling_params = copy.copy(sampling_params)
@@ -840,6 +845,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # Batched arguments to model runner
     input_ids: torch.Tensor = None  # shape: [b], int64
     input_embeds: torch.Tensor = None  # shape: [b, hidden_size], float32
+    token_type_ids: torch.Tensor = None  # shape: [b], int64
     req_pool_indices: torch.Tensor = None  # shape: [b], int64
     seq_lens: torch.Tensor = None  # shape: [b], int64
     # The output locations of the KV cache
@@ -1141,6 +1147,10 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         prefix_lens = [len(r.prefix_indices) for r in reqs]
         extend_lens = [r.extend_input_len for r in reqs]
 
+        token_type_ids = [
+            r.token_type_ids for r in reqs if r.token_type_ids is not None
+        ]
+
         req_pool_indices_tensor = torch.tensor(req_pool_indices, dtype=torch.int64).to(
             self.device, non_blocking=True
         )
@@ -1153,6 +1163,13 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         prefix_lens_tensor = torch.tensor(
             prefix_lens, dtype=torch.int64, device=self.device
         )
+
+        token_type_ids_tensor = None
+        if len(token_type_ids) > 0:
+            token_type_ids_tensor = torch.tensor(
+                sum(token_type_ids, []), dtype=torch.int64
+            ).to(self.device, non_blocking=True)
+
         extend_lens_tensor = seq_lens_tensor - prefix_lens_tensor
 
         # Copy prefix and do some basic check
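
Note: sum(token_type_ids, []) concatenates the per-request token-type lists in request order, matching the concatenated layout of the batch's input_ids before a single host-to-device copy. A standalone illustration (values are made up):

import torch

# Per-request token type ids, e.g. 0 = query tokens and 1 = document tokens
# for a cross-encoder; lengths may differ between requests.
token_type_ids = [
    [0, 0, 0, 1, 1],
    [0, 0, 1, 1, 1, 1],
]

flat = sum(token_type_ids, [])  # [0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1]
token_type_ids_tensor = torch.tensor(flat, dtype=torch.int64)
print(token_type_ids_tensor.shape)  # torch.Size([11])
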
@@ -1268,6 +1285,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             self.device, non_blocking=True
         )
         self.multimodal_inputs = multimodal_inputs
+        self.token_type_ids = token_type_ids_tensor
         self.seq_lens_sum = sum(seq_lens)
 
         if self.return_logprob:
@@ -1414,6 +1432,11 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             req = self.reqs[idx]
             retracted_reqs.append(req)
 
+            if server_args.disaggregation_mode == "decode":
+                req.offload_kv_cache(
+                    self.req_to_token_pool, self.token_to_kv_pool_allocator
+                )
+
             if isinstance(self.tree_cache, ChunkCache):
                 # ChunkCache does not have eviction
                 token_indices = self.req_to_token_pool.req_to_token[
@@ -1445,6 +1468,12 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 
             req.reset_for_retract()
 
+        if len(retracted_reqs) == 0:
+            # Corner case: only one request left
+            raise ValueError(
+                "Failed to retract any request. No space left for only one request."
+            )
+
         self.filter_batch(keep_indices=sorted_indices)
 
         # Reqs in batch are filtered
@@ -1702,6 +1731,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             lora_paths=[req.lora_path for req in self.reqs],
             sampling_info=self.sampling_info,
             input_embeds=self.input_embeds,
+            token_type_ids=self.token_type_ids,
             spec_algorithm=self.spec_algorithm,
             spec_info=self.spec_info,
             capture_hidden_mode=(
@@ -1795,6 +1825,9 @@ class ModelWorkerBatch:
     # The input Embeds
     input_embeds: Optional[torch.tensor] = None
 
+    # For corss-encoder model
+    token_type_ids: Optional[torch.Tensor] = None
+
     # Speculative decoding
     spec_algorithm: SpeculativeAlgorithm = None
     spec_info: Optional[Union[EagleVerifyInput, EagleDraftInput]] = None