sglang 0.4.5.post3__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +19 -3
- sglang/bench_serving.py +8 -9
- sglang/compile_deep_gemm.py +45 -4
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +1 -1
- sglang/srt/configs/model_config.py +9 -3
- sglang/srt/constrained/llguidance_backend.py +78 -61
- sglang/srt/conversation.py +34 -1
- sglang/srt/disaggregation/decode.py +59 -11
- sglang/srt/disaggregation/mini_lb.py +45 -8
- sglang/srt/disaggregation/mooncake/conn.py +198 -31
- sglang/srt/disaggregation/prefill.py +24 -9
- sglang/srt/entrypoints/http_server.py +8 -2
- sglang/srt/function_call_parser.py +77 -5
- sglang/srt/layers/attention/base_attn_backend.py +3 -0
- sglang/srt/layers/attention/flashattention_backend.py +28 -10
- sglang/srt/layers/attention/flashmla_backend.py +8 -11
- sglang/srt/layers/attention/vision.py +2 -0
- sglang/srt/layers/layernorm.py +38 -16
- sglang/srt/layers/logits_processor.py +2 -2
- sglang/srt/layers/moe/fused_moe_native.py +2 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -15
- sglang/srt/layers/pooler.py +6 -0
- sglang/srt/layers/quantization/awq.py +5 -1
- sglang/srt/layers/quantization/deep_gemm.py +17 -10
- sglang/srt/layers/quantization/int8_kernel.py +32 -1
- sglang/srt/layers/radix_attention.py +13 -3
- sglang/srt/layers/rotary_embedding.py +170 -126
- sglang/srt/managers/data_parallel_controller.py +10 -3
- sglang/srt/managers/io_struct.py +7 -0
- sglang/srt/managers/mm_utils.py +85 -28
- sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
- sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
- sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
- sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
- sglang/srt/managers/schedule_batch.py +29 -12
- sglang/srt/managers/scheduler.py +31 -20
- sglang/srt/managers/tokenizer_manager.py +5 -1
- sglang/srt/mem_cache/memory_pool.py +87 -0
- sglang/srt/model_executor/cuda_graph_runner.py +4 -3
- sglang/srt/model_executor/forward_batch_info.py +51 -95
- sglang/srt/model_executor/model_runner.py +11 -24
- sglang/srt/models/deepseek.py +12 -2
- sglang/srt/models/deepseek_nextn.py +101 -6
- sglang/srt/models/deepseek_v2.py +144 -70
- sglang/srt/models/deepseek_vl2.py +9 -4
- sglang/srt/models/gemma3_causal.py +1 -1
- sglang/srt/models/llama4.py +0 -1
- sglang/srt/models/minicpmo.py +5 -1
- sglang/srt/models/mllama4.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +3 -6
- sglang/srt/models/qwen2_vl.py +3 -7
- sglang/srt/models/roberta.py +178 -0
- sglang/srt/openai_api/adapter.py +18 -8
- sglang/srt/server_args.py +15 -22
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/torch_memory_saver_adapter.py +10 -1
- sglang/srt/utils.py +2 -1
- sglang/test/runners.py +6 -13
- sglang/test/test_utils.py +36 -18
- sglang/version.py +1 -1
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/METADATA +4 -5
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/RECORD +70 -68
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/WHEEL +1 -1
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/top_level.txt +0 -0
sglang/srt/layers/rotary_embedding.py
CHANGED
@@ -14,8 +14,6 @@ _is_cuda = is_cuda()
 
 if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
-else:
-    from vllm._custom_ops import rotary_embedding as vllm_rotary_embedding
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -84,6 +82,12 @@ class RotaryEmbedding(CustomOp):
         # NOTE(ByronHsu): cache needs to be in FP32 for numerical stability
         if not _is_cuda:
             cache = cache.to(dtype)
+
+        if not _is_cuda or self.head_size not in [64, 128, 256, 512]:
+            from vllm._custom_ops import rotary_embedding
+
+            self.vllm_rotary_embedding = rotary_embedding
+
         self.cos_sin_cache: torch.Tensor
         self.register_buffer("cos_sin_cache", cache, persistent=False)
 
@@ -160,7 +164,7 @@ class RotaryEmbedding(CustomOp):
             )
         else:
             self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
-            vllm_rotary_embedding(
+            self.vllm_rotary_embedding(
                 positions,
                 query,
                 key,
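Note: with this change the sgl_kernel rope op is used only on CUDA and only for the head sizes it supports; otherwise `rotary_embedding` is imported lazily from `vllm._custom_ops` in `__init__` and called through `self.vllm_rotary_embedding`. A minimal standalone sketch of that selection rule (the helper below is illustrative, not part of sglang):

# Illustrative only: mirrors the backend-selection logic added in this diff.
_SGL_KERNEL_HEAD_SIZES = (64, 128, 256, 512)  # sizes the fused CUDA kernel handles


def pick_rope_backend(is_cuda: bool, head_size: int) -> str:
    """Return which rotary-embedding path the layer would take."""
    if is_cuda and head_size in _SGL_KERNEL_HEAD_SIZES:
        return "sgl_kernel.apply_rope_with_cos_sin_cache_inplace"
    # Fallback mirrors `self.vllm_rotary_embedding = rotary_embedding`,
    # imported lazily from vllm._custom_ops only when needed.
    return "vllm._custom_ops.rotary_embedding"


assert pick_rope_backend(True, 128).startswith("sgl_kernel")
assert pick_rope_backend(True, 96).startswith("vllm")
assert pick_rope_backend(False, 128).startswith("vllm")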
@@ -665,6 +669,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
+        dtype = query.dtype
         query_rot = query[..., : self.rotary_dim]
         key_rot = key[..., : self.rotary_dim]
         if self.rotary_dim < self.head_size:
@@ -695,7 +700,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         else:
             query = query_rot
             key = key_rot
-        return query, key
+        return query.to(dtype), key.to(dtype)
 
 
 class Llama3RotaryEmbedding(RotaryEmbedding):
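Note: `forward_native` in `DeepseekScalingRotaryEmbedding` now records the incoming dtype and casts the rotated query/key back before returning, so bf16/fp16 callers are not silently upcast by the fp32 cos/sin cache. A tiny illustration of the pattern, with a made-up stand-in for the actual rotation:

import torch


def rotate_preserving_dtype(query: torch.Tensor, cos_sin_cache: torch.Tensor) -> torch.Tensor:
    """Toy stand-in for the change: do the math in the cache's precision,
    then cast the result back to the caller's dtype."""
    dtype = query.dtype                                   # remember bf16/fp16
    out = query.to(cos_sin_cache.dtype) * cos_sin_cache   # placeholder for the real rotation
    return out.to(dtype)                                  # mirrors `return query.to(dtype), key.to(dtype)`


q = torch.randn(2, 4, dtype=torch.bfloat16)
cache = torch.randn(4, dtype=torch.float32)
assert rotate_preserving_dtype(q, cache).dtype == torch.bfloat16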
@@ -876,142 +881,181 @@ class MRotaryEmbedding(RotaryEmbedding):
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key
 
+    # Copied from https://github.com/huggingface/transformers/blob/c8e0e603de9b3d49161a15fe6e8ea84badfb5d02/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1439
     @staticmethod
-    def
-
-        image_grid_thw: Union[List[List[int]], torch.Tensor],
-        video_grid_thw: Union[List[List[int]], torch.Tensor],
+    def get_rope_index(
+        spatial_merge_size: int,
         image_token_id: int,
         video_token_id: int,
         vision_start_token_id: int,
-
-        spatial_merge_size: int,
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-        second_per_grid_ts: Optional[torch.Tensor] = None,
+        model_type: str,
         tokens_per_second: Optional[int] = None,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            input_tokens_tensor == vision_start_token_id
-        ).squeeze(1)
-        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
-        image_nums = (vision_tokens == image_token_id).sum()
-        video_nums = (vision_tokens == video_token_id).sum()
-        llm_pos_ids_list: list = []
-
-        st = 0
-        remain_images, remain_videos = image_nums, video_nums
-
-        image_index, video_index = 0, 0
-        for _ in range(image_nums + video_nums):
-            if image_token_id in input_tokens and remain_images > 0:
-                ed_image = input_tokens.index(image_token_id, st)
-            else:
-                ed_image = len(input_tokens) + 1
-            if video_token_id in input_tokens and remain_videos > 0:
-                ed_video = input_tokens.index(video_token_id, st)
-            else:
-                ed_video = len(input_tokens) + 1
-            if ed_image < ed_video:
-                t, h, w = (
-                    image_grid_thw[image_index][0],
-                    image_grid_thw[image_index][1],
-                    image_grid_thw[image_index][2],
-                )
-                image_index += 1
-                remain_images -= 1
-                second_per_grid_t = 0
-                ed = ed_image
-            else:
-                t, h, w = (
-                    video_grid_thw[video_index][0],
-                    video_grid_thw[video_index][1],
-                    video_grid_thw[video_index][2],
-                )
-                if second_per_grid_ts is not None:
-                    second_per_grid_t = second_per_grid_ts[video_index]
-                else:
-                    second_per_grid_t = 1.0
-                video_index += 1
-                remain_videos -= 1
-                ed = ed_video
-            llm_grid_t, llm_grid_h, llm_grid_w = (
-                t,
-                h // spatial_merge_size,
-                w // spatial_merge_size,
-            )
-            text_len = ed - st
-
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
-            )
-
-            t_index = (
-                torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w)
-                * second_per_grid_t
-                * tokens_per_second
-            ).flatten()
-
-            h_index = (
-                torch.arange(llm_grid_h)
-                .view(1, -1, 1)
-                .expand(llm_grid_t, -1, llm_grid_w)
-                .flatten()
-            )
-            w_index = (
-                torch.arange(llm_grid_w)
-                .view(1, 1, -1)
-                .expand(llm_grid_t, llm_grid_h, -1)
-                .flatten()
-            )
-            llm_pos_ids_list.append(
-                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        second_per_grid_ts: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        mrope_position_deltas = []
+        if input_ids is not None and (
+            image_grid_thw is not None or video_grid_thw is not None
+        ):
+            total_input_ids = input_ids
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
             )
-
-
-
-
-
-
-
+            image_index, video_index = 0, 0
+            for i, input_ids in enumerate(total_input_ids):
+                image_nums, video_nums = 0, 0
+                vision_start_indices = torch.argwhere(
+                    input_ids == vision_start_token_id
+                ).squeeze(1)
+                vision_tokens = input_ids[vision_start_indices + 1]
+                image_nums = (vision_tokens == image_token_id).sum()
+                video_nums = (vision_tokens == video_token_id).sum()
+                input_tokens = input_ids.tolist()
+                llm_pos_ids_list: list = []
+                st = 0
+                remain_images, remain_videos = image_nums, video_nums
+                for _ in range(image_nums + video_nums):
+                    if image_token_id in input_tokens and remain_images > 0:
+                        ed_image = input_tokens.index(image_token_id, st)
+                    else:
+                        ed_image = len(input_tokens) + 1
+                    if video_token_id in input_tokens and remain_videos > 0:
+                        ed_video = input_tokens.index(video_token_id, st)
+                    else:
+                        ed_video = len(input_tokens) + 1
+                    if ed_image < ed_video:
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        second_per_grid_t = 0
+                        image_index += 1
+                        remain_images -= 1
+                        ed = ed_image
+                    else:
+                        t, h, w = (
+                            video_grid_thw[video_index][0],
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+                        if second_per_grid_ts is not None:
+                            second_per_grid_t = second_per_grid_ts[video_index]
+                        else:
+                            second_per_grid_t = 1.0
+                        video_index += 1
+                        remain_videos -= 1
+                        ed = ed_video
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t.item(),
+                        h.item() // spatial_merge_size,
+                        w.item() // spatial_merge_size,
+                    )
+                    text_len = ed - st
+
+                    st_idx = (
+                        llm_pos_ids_list[-1].max() + 1
+                        if len(llm_pos_ids_list) > 0
+                        else 0
+                    )
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+
+                    if model_type == "qwen2_5_vl":
+                        range_tensor = torch.arange(llm_grid_t).view(-1, 1)
+                        expanded_range = range_tensor.expand(
+                            -1, llm_grid_h * llm_grid_w
+                        )
+
+                        time_tensor = (
+                            expanded_range * second_per_grid_t * tokens_per_second
+                        )
+
+                        time_tensor_long = time_tensor.long()
+                        t_index = time_tensor_long.flatten()
+                    elif model_type == "qwen2_vl":
+                        t_index = (
+                            torch.arange(llm_grid_t)
+                            .view(-1, 1)
+                            .expand(-1, llm_grid_h * llm_grid_w)
+                            .flatten()
+                        )
+                    else:
+                        raise RuntimeError("Unimplemented")
+                    h_index = (
+                        torch.arange(llm_grid_h)
+                        .view(1, -1, 1)
+                        .expand(llm_grid_t, -1, llm_grid_w)
+                        .flatten()
+                    )
+                    w_index = (
+                        torch.arange(llm_grid_w)
+                        .view(1, 1, -1)
+                        .expand(llm_grid_t, llm_grid_h, -1)
+                        .flatten()
+                    )
+                    llm_pos_ids_list.append(
+                        torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+                    )
+                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+                if st < len(input_tokens):
+                    st_idx = (
+                        llm_pos_ids_list[-1].max() + 1
+                        if len(llm_pos_ids_list) > 0
+                        else 0
+                    )
+                    text_len = len(input_tokens) - st
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                position_ids[..., i, :] = llm_positions.to(position_ids.device)
+                mrope_position_deltas.append(
+                    llm_positions.max() + 1 - len(total_input_ids[i])
+                )
+            mrope_position_deltas = torch.tensor(
+                mrope_position_deltas, device=input_ids.device
+            ).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            s = input_ids.shape[1]
+            position_ids = torch.arange(s)
+            position_ids = (
+                position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device)
             )
-
-
-
-
-
-        return llm_positions.tolist(), mrope_position_delta
+            max_position_ids = position_ids.max(0, keepdim=False)[0].max(
+                -1, keepdim=True
+            )[0]
+            mrope_position_deltas = max_position_ids + 1 - s
+            return position_ids, mrope_position_deltas
 
     @staticmethod
     def get_next_input_positions(
         mrope_position_delta: int,
         context_len: int,
         seq_len: int,
-    ) ->
-        return
-
-
-
+    ) -> torch.Tensor:
+        return torch.tensor(
+            [
+                list(
+                    range(
+                        context_len + mrope_position_delta,
+                        seq_len + mrope_position_delta,
+                    )
                 )
-
-
-
+                for _ in range(3)
+            ]
+        )
 
 
 _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
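Note: `get_rope_index` now mirrors the Hugging Face Qwen2-VL reference: it takes batched `input_ids` plus image/video grid shapes and returns 3xBxL position ids together with per-sequence mrope deltas, with `model_type` ("qwen2_vl" vs "qwen2_5_vl") selecting how the temporal index is scaled. The following self-contained snippet only illustrates the 3-D (t, h, w) position layout the method builds for one text-then-image sequence; the grid sizes are invented and nothing from sglang is imported:

import torch

# Tiny illustration of the (t, h, w) position layout for one vision block.
llm_grid_t, llm_grid_h, llm_grid_w = 1, 2, 2   # grid after spatial merge (made up)
text_len, st_idx = 3, 0                        # 3 text tokens precede the image

text_pos = torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
vision_pos = torch.stack([t_index, h_index, w_index]) + text_len + st_idx

positions = torch.cat([text_pos, vision_pos], dim=1)
print(positions)
# tensor([[0, 1, 2, 3, 3, 3, 3],   # temporal
#         [0, 1, 2, 3, 3, 4, 4],   # height
#         [0, 1, 2, 3, 4, 3, 4]])  # width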
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -23,11 +23,13 @@ import psutil
 import setproctitle
 import zmq
 
+from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.managers.io_struct import (
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
 )
+from sglang.srt.managers.schedule_batch import Req
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
@@ -226,9 +228,14 @@ class DataParallelController:
         self.max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]
         self.max_req_input_len = scheduler_info[0]["max_req_input_len"]
 
-    def round_robin_scheduler(self, req):
-        self.
-
+    def round_robin_scheduler(self, req: Req):
+        if self.server_args.disaggregation_mode == "null":
+            self.workers[self.round_robin_counter].send_pyobj(req)
+            self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                self.workers
+            )
+        else:
+            self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
 
     def shortest_queue_scheduler(self, input_requests):
         raise NotImplementedError()
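Note: `round_robin_scheduler` now routes by `bootstrap_room` whenever prefill/decode disaggregation is enabled, so the prefill and decode halves of a request land on the same data-parallel worker; plain round-robin is kept for normal serving. A toy sketch of that dispatch rule (the request type is a stand-in, not the real class):

from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeReq:  # stand-in for the real scheduler request object
    bootstrap_room: Optional[int] = None


def pick_worker(
    req: FakeReq, num_workers: int, rr_counter: int, disaggregation_mode: str
) -> tuple[int, int]:
    """Return (worker_index, next_round_robin_counter)."""
    if disaggregation_mode == "null":
        # normal serving: rotate through the workers
        return rr_counter, (rr_counter + 1) % num_workers
    # disaggregated serving: hash the bootstrap room so the prefill and decode
    # instances of the same request agree on the worker
    return req.bootstrap_room % num_workers, rr_counter


assert pick_worker(FakeReq(bootstrap_room=7), 4, 0, "prefill")[0] == 3
assert pick_worker(FakeReq(), 4, 2, "null") == (2, 3)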
sglang/srt/managers/io_struct.py
CHANGED
@@ -97,6 +97,7 @@ class GenerateReqInput:
 
     # For disaggregated inference
     bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[int], int]] = None
     bootstrap_room: Optional[Union[List[int], int]] = None
 
     def normalize_batch_and_arguments(self):
@@ -400,6 +401,9 @@ class GenerateReqInput:
             bootstrap_host=(
                 self.bootstrap_host[i] if self.bootstrap_host is not None else None
             ),
+            bootstrap_port=(
+                self.bootstrap_port[i] if self.bootstrap_port is not None else None
+            ),
             bootstrap_room=(
                 self.bootstrap_room[i] if self.bootstrap_room is not None else None
             ),
@@ -447,6 +451,7 @@ class TokenizedGenerateReqInput:
 
     # For disaggregated inference
     bootstrap_host: Optional[str] = None
+    bootstrap_port: Optional[int] = None
    bootstrap_room: Optional[int] = None
 
 
@@ -463,6 +468,8 @@ class EmbeddingReqInput:
     image_data: Optional[
         Union[List[List[Union[Image, str]]], List[Union[Image, str]], Union[Image, str]]
     ] = None
+    # The audio input. Like image data, it can be a file name, a url, or base64 encoded string.
+    audio_data: Optional[Union[List[str], str]] = None
     # The token ids for text; one can either specify text or input_ids.
     input_ids: Optional[Union[List[List[int]], List[int]]] = None
     # The request id.
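Note: `bootstrap_port` is added next to `bootstrap_host` and `bootstrap_room` on both `GenerateReqInput` and `TokenizedGenerateReqInput`, and is split per item when a batched request is normalized. A hypothetical batched payload showing how the three fields line up (hosts, ports, and room ids below are invented):

# Hypothetical request body illustrating the new bootstrap_port field.
batched_generate_req = {
    "text": ["What is KV-cache transfer?", "Summarize disaggregation."],
    # one entry per request in the batch, mirroring bootstrap_host/bootstrap_room
    "bootstrap_host": ["10.0.0.1", "10.0.0.2"],
    "bootstrap_port": [8998, 8998],
    "bootstrap_room": [12345, 67890],
}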
sglang/srt/managers/mm_utils.py
CHANGED
@@ -10,12 +10,13 @@ import torch
 from torch import nn
 
 from sglang.srt.managers.schedule_batch import (
+    Modality,
     MultimodalDataItem,
     MultimodalInputs,
     global_server_args_dict,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.utils import print_warning_once
+from sglang.srt.utils import flatten_nested_list, print_warning_once
 
 logger = logging.getLogger(__name__)
 
@@ -97,31 +98,80 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
         return padded_ids
 
 
-class
+class MultiModalityDataPaddingPatternMultimodalTokens(MultiModalityDataPaddingPattern):
     """In this pattern, data tokens should be represented as repetitions of a single token
     e.g. <image><image>....<image>, or <audio><audio>...<audio>
     """
 
-    def __init__(self,
-        self.
+    def __init__(self, token_ids: List[int]) -> None:
+        self.token_ids = token_ids
 
-    def pad_input_tokens(
+    def pad_input_tokens(
+        self, input_ids: List[int], mm_inputs: MultimodalInputs
+    ) -> List[int]:
         """
-
+        Finds contiguous regions of tokens matching `self.token_ids` in `input_ids`
+        and replaces each region with the corresponding `pad_value` from `mm_inputs.mm_items`.
         """
         pad_values = [item.pad_value for item in mm_inputs.mm_items]
-
+        if not pad_values:
+            # No multimodal items, return original input_ids
+            return input_ids
+        if not input_ids:
+            return []
 
         input_ids_tensor = torch.tensor(input_ids)
-
+        device = input_ids_tensor.device
+        token_ids_tensor = torch.tensor(self.token_ids, device=device)
+        mask = torch.isin(input_ids_tensor, token_ids_tensor)
 
-
-
-
-
+        if not mask.any():
+            # No tokens match token_ids, return original input_ids
+            return input_ids
+
+        # Find contiguous regions
+        padded_mask = torch.cat(
+            (
+                torch.tensor([False], device=device),
+                mask,
+                torch.tensor([False], device=device),
+            )
+        )
+        # Find indices where the mask value changes
+        diff_indices = torch.where(padded_mask[1:] != padded_mask[:-1])[0]
+
+        # Start indices are where False changes to True
+        starts = diff_indices[::2]
+        # End indices are where True changes to False (exclusive index)
+        ends = diff_indices[1::2]
+
+        # Check if the number of regions matches the number of pad values
+        if len(starts) != len(pad_values):
+            # Maybe log a warning here?
+            num_regions = len(starts)
+            num_pad_values = len(pad_values)
+            if num_regions > 0 and num_pad_values > 0:
+                pad_values = (pad_values * (num_regions // num_pad_values + 1))[
+                    :num_regions
+                ]
+            else:  # If no regions or no pad_values, this loop won't run anyway.
+                pad_values = []  # Ensure pad_values is empty if starts is empty
+
+        # Create a copy to modify
+        output_ids_tensor = input_ids_tensor.clone()
+
+        # Replace tokens in each region with the corresponding pad value
+        # Ensure we don't iterate if pad_values became empty due to mismatch and num_regions=0
+        for i in range(min(len(starts), len(pad_values))):
+            start_idx = starts[i]
+            end_idx = ends[i]
+            pad_value = pad_values[i]
+            if pad_value is not None:  # Ensure pad_value is not None before assignment
+                output_ids_tensor[start_idx:end_idx] = pad_value
+            else:
+                logger.warning(f"Skipping region {i} due to None pad_value.")
 
-
-        return input_ids_tensor.tolist()
+        return output_ids_tensor.tolist()
 
 
 def get_embedding_and_mask(
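Note: the new `MultiModalityDataPaddingPatternMultimodalTokens.pad_input_tokens` locates runs of multimodal placeholder tokens with a mask-diff trick: pad the boolean membership mask with `False` on both ends, and region boundaries are exactly the indices where consecutive mask values differ. A small standalone demonstration of the same trick (token ids and pad values are arbitrary):

import torch

# Toy input: 101/102 are placeholder tokens, everything else is text.
input_ids = torch.tensor([5, 101, 101, 101, 7, 8, 102, 102, 9])
mm_token_ids = torch.tensor([101, 102])
pad_values = [90001, 90002]  # one pad value per multimodal item (made-up ids)

mask = torch.isin(input_ids, mm_token_ids)
padded_mask = torch.cat((torch.tensor([False]), mask, torch.tensor([False])))
diff_indices = torch.where(padded_mask[1:] != padded_mask[:-1])[0]
starts, ends = diff_indices[::2], diff_indices[1::2]  # [1, 6] and [4, 8]

out = input_ids.clone()
for s, e, v in zip(starts, ends, pad_values):
    out[s:e] = v

print(out.tolist())  # [5, 90001, 90001, 90001, 7, 8, 90002, 90002, 9]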
@@ -150,7 +200,6 @@ def get_embedding_and_mask(
     ).unsqueeze(-1)
 
     num_mm_tokens_in_input_ids = special_multimodal_mask.sum().item()
-
     if num_mm_tokens_in_input_ids != num_mm_tokens_in_embedding:
         logger.warning(
             f"Number of tokens in multimodal embedding does not match those in the input text."
@@ -190,13 +239,13 @@ def embed_mm_inputs(
     audio_data_embedding_func: Callable[
         [List[MultimodalDataItem]], torch.Tensor
     ] = None,
-
+    placeholder_tokens: dict[Modality, List[int]] = None,
 ) -> Optional[torch.Tensor]:
     """
     Calculate the multimodal embeddings if necessary, then scatter the result with the help of a boolean mask denoting the embed locations
 
     Args:
-
+        placeholder_tokens: denoting the token of multimodal data in input_ids.
             If none, the pad_values of multimodal items are used
 
     Returns:
@@ -208,9 +257,17 @@ def embed_mm_inputs(
 
     # 1. Calculate the multimodal data which exists in input_ids, with the help of pad_values
     # we assume that multimodal data are represented with its pad_values in input_ids
-
-
-
+    # See `pad_input_ids` for more detail
+
+    # if placeholder_tokens is specified
+    if placeholder_tokens is not None:
+        placeholder_token_ids = flatten_nested_list(
+            [placeholder_token for placeholder_token in placeholder_tokens.values()]
+        )
+    else:
+        placeholder_token_ids = [item.pad_value for item in mm_inputs.mm_items]
+
+    assert isinstance(placeholder_token_ids[0], int)
 
     placeholder_tensor = torch.tensor(placeholder_token_ids, device=input_ids.device)
 
@@ -233,7 +290,7 @@ def embed_mm_inputs(
     using_all_items = False
     if len(appearing_items) == 0:
         # This happens mostly when arg placeholder_token_ids is passed
-        logger.
+        logger.warning(
            "No multimodal data item's pad value exist in placeholder ids. Using all items"
        )
        using_all_items = True
@@ -253,7 +310,8 @@ def embed_mm_inputs(
             data_embedding_func=image_data_embedding_func,
             embedding_items=items,
             placeholder_tensor=(
-
+                # use the specified modality token to identify the location to embed
+                placeholder_tokens[Modality.IMAGE]
                 if using_all_items
                 else torch.tensor(
                     [item.pad_value for item in items],
@@ -275,7 +333,7 @@ def embed_mm_inputs(
             data_embedding_func=audio_data_embedding_func,
             embedding_items=items,
             placeholder_tensor=(
-
+                placeholder_tokens[Modality.AUDIO]
                 if using_all_items
                 else torch.tensor(
                     [item.pad_value for item in items],
@@ -296,7 +354,7 @@ def embed_mm_inputs(
         input_ids.clamp_(min=0, max=vocab_size - 1)
         inputs_embeds = input_embedding(input_ids)
 
-    # 4.
+    # 4. Scatter embeddings into input embedding
     for embedding, mask in zip(embeddings, masks):
         mask = mask.expand_as(inputs_embeds).to(inputs_embeds.device)
         inputs_embeds = inputs_embeds.masked_scatter(
@@ -316,7 +374,7 @@ def general_mm_embed_routine(
     audio_data_embedding_func: Callable[
         [List[MultimodalDataItem]], torch.Tensor
     ] = None,
-
+    placeholder_tokens: dict[Modality, List[int]] = None,
     **kwargs,
 ) -> torch.Tensor:
     """
@@ -328,7 +386,6 @@ def general_mm_embed_routine(
         audio_data_embedding_func : the function returning the image embedding
 
     Returns:
-        inputs_embedding
         forwarded hidden states
 
     """
@@ -346,9 +403,9 @@ def general_mm_embed_routine(
             input_embedding=embed_tokens,
             image_data_embedding_func=image_data_embedding_func,
             audio_data_embedding_func=audio_data_embedding_func,
-
+            placeholder_tokens=placeholder_tokens,
         )
-        # once used, mm_inputs is useless
+        # once used, mm_inputs is useless, considering chunked-prefill is disabled for multimodal models
         # just being defensive here
         forward_batch.mm_inputs = None
     else:
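Note: `embed_mm_inputs` and `general_mm_embed_routine` now accept an optional `placeholder_tokens` mapping from modality to the token ids used as placeholders in `input_ids`, falling back to the items' pad values when it is omitted. A minimal sketch of that resolution step, with a stand-in `Modality` enum and invented token ids:

from enum import Enum, auto
from typing import Dict, List, Optional


class Modality(Enum):  # stand-in for sglang.srt.managers.schedule_batch.Modality
    IMAGE = auto()
    AUDIO = auto()


def resolve_placeholder_token_ids(
    placeholder_tokens: Optional[Dict[Modality, List[int]]],
    item_pad_values: List[int],
) -> List[int]:
    if placeholder_tokens is not None:
        # flatten {IMAGE: [...], AUDIO: [...]} into a single id list
        return [tok for toks in placeholder_tokens.values() for tok in toks]
    # otherwise fall back to the per-item pad values, as before this change
    return item_pad_values


assert resolve_placeholder_token_ids({Modality.IMAGE: [151655]}, [42]) == [151655]
assert resolve_placeholder_token_ids(None, [42, 43]) == [42, 43]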
sglang/srt/managers/multimodal_processors/base_processor.py
CHANGED
@@ -8,6 +8,7 @@ from typing import List, Optional
 
 import numpy as np
 import PIL
+from PIL import Image
 from transformers import BaseImageProcessorFast
 
 from sglang.srt.managers.schedule_batch import Modality
@@ -92,7 +93,12 @@ class BaseMultimodalProcessor(ABC):
 
     @abstractmethod
     async def process_mm_data_async(
-        self,
+        self,
+        image_data,
+        input_text,
+        request_obj,
+        max_req_input_len,
+        **kwargs,
     ):
         pass
 
@@ -104,6 +110,8 @@ class BaseMultimodalProcessor(ABC):
         from decord import VideoReader, cpu
 
         # Before processing inputs
+        if not image_data or len(image_data) == 0:
+            return []
         estimated_frames_list = []
         for image in image_data:
             if isinstance(image, str) and image.startswith("video:"):
@@ -215,6 +223,9 @@ class BaseMultimodalProcessor(ABC):
             discard_alpha_channel: if True, discards the alpha channel in the returned images
 
         """
+
+        if image_data is None:
+            image_data = []
         if isinstance(multimodal_tokens.image_token, int):
             multimodal_tokens.image_token = (
                 self._processor.tokenizer.convert_ids_to_tokens(
@@ -229,6 +240,8 @@ class BaseMultimodalProcessor(ABC):
             prompt = self._processor.tokenizer.decode(prompt)
         else:
             prompt = prompt
+
+        assert isinstance(prompt, str)
         if return_text:
             import re
 
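Note: the base processor hunks above make image-less requests safe: `None` image data is normalized to an empty list and video-frame estimation returns early when there is nothing to process. A trivial sketch of the guard pattern (the helper name is illustrative):

from typing import List, Optional, Union


def normalize_image_data(image_data: Optional[Union[str, List[str]]]) -> List[str]:
    """Mirror of the new guards: treat None/empty input as 'no multimodal data'."""
    if image_data is None:
        image_data = []
    if not image_data or len(image_data) == 0:
        return []
    return image_data if isinstance(image_data, list) else [image_data]


assert normalize_image_data(None) == []
assert normalize_image_data([]) == []
assert normalize_image_data("cat.png") == ["cat.png"]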