sglang 0.4.5.post2__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff shows the changes between two publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (99)
  1. sglang/bench_one_batch.py +19 -3
  2. sglang/bench_serving.py +8 -8
  3. sglang/compile_deep_gemm.py +177 -0
  4. sglang/lang/backend/openai.py +5 -1
  5. sglang/lang/backend/runtime_endpoint.py +5 -1
  6. sglang/srt/code_completion_parser.py +1 -1
  7. sglang/srt/configs/deepseekvl2.py +1 -1
  8. sglang/srt/configs/model_config.py +11 -2
  9. sglang/srt/constrained/llguidance_backend.py +78 -61
  10. sglang/srt/constrained/xgrammar_backend.py +1 -0
  11. sglang/srt/conversation.py +34 -1
  12. sglang/srt/disaggregation/decode.py +96 -5
  13. sglang/srt/disaggregation/mini_lb.py +113 -15
  14. sglang/srt/disaggregation/mooncake/conn.py +199 -32
  15. sglang/srt/disaggregation/nixl/__init__.py +1 -0
  16. sglang/srt/disaggregation/nixl/conn.py +622 -0
  17. sglang/srt/disaggregation/prefill.py +119 -20
  18. sglang/srt/disaggregation/utils.py +17 -0
  19. sglang/srt/entrypoints/engine.py +4 -0
  20. sglang/srt/entrypoints/http_server.py +11 -9
  21. sglang/srt/function_call_parser.py +132 -0
  22. sglang/srt/layers/activation.py +2 -2
  23. sglang/srt/layers/attention/base_attn_backend.py +3 -0
  24. sglang/srt/layers/attention/flashattention_backend.py +809 -160
  25. sglang/srt/layers/attention/flashmla_backend.py +8 -11
  26. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
  27. sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -5
  28. sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
  29. sglang/srt/layers/attention/vision.py +2 -0
  30. sglang/srt/layers/dp_attention.py +1 -1
  31. sglang/srt/layers/layernorm.py +42 -5
  32. sglang/srt/layers/logits_processor.py +2 -2
  33. sglang/srt/layers/moe/ep_moe/layer.py +2 -0
  34. sglang/srt/layers/moe/fused_moe_native.py +2 -4
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -15
  38. sglang/srt/layers/pooler.py +6 -0
  39. sglang/srt/layers/quantization/awq.py +5 -1
  40. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  41. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  42. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
  43. sglang/srt/layers/quantization/deep_gemm.py +385 -0
  44. sglang/srt/layers/quantization/fp8_kernel.py +7 -38
  45. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  46. sglang/srt/layers/quantization/gptq.py +13 -7
  47. sglang/srt/layers/quantization/int8_kernel.py +32 -1
  48. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  49. sglang/srt/layers/quantization/w8a8_int8.py +3 -3
  50. sglang/srt/layers/radix_attention.py +13 -3
  51. sglang/srt/layers/rotary_embedding.py +176 -132
  52. sglang/srt/layers/sampler.py +2 -2
  53. sglang/srt/managers/data_parallel_controller.py +17 -4
  54. sglang/srt/managers/io_struct.py +21 -3
  55. sglang/srt/managers/mm_utils.py +85 -28
  56. sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
  57. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
  58. sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
  59. sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
  60. sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
  61. sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
  62. sglang/srt/managers/schedule_batch.py +42 -12
  63. sglang/srt/managers/scheduler.py +47 -26
  64. sglang/srt/managers/tokenizer_manager.py +120 -30
  65. sglang/srt/managers/tp_worker.py +1 -0
  66. sglang/srt/mem_cache/hiradix_cache.py +40 -32
  67. sglang/srt/mem_cache/memory_pool.py +118 -13
  68. sglang/srt/model_executor/cuda_graph_runner.py +16 -10
  69. sglang/srt/model_executor/forward_batch_info.py +51 -95
  70. sglang/srt/model_executor/model_runner.py +29 -27
  71. sglang/srt/models/deepseek.py +12 -2
  72. sglang/srt/models/deepseek_nextn.py +101 -6
  73. sglang/srt/models/deepseek_v2.py +153 -76
  74. sglang/srt/models/deepseek_vl2.py +9 -4
  75. sglang/srt/models/gemma3_causal.py +1 -1
  76. sglang/srt/models/llama4.py +0 -1
  77. sglang/srt/models/minicpm3.py +2 -2
  78. sglang/srt/models/minicpmo.py +22 -7
  79. sglang/srt/models/mllama4.py +2 -2
  80. sglang/srt/models/qwen2_5_vl.py +3 -6
  81. sglang/srt/models/qwen2_vl.py +3 -7
  82. sglang/srt/models/roberta.py +178 -0
  83. sglang/srt/openai_api/adapter.py +87 -10
  84. sglang/srt/openai_api/protocol.py +6 -1
  85. sglang/srt/server_args.py +65 -60
  86. sglang/srt/speculative/build_eagle_tree.py +2 -2
  87. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  88. sglang/srt/speculative/eagle_utils.py +2 -2
  89. sglang/srt/speculative/eagle_worker.py +2 -7
  90. sglang/srt/torch_memory_saver_adapter.py +10 -1
  91. sglang/srt/utils.py +48 -6
  92. sglang/test/runners.py +6 -13
  93. sglang/test/test_utils.py +39 -19
  94. sglang/version.py +1 -1
  95. {sglang-0.4.5.post2.dist-info → sglang-0.4.6.dist-info}/METADATA +6 -7
  96. {sglang-0.4.5.post2.dist-info → sglang-0.4.6.dist-info}/RECORD +99 -92
  97. {sglang-0.4.5.post2.dist-info → sglang-0.4.6.dist-info}/WHEEL +1 -1
  98. {sglang-0.4.5.post2.dist-info → sglang-0.4.6.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.5.post2.dist-info → sglang-0.4.6.dist-info}/top_level.txt +0 -0
sglang/srt/managers/data_parallel_controller.py

@@ -23,13 +23,16 @@ import psutil
 import setproctitle
 import zmq
 
+from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.managers.io_struct import (
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
 )
+from sglang.srt.managers.schedule_batch import Req
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket
 from sglang.utils import get_exception_traceback
 
@@ -174,6 +177,10 @@ class DataParallelController:
         if not server_args.enable_dp_attention:
             logger.info(f"Launch DP{dp_rank} starting at GPU #{base_gpu_id}.")
 
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=server_args.enable_memory_saver
+        )
+
         # Launch tensor parallel scheduler processes
         scheduler_pipe_readers = []
         tp_size_per_node = server_args.tp_size // server_args.nnodes
@@ -208,7 +215,8 @@ class DataParallelController:
                 target=run_scheduler_process,
                 args=(server_args, rank_port_args, gpu_id, tp_rank, dp_rank, writer),
             )
-            proc.start()
+            with memory_saver_adapter.configure_subprocess():
+                proc.start()
             self.scheduler_procs.append(proc)
             scheduler_pipe_readers.append(reader)
 
@@ -220,9 +228,14 @@ class DataParallelController:
         self.max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]
         self.max_req_input_len = scheduler_info[0]["max_req_input_len"]
 
-    def round_robin_scheduler(self, req):
-        self.workers[self.round_robin_counter].send_pyobj(req)
-        self.round_robin_counter = (self.round_robin_counter + 1) % len(self.workers)
+    def round_robin_scheduler(self, req: Req):
+        if self.server_args.disaggregation_mode == "null":
+            self.workers[self.round_robin_counter].send_pyobj(req)
+            self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                self.workers
+            )
+        else:
+            self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
 
     def shortest_queue_scheduler(self, input_requests):
         raise NotImplementedError()
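
The reworked round_robin_scheduler keeps plain round-robin dispatch when disaggregation is off (disaggregation_mode == "null") and otherwise pins a request to the worker indexed by bootstrap_room % len(workers), so every message carrying the same bootstrap room reaches the same scheduler. A minimal sketch of that decision with stand-in worker names instead of the real ZMQ sockets (hypothetical helper, not sglang code):

    # Sketch only: how a request is routed under the two modes.
    workers = ["worker-0", "worker-1", "worker-2", "worker-3"]

    def pick_worker(disaggregation_mode: str, bootstrap_room: int, rr_counter: int):
        if disaggregation_mode == "null":
            # normal serving: rotate through the workers
            return workers[rr_counter % len(workers)], rr_counter + 1
        # disaggregated serving: the same bootstrap_room always maps to the same worker
        return workers[bootstrap_room % len(workers)], rr_counter

    worker, rr_counter = pick_worker("decode", bootstrap_room=12345, rr_counter=0)
    # 12345 % 4 == 1, so this request goes to "worker-1" regardless of the counter
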

sglang/srt/managers/io_struct.py

@@ -96,8 +96,9 @@ class GenerateReqInput:
     return_hidden_states: bool = False
 
     # For disaggregated inference
-    bootstrap_host: Optional[str] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[int], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
 
     def normalize_batch_and_arguments(self):
         """
@@ -397,6 +398,15 @@ class GenerateReqInput:
                 else None
             ),
             return_hidden_states=self.return_hidden_states,
+            bootstrap_host=(
+                self.bootstrap_host[i] if self.bootstrap_host is not None else None
+            ),
+            bootstrap_port=(
+                self.bootstrap_port[i] if self.bootstrap_port is not None else None
+            ),
+            bootstrap_room=(
+                self.bootstrap_room[i] if self.bootstrap_room is not None else None
+            ),
         )
 
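
Because bootstrap_host, bootstrap_port, and bootstrap_room may now be per-request lists, splitting a batched GenerateReqInput hands element i of each list to the i-th single request, mirroring the slicing above. A rough sketch with a stripped-down stand-in dataclass (not the real GenerateReqInput):

    from dataclasses import dataclass
    from typing import List, Optional, Union

    @dataclass
    class BatchReq:  # stand-in for the per-field slicing in GenerateReqInput
        text: List[str]
        bootstrap_room: Optional[Union[List[int], int]] = None

        def split(self):
            return [
                {
                    "text": self.text[i],
                    "bootstrap_room": (
                        self.bootstrap_room[i] if self.bootstrap_room is not None else None
                    ),
                }
                for i in range(len(self.text))
            ]

    print(BatchReq(text=["a", "b"], bootstrap_room=[101, 202]).split())
    # [{'text': 'a', 'bootstrap_room': 101}, {'text': 'b', 'bootstrap_room': 202}]
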
@@ -441,6 +451,7 @@ class TokenizedGenerateReqInput:
 
     # For disaggregated inference
     bootstrap_host: Optional[str] = None
+    bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None
 
 
@@ -457,6 +468,8 @@ class EmbeddingReqInput:
     image_data: Optional[
         Union[List[List[Union[Image, str]]], List[Union[Image, str]], Union[Image, str]]
     ] = None
+    # The audio input. Like image data, it can be a file name, a url, or base64 encoded string.
+    audio_data: Optional[Union[List[str], str]] = None
     # The token ids for text; one can either specify text or input_ids.
     input_ids: Optional[Union[List[List[int]], List[int]]] = None
     # The request id.
@@ -665,10 +678,15 @@ class BatchEmbeddingOut:
 
 
 @dataclass
-class FlushCacheReq:
+class FlushCacheReqInput:
     pass
 
 
+@dataclass
+class FlushCacheReqOutput:
+    success: bool
+
+
 @dataclass
 class UpdateWeightFromDiskReqInput:
     # The model path with the new weights

sglang/srt/managers/mm_utils.py

@@ -10,12 +10,13 @@ import torch
 from torch import nn
 
 from sglang.srt.managers.schedule_batch import (
+    Modality,
     MultimodalDataItem,
     MultimodalInputs,
     global_server_args_dict,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.utils import print_warning_once
+from sglang.srt.utils import flatten_nested_list, print_warning_once
 
 logger = logging.getLogger(__name__)
 
@@ -97,31 +98,80 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
         return padded_ids
 
 
-class MultiModalityDataPaddingPatternImageTokens(MultiModalityDataPaddingPattern):
+class MultiModalityDataPaddingPatternMultimodalTokens(MultiModalityDataPaddingPattern):
     """In this pattern, data tokens should be represented as repetitions of a single token
     e.g. <image><image>....<image>, or <audio><audio>...<audio>
     """
 
-    def __init__(self, image_token_id: torch.Tensor) -> None:
-        self.image_token_id = image_token_id
+    def __init__(self, token_ids: List[int]) -> None:
+        self.token_ids = token_ids
 
-    def pad_input_tokens(self, input_ids: List[int], mm_inputs) -> List[int]:
+    def pad_input_tokens(
+        self, input_ids: List[int], mm_inputs: MultimodalInputs
+    ) -> List[int]:
         """
-        This function will replace the data-tokens in between with pad_values accordingly
+        Finds contiguous regions of tokens matching `self.token_ids` in `input_ids`
+        and replaces each region with the corresponding `pad_value` from `mm_inputs.mm_items`.
         """
         pad_values = [item.pad_value for item in mm_inputs.mm_items]
-        assert len(pad_values) != 0
+        if not pad_values:
+            # No multimodal items, return original input_ids
+            return input_ids
+        if not input_ids:
+            return []
 
         input_ids_tensor = torch.tensor(input_ids)
-        mask = torch.isin(input_ids_tensor, self.image_token_id)
+        device = input_ids_tensor.device
+        token_ids_tensor = torch.tensor(self.token_ids, device=device)
+        mask = torch.isin(input_ids_tensor, token_ids_tensor)
 
-        num_image_tokens = mask.sum().item()
-        repeated_pad_values = torch.tensor(pad_values).repeat(
-            num_image_tokens // len(pad_values) + 1
-        )[:num_image_tokens]
+        if not mask.any():
+            # No tokens match token_ids, return original input_ids
+            return input_ids
+
+        # Find contiguous regions
+        padded_mask = torch.cat(
+            (
+                torch.tensor([False], device=device),
+                mask,
+                torch.tensor([False], device=device),
+            )
+        )
+        # Find indices where the mask value changes
+        diff_indices = torch.where(padded_mask[1:] != padded_mask[:-1])[0]
+
+        # Start indices are where False changes to True
+        starts = diff_indices[::2]
+        # End indices are where True changes to False (exclusive index)
+        ends = diff_indices[1::2]
+
+        # Check if the number of regions matches the number of pad values
+        if len(starts) != len(pad_values):
+            # Maybe log a warning here?
+            num_regions = len(starts)
+            num_pad_values = len(pad_values)
+            if num_regions > 0 and num_pad_values > 0:
+                pad_values = (pad_values * (num_regions // num_pad_values + 1))[
+                    :num_regions
+                ]
+            else:  # If no regions or no pad_values, this loop won't run anyway.
+                pad_values = []  # Ensure pad_values is empty if starts is empty
+
+        # Create a copy to modify
+        output_ids_tensor = input_ids_tensor.clone()
+
+        # Replace tokens in each region with the corresponding pad value
+        # Ensure we don't iterate if pad_values became empty due to mismatch and num_regions=0
+        for i in range(min(len(starts), len(pad_values))):
+            start_idx = starts[i]
+            end_idx = ends[i]
+            pad_value = pad_values[i]
+            if pad_value is not None:  # Ensure pad_value is not None before assignment
+                output_ids_tensor[start_idx:end_idx] = pad_value
+            else:
+                logger.warning(f"Skipping region {i} due to None pad_value.")
 
-        input_ids_tensor[mask] = repeated_pad_values
-        return input_ids_tensor.tolist()
+        return output_ids_tensor.tolist()
 
 
 def get_embedding_and_mask(
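
A self-contained illustration of the region-finding trick used in the new pad_input_tokens (pure torch, toy token ids, not sglang code): padding the boolean mask with False on both ends turns every run of multimodal tokens into one False-to-True and one True-to-False transition, so the even/odd change indices give each region's start and end.

    import torch

    input_ids = [5, 5, 9, 7, 7, 7, 9, 5]      # pretend 5 = <image>, 7 = <audio>, 9 = text
    mm_token_ids = torch.tensor([5, 7])
    pad_values = [1001, 1002, 1003]            # one pad value per multimodal item

    ids = torch.tensor(input_ids)
    mask = torch.isin(ids, mm_token_ids)
    padded = torch.cat((torch.tensor([False]), mask, torch.tensor([False])))
    changes = torch.where(padded[1:] != padded[:-1])[0]
    starts, ends = changes[::2], changes[1::2]  # starts = [0, 3, 7], ends = [2, 6, 8]

    out = ids.clone()
    for s, e, v in zip(starts, ends, pad_values):
        out[s:e] = v                            # overwrite each region with its pad value
    print(out.tolist())  # [1001, 1001, 9, 1002, 1002, 1002, 9, 1003]
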
@@ -150,7 +200,6 @@ def get_embedding_and_mask(
     ).unsqueeze(-1)
 
     num_mm_tokens_in_input_ids = special_multimodal_mask.sum().item()
-
     if num_mm_tokens_in_input_ids != num_mm_tokens_in_embedding:
         logger.warning(
             f"Number of tokens in multimodal embedding does not match those in the input text."
@@ -190,13 +239,13 @@ def embed_mm_inputs(
     audio_data_embedding_func: Callable[
         [List[MultimodalDataItem]], torch.Tensor
     ] = None,
-    placeholder_token_ids: List[int] = None,
+    placeholder_tokens: dict[Modality, List[int]] = None,
 ) -> Optional[torch.Tensor]:
     """
     Calculate the multimodal embeddings if necessary, then scatter the result with the help of a boolean mask denoting the embed locations
 
     Args:
-        placeholder_token_ids: denoting the token of multimodal data in input_ids.
+        placeholder_tokens: denoting the token of multimodal data in input_ids.
             If none, the pad_values of multimodal items are used
 
     Returns:
@@ -208,9 +257,17 @@ def embed_mm_inputs(
 
     # 1. Calculate the multimodal data which exists in input_ids, with the help of pad_values
     # we assume that multimodal data are represented with its pad_values in input_ids
-    placeholder_token_ids = placeholder_token_ids or [
-        item.pad_value for item in mm_inputs.mm_items
-    ]
+    # See `pad_input_ids` for more detail
+
+    # if placeholder_tokens is specified
+    if placeholder_tokens is not None:
+        placeholder_token_ids = flatten_nested_list(
+            [placeholder_token for placeholder_token in placeholder_tokens.values()]
+        )
+    else:
+        placeholder_token_ids = [item.pad_value for item in mm_inputs.mm_items]
+
+    assert isinstance(placeholder_token_ids[0], int)
 
     placeholder_tensor = torch.tensor(placeholder_token_ids, device=input_ids.device)
 
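
Here placeholder_tokens maps a modality to the token ids that mark its slots, and flattening the dict values yields the id list matched against input_ids. A hedged sketch of that call site (flatten_nested_list is sglang's own helper; the one-level flatten and the token ids below are assumptions for illustration only):

    def flatten_one_level(nested):
        # assumed behavior of flatten_nested_list for this call site
        return [x for sub in nested for x in sub]

    placeholder_tokens = {
        "IMAGE": [151655],   # stand-ins for Modality.IMAGE / Modality.AUDIO entries
        "AUDIO": [151646],
    }
    placeholder_token_ids = flatten_one_level(list(placeholder_tokens.values()))
    print(placeholder_token_ids)  # [151655, 151646]
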
@@ -233,7 +290,7 @@ def embed_mm_inputs(
     using_all_items = False
     if len(appearing_items) == 0:
         # This happens mostly when arg placeholder_token_ids is passed
-        logger.warning_once(
+        logger.warning(
             "No multimodal data item's pad value exist in placeholder ids. Using all items"
         )
         using_all_items = True
@@ -253,7 +310,8 @@ def embed_mm_inputs(
             data_embedding_func=image_data_embedding_func,
             embedding_items=items,
             placeholder_tensor=(
-                placeholder_tensor
+                # use the specified modality token to identify the location to embed
+                placeholder_tokens[Modality.IMAGE]
                 if using_all_items
                 else torch.tensor(
                     [item.pad_value for item in items],
@@ -275,7 +333,7 @@ def embed_mm_inputs(
             data_embedding_func=audio_data_embedding_func,
             embedding_items=items,
             placeholder_tensor=(
-                placeholder_tensor
+                placeholder_tokens[Modality.AUDIO]
                 if using_all_items
                 else torch.tensor(
                     [item.pad_value for item in items],
@@ -296,7 +354,7 @@ def embed_mm_inputs(
     input_ids.clamp_(min=0, max=vocab_size - 1)
    inputs_embeds = input_embedding(input_ids)
 
-    # 4. scatter embeddings into input embedding
+    # 4. Scatter embeddings into input embedding
     for embedding, mask in zip(embeddings, masks):
         mask = mask.expand_as(inputs_embeds).to(inputs_embeds.device)
         inputs_embeds = inputs_embeds.masked_scatter(
@@ -316,7 +374,7 @@ def general_mm_embed_routine(
     audio_data_embedding_func: Callable[
         [List[MultimodalDataItem]], torch.Tensor
     ] = None,
-    placeholder_token_ids: List[int] = None,
+    placeholder_tokens: dict[Modality, List[int]] = None,
     **kwargs,
 ) -> torch.Tensor:
     """
@@ -328,7 +386,6 @@ def general_mm_embed_routine(
         audio_data_embedding_func : the function returning the image embedding
 
     Returns:
-        inputs_embedding
         forwarded hidden states
 
     """
@@ -346,9 +403,9 @@ def general_mm_embed_routine(
             input_embedding=embed_tokens,
             image_data_embedding_func=image_data_embedding_func,
             audio_data_embedding_func=audio_data_embedding_func,
-            placeholder_token_ids=placeholder_token_ids,
+            placeholder_tokens=placeholder_tokens,
         )
-        # once used, mm_inputs is useless
+        # once used, mm_inputs is useless, considering chunked-prefill is disabled for multimodal models
        # just being defensive here
        forward_batch.mm_inputs = None
    else:

sglang/srt/managers/multimodal_processors/base_processor.py

@@ -8,6 +8,7 @@ from typing import List, Optional
 
 import numpy as np
 import PIL
+from PIL import Image
 from transformers import BaseImageProcessorFast
 
 from sglang.srt.managers.schedule_batch import Modality
@@ -92,7 +93,12 @@ class BaseMultimodalProcessor(ABC):
 
     @abstractmethod
    async def process_mm_data_async(
-        self, image_data, input_text, max_req_input_len, **kwargs
+        self,
+        image_data,
+        input_text,
+        request_obj,
+        max_req_input_len,
+        **kwargs,
     ):
         pass
 
@@ -104,6 +110,8 @@ class BaseMultimodalProcessor(ABC):
         from decord import VideoReader, cpu
 
         # Before processing inputs
+        if not image_data or len(image_data) == 0:
+            return []
         estimated_frames_list = []
         for image in image_data:
             if isinstance(image, str) and image.startswith("video:"):
@@ -215,6 +223,9 @@ class BaseMultimodalProcessor(ABC):
             discard_alpha_channel: if True, discards the alpha channel in the returned images
 
         """
+
+        if image_data is None:
+            image_data = []
         if isinstance(multimodal_tokens.image_token, int):
             multimodal_tokens.image_token = (
                 self._processor.tokenizer.convert_ids_to_tokens(
@@ -229,6 +240,8 @@ class BaseMultimodalProcessor(ABC):
             prompt = self._processor.tokenizer.decode(prompt)
         else:
             prompt = prompt
+
+        assert isinstance(prompt, str)
         if return_text:
             import re
 

sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py

@@ -16,6 +16,7 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+from typing import List, Union
 
 import torch
 
@@ -35,7 +36,13 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         self.IMAGE_TOKEN = "<image>"
 
     async def process_mm_data_async(
-        self, image_data, input_ids, request_obj, max_req_input_len, *args, **kwargs
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        max_req_input_len,
+        *args,
+        **kwargs
     ):
         if not image_data:
             return None
@@ -45,7 +52,7 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
 
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
-            input_ids,
+            input_text,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,

sglang/srt/managers/multimodal_processors/gemma3.py

@@ -1,7 +1,5 @@
 from typing import List, Union
 
-from transformers.utils import logging
-
 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
@@ -13,7 +11,6 @@ from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
 
 # Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py
 # will be removed in the future
-logger = logging.get_logger(__name__)
 
 
 class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
@@ -28,7 +25,7 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
-        input_ids,
+        input_text,
         request_obj,
         max_req_input_len,
         *args,
@@ -41,7 +38,7 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
 
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
-            prompt=input_ids,
+            prompt=input_text,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,

sglang/srt/managers/multimodal_processors/janus_pro.py

@@ -17,7 +17,7 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
-        input_ids,
+        input_text,
         request_obj,
         max_req_input_len,
         **kwargs,
@@ -31,7 +31,7 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
         processor = self._processor
 
         base_out = self.load_mm_data(
-            prompt=input_ids,
+            prompt=input_text,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(
                 image_token=processor.image_token

sglang/srt/managers/multimodal_processors/minicpm.py

@@ -51,9 +51,10 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
-        input_ids,
+        input_text,
         request_obj,
         max_req_input_len,
+        **kwargs,
     ):
         audio_data = request_obj.audio_data
         if not image_data and not audio_data:
@@ -64,7 +65,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             audio_data = [audio_data]
 
         base_output = self.load_mm_data(
-            prompt=input_ids,
+            prompt=input_text,
             max_req_input_len=max_req_input_len,
             audio_data=audio_data,
             image_data=image_data,
@@ -96,7 +97,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         audio_start_id = tokenizer.audio_start_id
         audio_end_id = tokenizer.audio_end_id
 
-        im_token_id = tokenizer.unk_token_id
+        im_token_id = tokenizer.unk_id
         pixel_values = res["pixel_values"]
         tgt_sizes = res["tgt_sizes"]
 

sglang/srt/managers/multimodal_processors/qwen_vl.py

@@ -5,6 +5,7 @@ from typing import List, Union
 import torch
 from PIL import Image
 
+from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
@@ -27,6 +28,8 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
         self.image_token_id = hf_config.image_token_id
         self.video_token_id = hf_config.video_token_id
+        self.vision_start_token_id = hf_config.vision_start_token_id
+        self.vision_end_token_id = hf_config.vision_end_token_id
         self.NUM_TOKEN_PER_FRAME = 770
         self.IMAGE_FACTOR = 28
         self.MIN_PIXELS = 4 * 28 * 28
@@ -36,20 +39,18 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
-        prompt,
+        input_text,
         request_obj,
         max_req_input_len,
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
         if isinstance(image_data, str):
             image_data = [image_data]
 
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
-            prompt=prompt,
+            prompt=input_text,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,
@@ -116,29 +117,53 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         async def resize_image_async(image):
             return resize_image(image)
 
-        resize_tasks = [resize_image_async(image) for image in base_output.images]
-        resized_images = await asyncio.gather(*resize_tasks)
+        if base_output.images:
+            resize_tasks = [resize_image_async(image) for image in base_output.images]
+            base_output.images = await asyncio.gather(*resize_tasks)
 
         ret = self.process_mm_data(
             input_text=base_output.input_text,
-            images=resized_images,
+            images=base_output.images,
         )
 
-        image_grid_thws = torch.concat([ret["image_grid_thw"]])
-        return {
-            "input_ids": ret["input_ids"].flatten().tolist(),
-            "mm_items": [
+        items = []
+
+        input_ids = ret["input_ids"].flatten().tolist()
+        if "pixel_values" in ret:
+            items += [
                 MultimodalDataItem(
                     pixel_values=ret["pixel_values"],
-                    image_grid_thws=image_grid_thws,
+                    image_grid_thws=torch.concat([ret["image_grid_thw"]]),
                     # TODO
                     video_grid_thws=None,
                     second_per_grid_ts=ret.get("second_per_grid_ts", None),
                     modality=Modality.IMAGE,
                 )
-            ],
+            ]
+
+        mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
+            spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
+            image_token_id=self.image_token_id,
+            video_token_id=self.video_token_id,
+            vision_start_token_id=self.vision_start_token_id,
+            model_type=self.hf_config.model_type,
+            tokens_per_second=getattr(
+                self.hf_config.vision_config, "tokens_per_second", None
+            ),
+            input_ids=torch.tensor(input_ids).unsqueeze(0),
+            image_grid_thw=ret.get("image_grid_thw", None),
+            video_grid_thw=ret.get("video_grid_thw", None),
+            second_per_grid_ts=ret.get("second_per_grid_ts", None),
+        )
+        mrope_positions = mrope_positions.squeeze(1)
+
+        return {
+            "input_ids": input_ids,
+            "mm_items": items,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
             "im_token_id": self.image_token_id,
             "video_token_id": self.video_token_id,
+            "mrope_positions": mrope_positions,
+            "mrope_position_delta": mrope_position_delta,
         }
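
The processor now precomputes the M-RoPE position ids via MRotaryEmbedding.get_rope_index and returns them alongside the token ids; the positions carry three components per token (temporal, height, width), and for plain text tokens all three are assumed to equal the ordinary position index. A toy sketch of that shape convention only (not sglang's implementation):

    import torch

    seq_len = 6
    text_positions = torch.arange(seq_len)
    # for a text-only prompt, the three M-RoPE components are identical
    mrope_positions = text_positions.unsqueeze(0).expand(3, -1)
    print(mrope_positions.shape)  # torch.Size([3, 6])
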