sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (180)
  1. sglang/bench_offline_throughput.py +10 -8
  2. sglang/bench_one_batch.py +7 -6
  3. sglang/bench_one_batch_server.py +157 -21
  4. sglang/bench_serving.py +137 -59
  5. sglang/compile_deep_gemm.py +5 -5
  6. sglang/eval/loogle_eval.py +157 -0
  7. sglang/lang/chat_template.py +78 -78
  8. sglang/lang/tracer.py +1 -1
  9. sglang/srt/code_completion_parser.py +1 -1
  10. sglang/srt/configs/deepseekvl2.py +2 -2
  11. sglang/srt/configs/model_config.py +40 -28
  12. sglang/srt/constrained/base_grammar_backend.py +55 -72
  13. sglang/srt/constrained/llguidance_backend.py +25 -21
  14. sglang/srt/constrained/outlines_backend.py +27 -26
  15. sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
  16. sglang/srt/constrained/xgrammar_backend.py +69 -43
  17. sglang/srt/conversation.py +49 -44
  18. sglang/srt/disaggregation/base/conn.py +1 -0
  19. sglang/srt/disaggregation/decode.py +129 -135
  20. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  21. sglang/srt/disaggregation/fake/conn.py +3 -13
  22. sglang/srt/disaggregation/kv_events.py +357 -0
  23. sglang/srt/disaggregation/mini_lb.py +57 -24
  24. sglang/srt/disaggregation/mooncake/conn.py +238 -122
  25. sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
  26. sglang/srt/disaggregation/nixl/conn.py +10 -19
  27. sglang/srt/disaggregation/prefill.py +132 -47
  28. sglang/srt/disaggregation/utils.py +123 -6
  29. sglang/srt/distributed/utils.py +3 -3
  30. sglang/srt/entrypoints/EngineBase.py +5 -0
  31. sglang/srt/entrypoints/engine.py +44 -9
  32. sglang/srt/entrypoints/http_server.py +23 -6
  33. sglang/srt/entrypoints/http_server_engine.py +5 -2
  34. sglang/srt/function_call/base_format_detector.py +250 -0
  35. sglang/srt/function_call/core_types.py +34 -0
  36. sglang/srt/function_call/deepseekv3_detector.py +157 -0
  37. sglang/srt/function_call/ebnf_composer.py +234 -0
  38. sglang/srt/function_call/function_call_parser.py +175 -0
  39. sglang/srt/function_call/llama32_detector.py +74 -0
  40. sglang/srt/function_call/mistral_detector.py +84 -0
  41. sglang/srt/function_call/pythonic_detector.py +163 -0
  42. sglang/srt/function_call/qwen25_detector.py +67 -0
  43. sglang/srt/function_call/utils.py +35 -0
  44. sglang/srt/hf_transformers_utils.py +46 -7
  45. sglang/srt/layers/attention/aiter_backend.py +513 -0
  46. sglang/srt/layers/attention/flashattention_backend.py +64 -18
  47. sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
  48. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  49. sglang/srt/layers/attention/triton_backend.py +3 -0
  50. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
  51. sglang/srt/layers/attention/utils.py +6 -4
  52. sglang/srt/layers/attention/vision.py +1 -1
  53. sglang/srt/layers/communicator.py +451 -0
  54. sglang/srt/layers/dp_attention.py +61 -21
  55. sglang/srt/layers/layernorm.py +1 -1
  56. sglang/srt/layers/logits_processor.py +46 -11
  57. sglang/srt/layers/moe/cutlass_moe.py +207 -0
  58. sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
  59. sglang/srt/layers/moe/ep_moe/layer.py +105 -51
  60. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
  63. sglang/srt/layers/moe/topk.py +67 -10
  64. sglang/srt/layers/multimodal.py +70 -0
  65. sglang/srt/layers/quantization/__init__.py +8 -3
  66. sglang/srt/layers/quantization/blockwise_int8.py +2 -2
  67. sglang/srt/layers/quantization/deep_gemm.py +77 -74
  68. sglang/srt/layers/quantization/fp8.py +92 -2
  69. sglang/srt/layers/quantization/fp8_kernel.py +3 -3
  70. sglang/srt/layers/quantization/fp8_utils.py +6 -0
  71. sglang/srt/layers/quantization/gptq.py +298 -6
  72. sglang/srt/layers/quantization/int8_kernel.py +20 -7
  73. sglang/srt/layers/quantization/qoq.py +244 -0
  74. sglang/srt/layers/sampler.py +0 -4
  75. sglang/srt/layers/vocab_parallel_embedding.py +18 -7
  76. sglang/srt/lora/lora_manager.py +2 -4
  77. sglang/srt/lora/mem_pool.py +4 -4
  78. sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
  79. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  80. sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
  81. sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
  82. sglang/srt/lora/utils.py +1 -1
  83. sglang/srt/managers/data_parallel_controller.py +3 -3
  84. sglang/srt/managers/deepseek_eplb.py +278 -0
  85. sglang/srt/managers/detokenizer_manager.py +21 -8
  86. sglang/srt/managers/eplb_manager.py +55 -0
  87. sglang/srt/managers/expert_distribution.py +704 -56
  88. sglang/srt/managers/expert_location.py +394 -0
  89. sglang/srt/managers/expert_location_dispatch.py +91 -0
  90. sglang/srt/managers/io_struct.py +19 -4
  91. sglang/srt/managers/mm_utils.py +294 -140
  92. sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
  93. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  94. sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
  95. sglang/srt/managers/multimodal_processors/internvl.py +14 -5
  96. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  97. sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
  98. sglang/srt/managers/multimodal_processors/llava.py +46 -0
  99. sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
  100. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  101. sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
  102. sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
  103. sglang/srt/managers/schedule_batch.py +122 -42
  104. sglang/srt/managers/schedule_policy.py +1 -5
  105. sglang/srt/managers/scheduler.py +205 -138
  106. sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
  107. sglang/srt/managers/session_controller.py +1 -1
  108. sglang/srt/managers/tokenizer_manager.py +232 -58
  109. sglang/srt/managers/tp_worker.py +12 -9
  110. sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
  111. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  112. sglang/srt/mem_cache/chunk_cache.py +3 -1
  113. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  114. sglang/srt/mem_cache/memory_pool.py +76 -52
  115. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  116. sglang/srt/mem_cache/radix_cache.py +58 -5
  117. sglang/srt/metrics/collector.py +314 -39
  118. sglang/srt/mm_utils.py +10 -0
  119. sglang/srt/model_executor/cuda_graph_runner.py +29 -19
  120. sglang/srt/model_executor/expert_location_updater.py +422 -0
  121. sglang/srt/model_executor/forward_batch_info.py +5 -1
  122. sglang/srt/model_executor/model_runner.py +163 -68
  123. sglang/srt/model_loader/loader.py +10 -6
  124. sglang/srt/models/clip.py +5 -1
  125. sglang/srt/models/deepseek_janus_pro.py +2 -2
  126. sglang/srt/models/deepseek_v2.py +308 -351
  127. sglang/srt/models/exaone.py +8 -3
  128. sglang/srt/models/gemma3_mm.py +70 -33
  129. sglang/srt/models/llama.py +2 -0
  130. sglang/srt/models/llama4.py +15 -8
  131. sglang/srt/models/llava.py +258 -7
  132. sglang/srt/models/mimo_mtp.py +220 -0
  133. sglang/srt/models/minicpmo.py +5 -12
  134. sglang/srt/models/mistral.py +71 -1
  135. sglang/srt/models/mixtral.py +98 -34
  136. sglang/srt/models/mllama.py +3 -3
  137. sglang/srt/models/pixtral.py +467 -0
  138. sglang/srt/models/qwen2.py +95 -26
  139. sglang/srt/models/qwen2_5_vl.py +8 -0
  140. sglang/srt/models/qwen2_moe.py +330 -60
  141. sglang/srt/models/qwen2_vl.py +6 -0
  142. sglang/srt/models/qwen3.py +52 -10
  143. sglang/srt/models/qwen3_moe.py +411 -48
  144. sglang/srt/models/roberta.py +1 -1
  145. sglang/srt/models/siglip.py +294 -0
  146. sglang/srt/models/torch_native_llama.py +1 -1
  147. sglang/srt/openai_api/adapter.py +58 -20
  148. sglang/srt/openai_api/protocol.py +6 -8
  149. sglang/srt/operations.py +154 -0
  150. sglang/srt/operations_strategy.py +31 -0
  151. sglang/srt/reasoning_parser.py +3 -3
  152. sglang/srt/sampling/custom_logit_processor.py +18 -3
  153. sglang/srt/sampling/sampling_batch_info.py +4 -56
  154. sglang/srt/sampling/sampling_params.py +2 -2
  155. sglang/srt/server_args.py +162 -22
  156. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  157. sglang/srt/speculative/eagle_utils.py +138 -7
  158. sglang/srt/speculative/eagle_worker.py +69 -21
  159. sglang/srt/utils.py +74 -17
  160. sglang/test/few_shot_gsm8k.py +2 -2
  161. sglang/test/few_shot_gsm8k_engine.py +2 -2
  162. sglang/test/run_eval.py +2 -2
  163. sglang/test/runners.py +8 -1
  164. sglang/test/send_one.py +13 -3
  165. sglang/test/simple_eval_common.py +1 -1
  166. sglang/test/simple_eval_humaneval.py +1 -1
  167. sglang/test/test_cutlass_moe.py +278 -0
  168. sglang/test/test_programs.py +5 -5
  169. sglang/test/test_utils.py +55 -14
  170. sglang/utils.py +3 -3
  171. sglang/version.py +1 -1
  172. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
  173. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
  174. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
  175. sglang/srt/function_call_parser.py +0 -858
  176. sglang/srt/platforms/interface.py +0 -371
  177. /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
  178. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  179. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
  180. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/base_processor.py

@@ -3,16 +3,16 @@ import concurrent.futures
  import dataclasses
  import multiprocessing as mp
  import os
+ import re
  from abc import ABC, abstractmethod
- from typing import List, Optional
+ from typing import List, Optional, Tuple, Union

  import numpy as np
- import PIL
  import torch
  from PIL import Image
  from transformers import BaseImageProcessorFast

- from sglang.srt.managers.schedule_batch import Modality
+ from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
  from sglang.srt.utils import encode_video, load_audio, load_image


@@ -22,13 +22,13 @@ class BaseMultiModalProcessorOutput:
  input_text: str

  # frames loaded from image and video, in given order
- images: Optional[list[PIL.Image]] = None
+ images: Optional[list[Union[Image.Image, MultimodalDataItem]]] = None

  # audios
- audios: Optional[list[np.ndarray]] = None
+ audios: Optional[list[Union[np.ndarray, MultimodalDataItem]]] = None

  def normalize(self):
- for field_name in ["image_sizes", "images", "audios"]:
+ for field_name in ["images", "audios"]:
  field = getattr(self, field_name, None)
  if field is not None and isinstance(field, list) and len(field) == 0:
  setattr(self, field_name, None)
@@ -36,16 +36,48 @@ class BaseMultiModalProcessorOutput:

  @dataclasses.dataclass
  class MultimodalSpecialTokens:
- image_token: Optional[str] = None
- video_token: Optional[str] = None
- audio_token: Optional[str] = None
-
- def collect(self) -> list[str]:
- return [
- token
- for token in [self.image_token, self.video_token, self.audio_token]
- if token
+ image_token: Optional[Union[int, str, List[str]]] = None
+ video_token: Optional[Union[int, str, List[str]]] = None
+ audio_token: Optional[Union[int, str, List[str]]] = None
+
+ def convert_to_str(self, token: Union[str, int], processor) -> str:
+ if token is None:
+ return token
+ if isinstance(token, str):
+ return token
+ return processor.tokenizer.convert_ids_to_tokens([token])[0]
+
+ def convert_to_strs(self, processor):
+ self.image_token = self.convert_to_str(self.image_token, processor)
+ self.video_token = self.convert_to_str(self.video_token, processor)
+ self.audio_token = self.convert_to_str(self.audio_token, processor)
+
+ image_token_regex: Optional[re.Pattern] = None
+ video_token_regex: Optional[re.Pattern] = None
+ audio_token_regex: Optional[re.Pattern] = None
+
+ def __post_init__(self):
+ if self.image_token_regex is None and self.image_token is not None:
+ self.image_token_regex = re.compile(re.escape(self.image_token))
+ if self.video_token_regex is None and self.video_token is not None:
+ self.video_token_regex = re.compile(re.escape(self.video_token))
+ if self.audio_token_regex is None and self.audio_token is not None:
+ self.audio_token_regex = re.compile(re.escape(self.audio_token))
+
+ def collect(self) -> re.Pattern:
+ tokens = [
+ self.image_token_regex,
+ self.video_token_regex,
+ self.audio_token_regex,
  ]
+ patterns = []
+ flags = 0
+ for t in tokens:
+ if t is not None:
+ patterns.append(t.pattern)
+ flags |= t.flags
+ combined = "(" + "|".join(f"(?:{p})" for p in patterns) + ")"
+ return re.compile(combined, flags)


  class BaseMultimodalProcessor(ABC):
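
Note: the reworked MultimodalSpecialTokens above carries optional pre-compiled regexes, and collect() now returns one combined re.Pattern instead of a list of literal tokens; load_mm_data later feeds that pattern to re.split so matched special tokens are kept as delimiters. A minimal sketch of the same idea with the stdlib re module and hypothetical placeholder tokens (the token strings below are illustrative, not any model's real tokens):

import re

# Hypothetical special tokens, for illustration only.
image_token = "<image>"
audio_token = "<audio>"

# Mirror of collect(): escape each token, wrap each in a non-capturing group,
# and join them inside one capturing group so re.split keeps the delimiters.
combined = "(" + "|".join(f"(?:{re.escape(t)})" for t in (image_token, audio_token)) + ")"
pattern = re.compile(combined)

parts = re.split(pattern, "describe <image> then transcribe <audio>")
# parts == ['describe ', '<image>', ' then transcribe ', '<audio>', '']
# Each delimiter part can later be re-checked with pattern.match(part) while iterating.
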
@@ -54,6 +86,7 @@ class BaseMultimodalProcessor(ABC):
  def __init__(self, hf_config, server_args, _processor):
  self.hf_config = hf_config
  self._processor = _processor
+ self.arch = hf_config.architectures[0]
  self.server_args = server_args
  # FIXME: not accurate, model and image specific
  self.NUM_TOKEN_PER_FRAME = 330
@@ -136,6 +169,10 @@ class BaseMultimodalProcessor(ABC):
  data, is_video, is_audio, frame_count_limit=None, discard_alpha_channel=True
  ):
  """Static method that can be pickled for multiprocessing"""
+ if isinstance(data, dict):
+ return MultimodalDataItem.from_dict(data)
+ if isinstance(data, MultimodalDataItem):
+ return data
  try:
  if is_audio:
  return load_audio(data)
@@ -175,7 +212,10 @@ class BaseMultimodalProcessor(ABC):
  image_index, audio_index = 0, 0

  for text_part in text_parts:
- if text_part == multimodal_tokens.image_token:
+ if (
+ multimodal_tokens.image_token_regex
+ and multimodal_tokens.image_token_regex.match(text_part)
+ ):
  data = image_data[image_index]
  is_video = isinstance(data, str) and data.startswith("video:")
  estimated_frames = estimated_frames_list[image_index]
@@ -192,7 +232,10 @@ class BaseMultimodalProcessor(ABC):
  )
  task_info.append((Modality.IMAGE, data, frame_count_limit))
  image_index += 1
- elif text_part == multimodal_tokens.audio_token:
+ elif (
+ multimodal_tokens.audio_token_regex
+ and multimodal_tokens.audio_token_regex.match(text_part)
+ ):
  data = audio_data[audio_index]
  futures.append(
  self.io_executor.submit(
@@ -228,17 +271,13 @@ class BaseMultimodalProcessor(ABC):
  discard_alpha_channel: if True, discards the alpha channel in the returned images

  """
-
+ if not return_text:
+ raise NotImplementedError()
  if image_data is None:
  image_data = []
- if isinstance(multimodal_tokens.image_token, int):
- multimodal_tokens.image_token = (
- self._processor.tokenizer.convert_ids_to_tokens(
- multimodal_tokens.image_token
- )
- )
- else:
- multimodal_tokens.image_token = multimodal_tokens.image_token
+
+ multimodal_tokens.convert_to_strs(self._processor)
+ multimodal_tokens_pattern = multimodal_tokens.collect()

  if isinstance(prompt, list) and return_text:
  assert len(prompt) and isinstance(prompt[0], int)
@@ -247,16 +286,8 @@ class BaseMultimodalProcessor(ABC):
  prompt = prompt

  assert isinstance(prompt, str)
- if return_text:
- import re
-
- pattern = (
- "("
- + "|".join(re.escape(sep) for sep in multimodal_tokens.collect())
- + ")"
- )
- # split text into list of normal text and special tokens
- text_parts = re.split(pattern, prompt)
+ # split text into list of normal text and special tokens
+ text_parts = re.split(multimodal_tokens_pattern, prompt)

  futures, task_info = self.submit_data_loading_tasks(
  text_parts=text_parts,
@@ -266,34 +297,88 @@ class BaseMultimodalProcessor(ABC):
  discard_alpha_channel=discard_alpha_channel,
  )
  # Process results
- image_sizes, images, audios = [], [], []
+ images, audios = [], []
  new_text = ""
  task_ptr = 0

  for text_part in text_parts:
- if text_part in multimodal_tokens.collect():
+ if multimodal_tokens_pattern.match(text_part):
  task_type, data, frame_limit = task_info[task_ptr]
  result = futures[task_ptr].result()
  task_ptr += 1

  if task_type == Modality.IMAGE:
+ # If data is already processed it will be a
+ # dictionary. In this case we want to keep the
+ # expanded tokens in text_part. Otherwise, we will
+ # call the processor code, so keep only a single image
+ # token.
+ mm_tokens = (
+ text_part
+ if isinstance(data, dict)
+ else multimodal_tokens.image_token
+ )
  frames = [result] if not isinstance(result, list) else result
  if frames:
- image_sizes += frames[0].size * len(frames)
  images += frames
- new_text += multimodal_tokens.image_token * len(frames)
+ new_text += mm_tokens * len(frames)
  elif task_type == Modality.AUDIO:
  # audio
+ mm_tokens = (
+ text_part
+ if isinstance(data, dict)
+ else multimodal_tokens.audio_token
+ )
  audios.append(result)
- new_text += multimodal_tokens.audio_token
+ new_text += mm_tokens
  # TODO: handle video
  else:
  new_text += text_part

  out = BaseMultiModalProcessorOutput(
+ input_text=new_text,
  images=images,
  audios=audios,
- input_text=new_text,
  )
  out.normalize()
  return out
+
+ @staticmethod
+ def get_mm_items_offset(
+ input_ids: torch.Tensor, mm_token_id: int
+ ) -> List[Tuple[int, int]]:
+ """
+ Get a set of range for mm_items from input_ids
+ Example:
+ input_ids = [1, 2, 3, 3, 3, 4, 3, 3]
+ mm_token_id = 3
+ return result = [(2,4),(6,7)]
+ """
+ mask = input_ids == mm_token_id
+
+ start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
+ end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
+
+ return list(zip(start_positions.tolist(), end_positions.tolist()))
+
+ @staticmethod
+ def get_mm_items_offset_by_pair(
+ input_ids: torch.Tensor, mm_start_id: int, mm_end_id: int
+ ) -> List[Tuple[int, int]]:
+ indices_start = (input_ids == mm_start_id).nonzero(as_tuple=True)[0] + 1
+ indices_end = (input_ids == mm_end_id).nonzero(as_tuple=True)[0] - 1
+
+ return list(zip(indices_start.tolist(), indices_end.tolist()))
+
+ def mm_inputs_are_preprocessed(self, mm_inputs: Optional[list]):
+ """Returns true if all images are preprocessed, false if all are not, and error otherwise."""
+ if not mm_inputs:
+ return True
+ ret = any(isinstance(mm_input, MultimodalDataItem) for mm_input in mm_inputs)
+ if ret and not all(
+ isinstance(mm_input, MultimodalDataItem) for mm_input in mm_inputs
+ ):
+ raise ValueError(
+ "Unsupported: mixture of multimodal inputs where some but not all are preprocessed."
+ )
+ return ret
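
Note: get_mm_items_offset above locates contiguous runs of a multimodal placeholder token and returns inclusive (start, end) index pairs; the per-model processors below attach these as image_offsets on each MultimodalDataItem. A small self-contained check of the mask-and-roll trick on the example from its docstring (plain PyTorch, outside any processor class):

import torch

def runs_of_token(input_ids: torch.Tensor, mm_token_id: int):
    # Boolean mask of positions that hold the placeholder token.
    mask = input_ids == mm_token_id
    # A run starts where the mask turns on and ends where it turns off.
    # torch.roll wraps around, so this assumes the sequence does not both
    # begin and end with the placeholder token (true for normal chat prompts).
    starts = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
    ends = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
    return list(zip(starts.tolist(), ends.tolist()))

print(runs_of_token(torch.tensor([1, 2, 3, 3, 3, 4, 3, 3]), mm_token_id=3))
# [(2, 4), (6, 7)]
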

sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py

@@ -70,8 +70,13 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
  batched_images_spatial_crop = torch.stack(batched_images_spatial_crop, dim=0)

  items = []
+ input_ids = res["input_ids"]
+ image_offsets = self.get_mm_items_offset(
+ input_ids=input_ids, mm_token_id=self._processor.image_token_id
+ )
  item = MultimodalDataItem(
  pixel_values=res["images"],
+ image_offsets=image_offsets,
  modality=Modality.IMAGE,
  image_emb_mask=images_seq_mask,
  image_spatial_crop=batched_images_spatial_crop,
@@ -80,6 +85,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):

  return {
  "mm_items": items,
- "input_ids": res["input_ids"].tolist(),
+ "input_ids": input_ids.tolist(),
  "im_token_id": self._processor.image_token_id,
  }

sglang/srt/managers/multimodal_processors/gemma3.py

@@ -1,4 +1,5 @@
- from typing import List, Union
+ import re
+ from typing import Dict, List, Union

  from sglang.srt.managers.multimodal_processor import (
  BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -18,13 +19,18 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):

  def __init__(self, hf_config, server_args, _processor):
  super().__init__(hf_config, server_args, _processor)
+ # The single, pre-expanded image token.
  self.IMAGE_TOKEN = "<start_of_image>"
+ # The regex that matches expanded image tokens.
+ self.IMAGE_TOKEN_REGEX = re.compile(
+ r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+ )
  self.IM_START_TOKEN_ID = hf_config.boi_token_index
  self.IM_END_TOKEN_ID = hf_config.eoi_token_index

  async def process_mm_data_async(
  self,
- image_data: List[Union[str, bytes]],
+ image_data: List[Union[str, bytes, Dict]],
  input_text,
  request_obj,
  max_req_input_len,
@@ -37,29 +43,48 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
  image_data = [image_data]

  image_token = self.IMAGE_TOKEN
+ image_token_regex = self.IMAGE_TOKEN_REGEX
  base_output = self.load_mm_data(
  prompt=input_text,
  image_data=image_data,
- multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
+ multimodal_tokens=MultimodalSpecialTokens(
+ image_token=image_token, image_token_regex=image_token_regex
+ ),
  max_req_input_len=max_req_input_len,
  discard_alpha_channel=True,
  )

+ images_are_preprocessed = self.mm_inputs_are_preprocessed(base_output.images)
  ret = self.process_mm_data(
- input_text=base_output.input_text, images=base_output.images
+ input_text=base_output.input_text,
+ images=None if images_are_preprocessed else base_output.images,
  )

  items = []
+ input_ids = ret["input_ids"].flatten()
+ image_offsets = self.get_mm_items_offset(
+ input_ids=input_ids,
+ mm_token_id=self.hf_config.image_token_index,
+ )
  for i, image in enumerate(base_output.images):
+ if images_are_preprocessed:
+ pixel_values = image.pixel_values
+ precomputed_features = image.precomputed_features
+ else:
+ pixel_values = ret["pixel_values"][i]
+ precomputed_features = None
+
  item = MultimodalDataItem(
- pixel_values=ret["pixel_values"][i],
+ pixel_values=pixel_values,
+ precomputed_features=precomputed_features,
  modality=Modality.IMAGE,
+ image_offsets=image_offsets[i],
  )
  items += [item]

  return {
  "mm_items": items,
- "input_ids": ret["input_ids"].flatten().tolist(),
+ "input_ids": input_ids.tolist(),
  "im_start_id": self.IM_START_TOKEN_ID,
  "im_end_id": self.IM_END_TOKEN_ID,
  }
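
Note: the IMAGE_TOKEN_REGEX introduced above lets the Gemma3 processor recognize both the bare placeholder and a span that was already expanded by an earlier preprocessing pass, which is what allows precomputed image inputs to keep their expanded tokens. A quick illustration with the regex copied from the diff (pure stdlib re):

import re

IMAGE_TOKEN_REGEX = re.compile(
    r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
)

# The bare, not-yet-expanded placeholder matches...
assert IMAGE_TOKEN_REGEX.fullmatch("<start_of_image>")
# ...and so does an already-expanded span of soft tokens.
expanded = "<start_of_image>" + "<image_soft_token>" * 4 + "<end_of_image>"
assert IMAGE_TOKEN_REGEX.fullmatch(expanded)
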

sglang/srt/managers/multimodal_processors/internvl.py

@@ -3,7 +3,6 @@
  import numpy as np
  import torch
  from decord import VideoReader, cpu
- from numpy.distutils.cpuinfo import cpu
  from PIL import Image

  from sglang.srt.managers.multimodal_processors.base_processor import (
@@ -210,7 +209,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
  return None

  pixel_values = torch.cat(pixel_values, dim=0)
- items = [MultimodalDataItem(pixel_values=pixel_values, modality=Modality.IMAGE)]

  for idx, num_patches in enumerate(num_patches_list):
  image_tokens = (
@@ -221,10 +219,21 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
  input_text = input_text.replace("<image>", image_tokens, 1)

  tokenizer = self._processor
+ input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].flatten()
+ image_offsets = self.get_mm_items_offset(
+ input_ids=input_ids,
+ mm_token_id=self.img_context_token_id,
+ )
+ items = [
+ MultimodalDataItem(
+ pixel_values=pixel_values,
+ modality=Modality.IMAGE,
+ image_offsets=image_offsets,
+ )
+ ]
+
  return {
- "input_ids": tokenizer(input_text, return_tensors="pt")["input_ids"]
- .flatten()
- .tolist(),
+ "input_ids": input_ids.tolist(),
  "mm_items": items,
  "im_start_id": self.img_start_token_id,
  "im_end_id": self.img_end_token_id,

sglang/srt/managers/multimodal_processors/janus_pro.py

@@ -45,15 +45,21 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
  prompt=base_out.input_text,
  images=images,
  )
+
+ input_ids = res["input_ids"].flatten()
+ image_offsets = self.get_mm_items_offset(
+ input_ids=input_ids, mm_token_id=processor.image_id
+ )
  return {
  "mm_items": [
  MultimodalDataItem(
  pixel_values=res["pixel_values"],
  image_emb_mask=res["images_emb_mask"],
+ image_offsets=image_offsets,
  modality=Modality.IMAGE,
  )
  ],
- "input_ids": res["input_ids"].flatten().tolist(),
+ "input_ids": input_ids.tolist(),
  "im_start_id": processor.image_start_id,
  "im_end_id": processor.image_end_id,
  "im_token_id": processor.image_id,

sglang/srt/managers/multimodal_processors/kimi_vl.py

@@ -1,10 +1,5 @@
- import asyncio
- import math
  from typing import List, Union

- import torch
- from PIL import Image
-
  from sglang.srt.managers.multimodal_processors.base_processor import (
  BaseMultimodalProcessor as SGLangBaseProcessor,
  )
@@ -57,13 +52,19 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
  input_text=base_output.input_text,
  images=base_output.images,
  )
+ input_ids = ret["input_ids"].flatten()
+ image_offsets = self.get_mm_items_offset(
+ input_ids=input_ids,
+ mm_token_id=self.im_token_id,
+ )
  return {
- "input_ids": ret["input_ids"].flatten().tolist(),
+ "input_ids": input_ids.tolist(),
  "mm_items": [
  MultimodalDataItem(
  pixel_values=ret["pixel_values"],
  image_grid_thws=ret["image_grid_hws"],
  modality=Modality.IMAGE,
+ image_offsets=image_offsets,
  )
  ],
  "im_token_id": self.im_token_id,

sglang/srt/managers/multimodal_processors/llava.py

@@ -2,18 +2,24 @@ import asyncio
  from typing import List, Optional, Union

  import numpy as np
+ from transformers.models.auto.processing_auto import (
+ PROCESSOR_MAPPING_NAMES as HF_MAPPING_NAMES,
+ )

+ import sglang.srt.managers.multimodal_processor as sgl_mm_processor_utils
  from sglang.srt.managers.multimodal_processors.base_processor import (
  BaseMultimodalProcessor,
  )
  from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
  from sglang.srt.mm_utils import expand2square, process_anyres_image
  from sglang.srt.models.llava import (
+ LlavaForConditionalGeneration,
  LlavaLlamaForCausalLM,
  LlavaMistralForCausalLM,
  LlavaQwenForCausalLM,
  )
  from sglang.srt.models.llavavid import LlavaVidForCausalLM
+ from sglang.srt.models.mistral import Mistral3ForConditionalGeneration
  from sglang.srt.utils import load_image, logger
  from sglang.utils import get_exception_traceback

@@ -133,6 +139,7 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
  img_data, aspect_ratio, grid_pinpoints
  )
  )
+
  res = await asyncio.gather(*res)
  for pixel_v, image_h, image_s in res:
  pixel_values.append(pixel_v)
@@ -165,3 +172,42 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
  )
  ],
  }
+
+
+ class LlavaMultimodalProcessor(BaseMultimodalProcessor):
+ """
+ This is a wrapper class used to identify the multimodal processor for Llava architectures' vision model.
+ """
+
+ models = [LlavaForConditionalGeneration, Mistral3ForConditionalGeneration]
+
+ def _get_sgl_processor_cls(self, model_type: str):
+ if hf_name := HF_MAPPING_NAMES.get(model_type):
+ sgl_mm_processor_set = sgl_mm_processor_utils.PROCESSOR_MAPPING.values()
+ sgl_processor_cls = list(
+ filter(lambda p: p.__name__ == hf_name, sgl_mm_processor_set)
+ )
+ if sgl_processor_cls:
+ return sgl_processor_cls[0]
+ raise ValueError(
+ f"Cannot find corresponding multimodal processor registered in sglang for model type `{model_type}`"
+ )
+
+ def __init__(self, hf_config, server_args, _processor):
+ assert hasattr(hf_config, "vision_config")
+ assert hasattr(hf_config, "text_config")
+ self.vision_config = hf_config.vision_config
+ self.text_config = hf_config.text_config
+ self.hf_config = hf_config
+
+ if vision_type := getattr(self.vision_config, "model_type"):
+ self.inner = self._get_sgl_processor_cls(vision_type)(
+ hf_config, server_args, _processor
+ )
+ else:
+ raise ValueError(
+ f"Required `vision_config.model_type` is not found in hf_config: `{hf_config}`"
+ )
+
+ async def process_mm_data_async(self, *args, **kwargs):
+ return await self.inner.process_mm_data_async(*args, **kwargs)
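
Note: the new LlavaMultimodalProcessor is a thin router: it resolves a concrete processor class from the vision config's model_type once, then forwards every call to it. A stripped-down sketch of that resolve-and-delegate pattern (the registry and class names here are hypothetical stand-ins, not sglang's actual PROCESSOR_MAPPING):

from typing import Dict, Type

class DelegatingProcessor:
    """Pick an inner processor from a registry at construction time and forward calls to it."""

    def __init__(self, model_type: str, registry: Dict[str, Type], *args, **kwargs):
        inner_cls = registry.get(model_type)
        if inner_cls is None:
            raise ValueError(f"no processor registered for model type `{model_type}`")
        self.inner = inner_cls(*args, **kwargs)

    async def process_mm_data_async(self, *args, **kwargs):
        # Pure delegation: the wrapper adds no behavior of its own.
        return await self.inner.process_mm_data_async(*args, **kwargs)

Resolving once in the constructor keeps the per-request path free of lookups, which matches how the diff's wrapper constructs self.inner eagerly and only awaits the inner processor afterwards.
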

sglang/srt/managers/multimodal_processors/minicpm.py

@@ -1,7 +1,6 @@
  from typing import List, Union

  import torch
- from transformers import BaseImageProcessorFast

  from sglang.srt.managers.multimodal_processors.base_processor import (
  BaseMultimodalProcessor,
@@ -21,33 +20,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
  self.image_token = "(<image>./</image>)"
  self.audio_token = "(<audio>./</audio>)"

- def process_data_task(self, input_text, images=None, audios=None):
-
- if isinstance(images, list) and len(images) == 0:
- images = None
- if isinstance(audios, list) and len(audios) == 0:
- audios = None
- processor = self._processor
- args = {}
- if isinstance(processor, BaseImageProcessorFast):
- args["device"] = "cuda"
- result = self._processor.__call__(
- text=input_text,
- images=images,
- audios=audios,
- return_tensors="pt",
- chunk_input=True,
- **args,
- )
- return {
- "input_ids": result.input_ids,
- "pixel_values": getattr(result, "pixel_values", None),
- "tgt_sizes": getattr(result, "tgt_sizes", None),
- "audio_features": getattr(result, "audio_features", None),
- "audio_feature_lens": getattr(result, "audio_feature_lens", None),
- "audio_bounds": getattr(result, "audio_bounds", None),
- }
-
  async def process_mm_data_async(
  self,
  image_data: List[Union[str, bytes]],
@@ -97,6 +69,8 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
  audio_start_id = tokenizer.audio_start_id
  audio_end_id = tokenizer.audio_end_id

+ im_start_id = tokenizer.im_start_id
+ im_end_id = tokenizer.im_end_id
  im_token_id = tokenizer.unk_id
  pixel_values = res["pixel_values"]
  tgt_sizes = res["tgt_sizes"]
@@ -132,9 +106,20 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
  pixel_values = pixel_values_flat

  items = []
+ input_ids = res["input_ids"].flatten()
+ image_offsets = self.get_mm_items_offset_by_pair(
+ input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+ )
+ slice_offsets = self.get_mm_items_offset_by_pair(
+ input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
+ )
+ image_offsets.extend(slice_offsets)
+ image_offsets = sorted(image_offsets)
+
  if len(pixel_values) != 0:
  item = MultimodalDataItem(
  pixel_values=pixel_values,
+ image_offsets=image_offsets,
  tgt_size=tgt_sizes_flat,
  modality=Modality.IMAGE,
  )
@@ -145,21 +130,30 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
  and res["audio_features"] is not None
  and len(res["audio_features"]) != 0
  ):
+ if audio_start_id is not None and audio_end_id is not None:
+ audio_offsets = self.get_mm_items_offset_by_pair(
+ input_ids=input_ids,
+ mm_start_id=audio_start_id,
+ mm_end_id=audio_end_id,
+ )
+ else:
+ audio_offsets = None
  item = MultimodalDataItem(
  audio_features=[res["audio_features"]],
  audio_feature_lens=res["audio_feature_lens"],
+ audio_offsets=audio_offsets,
  modality=Modality.AUDIO,
  )
  items += [item]

  return {
  "mm_items": items,
- "input_ids": res["input_ids"].flatten().tolist(),
+ "input_ids": input_ids.tolist(),
  "audio_start_id": audio_start_id,
  "audio_end_id": audio_end_id,
  "im_token_id": im_token_id,
- "im_start_id": tokenizer.im_start_id,
- "im_end_id": tokenizer.im_end_id,
+ "im_start_id": im_start_id,
+ "im_end_id": im_end_id,
  "slice_start_id": slice_start_id,
  "slice_end_id": slice_end_id,
  }
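
Note: MiniCPM brackets each image (and each image slice) with explicit start/end marker tokens, so the offsets above come from get_mm_items_offset_by_pair rather than from runs of a single token id: the position just after each start marker is paired with the position just before each end marker. A small sketch on made-up token ids (the ids below are placeholders, not MiniCPM's real vocabulary):

import torch

def offsets_by_pair(input_ids: torch.Tensor, start_id: int, end_id: int):
    # First content position after each start marker, last content position
    # before each end marker; zip assumes the markers are balanced and in order.
    starts = (input_ids == start_id).nonzero(as_tuple=True)[0] + 1
    ends = (input_ids == end_id).nonzero(as_tuple=True)[0] - 1
    return list(zip(starts.tolist(), ends.tolist()))

# Hypothetical ids: 7 = image start, 8 = image end, 0 = image placeholder.
ids = torch.tensor([1, 7, 0, 0, 0, 8, 2, 7, 0, 0, 8])
print(offsets_by_pair(ids, start_id=7, end_id=8))  # [(2, 4), (8, 9)]

In the diff above, the image-pair offsets and slice-pair offsets are computed separately, merged, and sorted before being attached to the image MultimodalDataItem.
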

sglang/srt/managers/multimodal_processors/mllama4.py

@@ -135,11 +135,17 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
  processor_output["im_end_id"] = self.eoi_token_index
  processor_output["im_token_id"] = self.image_token_index

+ image_offsets = self.get_mm_items_offset(
+ input_ids=torch.tensor(processor_output["input_ids"]),
+ mm_token_id=self.image_token_index,
+ )
+
  # Add metadata for image processing
  processor_output["mm_items"] = [
  MultimodalDataItem(
  pixel_values=processor_output["pixel_values"],
  modality=Modality.IMAGE,
+ image_offsets=image_offsets,
  )
  ]