sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/_custom_ops.py +29 -1
  4. sglang/srt/configs/deepseekvl2.py +11 -2
  5. sglang/srt/configs/internvl.py +3 -0
  6. sglang/srt/configs/janus_pro.py +3 -0
  7. sglang/srt/configs/model_config.py +10 -8
  8. sglang/srt/configs/update_config.py +3 -1
  9. sglang/srt/conversation.py +2 -1
  10. sglang/srt/custom_op.py +5 -2
  11. sglang/srt/disaggregation/common/conn.py +34 -6
  12. sglang/srt/disaggregation/decode.py +9 -1
  13. sglang/srt/disaggregation/mini_lb.py +3 -2
  14. sglang/srt/disaggregation/mooncake/conn.py +93 -76
  15. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  16. sglang/srt/disaggregation/nixl/conn.py +17 -13
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  18. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  19. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  20. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  21. sglang/srt/distributed/parallel_state.py +103 -15
  22. sglang/srt/entrypoints/engine.py +31 -33
  23. sglang/srt/entrypoints/http_server.py +20 -32
  24. sglang/srt/entrypoints/openai/protocol.py +3 -3
  25. sglang/srt/entrypoints/openai/serving_chat.py +48 -6
  26. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  27. sglang/srt/function_call/base_format_detector.py +74 -12
  28. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  29. sglang/srt/function_call/ebnf_composer.py +95 -63
  30. sglang/srt/function_call/function_call_parser.py +4 -2
  31. sglang/srt/function_call/kimik2_detector.py +41 -16
  32. sglang/srt/function_call/llama32_detector.py +6 -3
  33. sglang/srt/function_call/mistral_detector.py +11 -3
  34. sglang/srt/function_call/pythonic_detector.py +16 -14
  35. sglang/srt/function_call/qwen25_detector.py +12 -3
  36. sglang/srt/function_call/qwen3_coder_detector.py +151 -0
  37. sglang/srt/hf_transformers_utils.py +0 -1
  38. sglang/srt/layers/activation.py +24 -3
  39. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  41. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  42. sglang/srt/layers/communicator.py +12 -12
  43. sglang/srt/layers/dp_attention.py +72 -24
  44. sglang/srt/layers/linear.py +13 -102
  45. sglang/srt/layers/logits_processor.py +34 -24
  46. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  47. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  49. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
  57. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  58. sglang/srt/layers/moe/topk.py +190 -23
  59. sglang/srt/layers/quantization/__init__.py +20 -134
  60. sglang/srt/layers/quantization/awq.py +578 -11
  61. sglang/srt/layers/quantization/awq_triton.py +339 -0
  62. sglang/srt/layers/quantization/base_config.py +85 -10
  63. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  64. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  65. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
  66. sglang/srt/layers/quantization/fp8.py +273 -62
  67. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  68. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  69. sglang/srt/layers/quantization/gptq.py +501 -143
  70. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  71. sglang/srt/layers/quantization/modelopt_quant.py +34 -112
  72. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  73. sglang/srt/layers/quantization/petit.py +252 -0
  74. sglang/srt/layers/quantization/petit_utils.py +104 -0
  75. sglang/srt/layers/quantization/qoq.py +7 -6
  76. sglang/srt/layers/quantization/scalar_type.py +352 -0
  77. sglang/srt/layers/quantization/unquant.py +422 -0
  78. sglang/srt/layers/quantization/utils.py +340 -9
  79. sglang/srt/layers/quantization/w4afp8.py +8 -4
  80. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  81. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  82. sglang/srt/layers/radix_attention.py +5 -3
  83. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  84. sglang/srt/lora/lora.py +0 -4
  85. sglang/srt/lora/lora_manager.py +162 -164
  86. sglang/srt/lora/lora_registry.py +124 -0
  87. sglang/srt/lora/mem_pool.py +83 -35
  88. sglang/srt/lora/utils.py +12 -5
  89. sglang/srt/managers/cache_controller.py +288 -0
  90. sglang/srt/managers/io_struct.py +60 -30
  91. sglang/srt/managers/mm_utils.py +7 -8
  92. sglang/srt/managers/schedule_batch.py +163 -113
  93. sglang/srt/managers/schedule_policy.py +68 -27
  94. sglang/srt/managers/scheduler.py +256 -86
  95. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  96. sglang/srt/managers/tokenizer_manager.py +38 -27
  97. sglang/srt/managers/tp_worker.py +16 -4
  98. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  99. sglang/srt/mem_cache/allocator.py +74 -23
  100. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  101. sglang/srt/mem_cache/chunk_cache.py +5 -2
  102. sglang/srt/mem_cache/hicache_storage.py +168 -0
  103. sglang/srt/mem_cache/hiradix_cache.py +194 -5
  104. sglang/srt/mem_cache/memory_pool.py +16 -1
  105. sglang/srt/mem_cache/memory_pool_host.py +44 -2
  106. sglang/srt/mem_cache/radix_cache.py +26 -0
  107. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  108. sglang/srt/metrics/collector.py +9 -0
  109. sglang/srt/model_executor/cuda_graph_runner.py +66 -31
  110. sglang/srt/model_executor/forward_batch_info.py +210 -25
  111. sglang/srt/model_executor/model_runner.py +147 -42
  112. sglang/srt/model_loader/loader.py +7 -1
  113. sglang/srt/model_loader/utils.py +4 -4
  114. sglang/srt/models/clip.py +1 -1
  115. sglang/srt/models/deepseek.py +9 -6
  116. sglang/srt/models/deepseek_janus_pro.py +1 -1
  117. sglang/srt/models/deepseek_v2.py +192 -173
  118. sglang/srt/models/deepseek_vl2.py +5 -5
  119. sglang/srt/models/gemma.py +48 -0
  120. sglang/srt/models/gemma2.py +52 -0
  121. sglang/srt/models/gemma3_causal.py +63 -0
  122. sglang/srt/models/gemma3_mm.py +1 -1
  123. sglang/srt/models/gemma3n_mm.py +2 -4
  124. sglang/srt/models/granitemoe.py +385 -0
  125. sglang/srt/models/grok.py +9 -3
  126. sglang/srt/models/hunyuan.py +63 -16
  127. sglang/srt/models/internvl.py +1 -1
  128. sglang/srt/models/kimi_vl.py +1 -1
  129. sglang/srt/models/llama.py +41 -0
  130. sglang/srt/models/llama4.py +11 -11
  131. sglang/srt/models/llava.py +2 -2
  132. sglang/srt/models/llavavid.py +1 -1
  133. sglang/srt/models/minicpm.py +0 -2
  134. sglang/srt/models/minicpmo.py +3 -7
  135. sglang/srt/models/minicpmv.py +1 -1
  136. sglang/srt/models/mistral.py +1 -1
  137. sglang/srt/models/mixtral.py +9 -2
  138. sglang/srt/models/mllama.py +3 -5
  139. sglang/srt/models/mllama4.py +13 -6
  140. sglang/srt/models/olmoe.py +8 -5
  141. sglang/srt/models/persimmon.py +330 -0
  142. sglang/srt/models/phi.py +321 -0
  143. sglang/srt/models/phi4mm.py +44 -4
  144. sglang/srt/models/phi4mm_audio.py +1260 -0
  145. sglang/srt/models/phi4mm_utils.py +1917 -0
  146. sglang/srt/models/phimoe.py +9 -3
  147. sglang/srt/models/qwen.py +37 -0
  148. sglang/srt/models/qwen2.py +41 -0
  149. sglang/srt/models/qwen2_5_vl.py +4 -4
  150. sglang/srt/models/qwen2_audio.py +1 -1
  151. sglang/srt/models/qwen2_moe.py +53 -9
  152. sglang/srt/models/qwen2_vl.py +4 -4
  153. sglang/srt/models/qwen3.py +65 -1
  154. sglang/srt/models/qwen3_moe.py +57 -24
  155. sglang/srt/models/vila.py +1 -1
  156. sglang/srt/multimodal/processors/base_processor.py +91 -97
  157. sglang/srt/multimodal/processors/clip.py +21 -19
  158. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  159. sglang/srt/multimodal/processors/gemma3.py +13 -17
  160. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  161. sglang/srt/multimodal/processors/internvl.py +9 -10
  162. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  163. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  164. sglang/srt/multimodal/processors/llava.py +4 -2
  165. sglang/srt/multimodal/processors/minicpm.py +35 -44
  166. sglang/srt/multimodal/processors/mlama.py +21 -18
  167. sglang/srt/multimodal/processors/mllama4.py +4 -5
  168. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  169. sglang/srt/multimodal/processors/pixtral.py +14 -35
  170. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  171. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  172. sglang/srt/multimodal/processors/vila.py +14 -14
  173. sglang/srt/reasoning_parser.py +46 -4
  174. sglang/srt/sampling/sampling_batch_info.py +6 -5
  175. sglang/srt/sampling/sampling_params.py +8 -1
  176. sglang/srt/server_args.py +454 -270
  177. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  178. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
  179. sglang/srt/speculative/eagle_utils.py +51 -23
  180. sglang/srt/speculative/eagle_worker.py +59 -44
  181. sglang/srt/two_batch_overlap.py +10 -5
  182. sglang/srt/utils.py +44 -69
  183. sglang/test/runners.py +14 -3
  184. sglang/test/test_activation.py +50 -1
  185. sglang/test/test_block_fp8.py +8 -3
  186. sglang/test/test_block_fp8_ep.py +1 -1
  187. sglang/test/test_custom_ops.py +12 -7
  188. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  189. sglang/test/test_fp4_moe.py +1 -3
  190. sglang/test/test_marlin_moe.py +286 -0
  191. sglang/test/test_marlin_utils.py +171 -0
  192. sglang/test/test_utils.py +35 -0
  193. sglang/version.py +1 -1
  194. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
  195. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
  196. sglang/srt/layers/quantization/quant_utils.py +0 -166
  197. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  198. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
  199. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
  200. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/base_processor.py

@@ -5,8 +5,7 @@ import multiprocessing as mp
 import os
 import re
 from abc import ABC, abstractmethod
-from functools import lru_cache
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -22,7 +21,7 @@ class BaseMultiModalProcessorOutput:
     # input_text, with each frame of video/image represented with a image_token
     input_text: str
 
-    # frames loaded from image and video, in given order
+    # frames loaded from image, in given order
     images: Optional[list[Union[Image.Image, dict]]] = None
 
     # videos
@@ -45,14 +44,26 @@ class BaseMultiModalProcessorOutput:
 
 @dataclasses.dataclass
 class MultimodalSpecialTokens:
-    image_token: Optional[Union[int, str, List[str]]] = None
-    video_token: Optional[Union[int, str, List[str]]] = None
-    audio_token: Optional[Union[int, str, List[str]]] = None
+    image_token: Optional[Union[str, List[str]]] = None
+    video_token: Optional[Union[str, List[str]]] = None
+    audio_token: Optional[Union[str, List[str]]] = None
+
+    image_token_id: Optional[int] = None
+    video_token_id: Optional[int] = None
+    audio_token_id: Optional[int] = None
 
     image_token_regex: Optional[re.Pattern] = None
     video_token_regex: Optional[re.Pattern] = None
     audio_token_regex: Optional[re.Pattern] = None
 
+    combined_regex: Optional[re.Pattern] = None
+
+    def build(self, processor):
+        self.convert_to_strs(processor)
+        self.parse_regex()
+        self.get_combined_regex()
+        return self
+
     def convert_to_str(self, token: Union[str, int], processor) -> str:
         if token is None:
             return token
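
This hunk replaces ad-hoc per-processor token bookkeeping with a build-once pattern: a processor constructs its MultimodalSpecialTokens a single time in __init__, and build() resolves token strings, compiles the per-modality regexes, caches the combined pattern, and returns self for chaining. A minimal sketch of the call site this enables (the id below is made up; real processors read it from hf_config or the processor):

    mm_tokens = MultimodalSpecialTokens(
        image_token="<image>",  # literal special token expected in the prompt
        image_token_id=42,      # hypothetical id; real code takes it from hf_config
    ).build(processor)          # resolve strings, compile and cache the regexes
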
@@ -61,11 +72,14 @@ class MultimodalSpecialTokens:
         return processor.tokenizer.convert_ids_to_tokens([token])[0]
 
     def convert_to_strs(self, processor):
-        self.image_token = self.convert_to_str(self.image_token, processor)
-        self.video_token = self.convert_to_str(self.video_token, processor)
-        self.audio_token = self.convert_to_str(self.audio_token, processor)
-
-    def get_modality_of_token(self, token) -> Optional[Modality]:
+        if not self.image_token:
+            self.image_token = self.convert_to_str(self.image_token_id, processor)
+        if not self.video_token:
+            self.video_token = self.convert_to_str(self.video_token_id, processor)
+        if not self.audio_token:
+            self.audio_token = self.convert_to_str(self.audio_token_id, processor)
+
+    def get_modality_of_token(self, token: str) -> Optional[Modality]:
         """
         :return: the modality associated with the given token, if the token is a special_token or matches with the multimodal token regex
         """
@@ -87,6 +101,14 @@ class MultimodalSpecialTokens:
 
         return None
 
+    def get_token_id_by_modality(self, modality: Modality) -> Optional[int]:
+        return {
+            Modality.IMAGE: self.image_token_id,
+            Modality.MULTI_IMAGES: self.image_token_id,
+            Modality.VIDEO: self.video_token_id,
+            Modality.AUDIO: self.audio_token_id,
+        }.get(modality)
+
     def parse_regex(self):
         if self.image_token_regex is None and self.image_token is not None:
             self.image_token_regex = re.compile(re.escape(self.image_token))
@@ -95,7 +117,12 @@ class MultimodalSpecialTokens:
         if self.audio_token_regex is None and self.audio_token is not None:
             self.audio_token_regex = re.compile(re.escape(self.audio_token))
 
-    def combine_regex(self) -> re.Pattern:
+    def get_combined_regex(self) -> re.Pattern:
+        """
+        Builds and returns a regex, used to split input str into tokens (with mm special tokens)
+        """
+        if self.combined_regex:
+            return self.combined_regex
         tokens = [
             self.image_token_regex,
             self.video_token_regex,
@@ -108,7 +135,8 @@ class MultimodalSpecialTokens:
             patterns.append(t.pattern)
             flags |= t.flags
         combined = "(" + "|".join(f"(?:{p})" for p in patterns) + ")"
-        return re.compile(combined, flags)
+        self.combined_regex = re.compile(combined, flags)
+        return self.combined_regex
 
 
 class BaseMultimodalProcessor(ABC):
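
Worth noting why the whole alternation is wrapped in a single capturing group: re.split keeps the delimiter when the pattern captures, so splitting a prompt on the combined regex yields the plain-text parts and the multimodal tokens, interleaved in order. A self-contained sketch of that behavior (the token strings are examples, not tied to any model):

    import re

    patterns = [re.escape("<image>"), re.escape("<audio>")]
    combined = re.compile("(" + "|".join(f"(?:{p})" for p in patterns) + ")")
    parts = [p for p in combined.split("look at <image> then hear <audio>") if p]
    # ['look at ', '<image>', ' then hear ', '<audio>']
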
@@ -135,27 +163,33 @@ class BaseMultimodalProcessor(ABC):
         self.ATTR_NAME_TO_MODALITY = {
             # Image-related attributes
             "pixel_values": Modality.IMAGE,
-            "pixel_values_videos": Modality.VIDEO,
             "image_sizes": Modality.IMAGE,
             "image_grid_thw": Modality.IMAGE,
+            "image_attention_mask": Modality.IMAGE,
             "image_emb_mask": Modality.IMAGE,
-            "image_spatial_crop": Modality.IMAGE,
+            "images_spatial_crop": Modality.IMAGE,
             "tgt_size": Modality.IMAGE,
             "image_grid_hws": Modality.IMAGE,
-            "aspect_ratio_id": Modality.IMAGE,
+            "aspect_ratio_ids": Modality.IMAGE,
             "aspect_ratio_mask": Modality.IMAGE,
-            "second_per_grid_ts": Modality.IMAGE,
             # Audio-related attributes
             "audio_features": Modality.AUDIO,
             "audio_feature_lens": Modality.AUDIO,
             "input_features": Modality.AUDIO,
             "input_features_mask": Modality.AUDIO,
+            "audio_attention_mask": Modality.AUDIO,
             # Video-related attributes
+            "pixel_values_videos": Modality.VIDEO,
+            "second_per_grid_ts": Modality.VIDEO,
             "video_grid_thw": Modality.VIDEO,
             # Generic attributes that could apply to multiple modalities
-            # "precomputed_features" - handled specially as it can be any modality
+            # "precomputed_embeddings" - handled specially as it can be any modality
         }
 
+        # name of the feature filed
+        # TODO: pass from processors
+        self.FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features"]
+
     def process_mm_data(
         self, input_text, images=None, videos=None, audios=None, **kwargs
     ):
@@ -196,7 +230,6 @@ class BaseMultimodalProcessor(ABC):
         audio_data,
         input_text,
         request_obj,
-        max_req_input_len,
         **kwargs,
     ) -> Optional[Dict[str, Any]]:
         pass
@@ -227,7 +260,11 @@ class BaseMultimodalProcessor(ABC):
 
     @staticmethod
     def _load_single_item(
-        data, modality: Modality, frame_count_limit=None, discard_alpha_channel=True
+        data,
+        modality: Modality,
+        frame_count_limit=None,
+        audio_sample_rate: Optional[int] = None,
+        discard_alpha_channel=True,
     ):
         """
         Load a single multimodal data.
@@ -244,7 +281,7 @@ class BaseMultimodalProcessor(ABC):
             elif modality == Modality.VIDEO:
                 return load_video(data, frame_count_limit)
             elif modality == Modality.AUDIO:
-                return load_audio(data)
+                return load_audio(data, audio_sample_rate)
 
         except Exception as e:
             raise RuntimeError(f"Error while loading data {data}: {e}")
@@ -253,11 +290,12 @@ class BaseMultimodalProcessor(ABC):
         self,
         text_parts: List[str],
         multimodal_tokens: MultimodalSpecialTokens,
-        data_iterators: dict,
+        data_iterators: dict[Modality, Iterator[Any]],
         discard_alpha_channel: bool = True,
         image_estimated_frames_iter: Optional[iter] = None,
         image_scaling_factor: float = 1.0,
         max_image_frames: int = 30,
+        audio_sample_rate: Optional[int] = None,
     ) -> Tuple[List, List]:
         """
         load multimodal data parallelly using iterators.
@@ -300,6 +338,7 @@ class BaseMultimodalProcessor(ABC):
                         data,
                         modality,
                         frame_count_limit,
+                        audio_sample_rate,
                         discard_alpha_channel,
                     )
                 )
@@ -322,12 +361,12 @@ class BaseMultimodalProcessor(ABC):
         self,
         prompt: str,
         multimodal_tokens: MultimodalSpecialTokens,
-        max_req_input_len: int,
         image_data: Optional[list] = None,
         video_data: Optional[list] = None,
         audio_data: Optional[list] = None,
         return_text: Optional[bool] = True,
         discard_alpha_channel: bool = True,
+        audio_sample_rate: Optional[int] = None,
     ) -> BaseMultiModalProcessorOutput:
         """
         Each frame of video/image will be replaced by a single image token
@@ -338,9 +377,8 @@ class BaseMultimodalProcessor(ABC):
             discard_alpha_channel: if True, discards the alpha channel in the returned images
 
         """
-        multimodal_tokens.convert_to_strs(self._processor)
-        multimodal_tokens.parse_regex()
-        multimodal_tokens_pattern = multimodal_tokens.combine_regex()
+        multimodal_tokens_pattern = multimodal_tokens.get_combined_regex()
+
         if isinstance(prompt, list) and return_text:
             assert len(prompt) and isinstance(prompt[0], int)
             prompt = self._processor.tokenizer.decode(prompt)
@@ -367,6 +405,7 @@ class BaseMultimodalProcessor(ABC):
             multimodal_tokens=multimodal_tokens,
             data_iterators=data_iterators,
             discard_alpha_channel=discard_alpha_channel,
+            audio_sample_rate=audio_sample_rate,
         )
         task_info_iter = iter(task_info)
         futures_iter = iter(futures)
@@ -442,7 +481,6 @@ class BaseMultimodalProcessor(ABC):
         return result = [(2,4),(6,7)]
         """
         mask = input_ids == mm_token_id
-
         start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
         end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
 
@@ -457,50 +495,11 @@ class BaseMultimodalProcessor(ABC):
 
         return list(zip(indices_start.tolist(), indices_end.tolist()))
 
-    @staticmethod
-    def _extract_processor_features(
-        items: List[dict], attr_name: str
-    ) -> Optional[torch.Tensor]:
-        """
-        Helper function to concat extracted attributes from processor output.
-        """
-        values = [value for item in items if (value := item.get(attr_name)) is not None]
-        return torch.cat(values) if values else None
-
-    # When we assume that all the items have the same attributes
-    def _extract_processor_features_from_all_attributes(
-        self, items: List[dict]
-    ) -> dict:
-        values = {}
-        # Verify all items have the same keys
-        first_keys = set(items[0].keys())
-        for item in items[1:]:
-            if set(item.keys()) != first_keys:
-                raise ValueError(
-                    f"All items must have the same attributes. "
-                    f"First item has {first_keys}, but found {set(item.keys())}"
-                )
-
-        # Process each attribute
-        for k, v in items[0].items():
-            if isinstance(v, list):
-                values[k] = self._extract_processor_features(items, k)
-            else:
-                # Verify all items have the same value for non-list attributes
-                for item in items[1:]:
-                    if item[k] != v:
-                        raise ValueError(
-                            f"All items must have the same value for attribute {k}. "
-                            f"First item has {v}, but found {item[k]}"
-                        )
-                values[k] = v
-        return values
-
     def collect_mm_items_from_processor_output(
         self, data_dict: dict
     ) -> List[MultimodalDataItem]:
         """Create mm_items directly from processor output."""
-        items = {}  # modality -> MultimodalDataItem
+        items: dict[Modality, MultimodalDataItem] = {}
 
         for attr_name, value in data_dict.items():
             if attr_name == "input_ids":
@@ -509,23 +508,24 @@ class BaseMultimodalProcessor(ABC):
             # Get modality for this attribute
             modality = self.ATTR_NAME_TO_MODALITY.get(attr_name)
 
-            if not modality and attr_name == "precomputed_features":
+            if attr_name == "precomputed_embeddings":
                 modality_str = data_dict.get("modality")
-                try:
-                    modality = (
-                        Modality.from_str(modality_str)
-                        if modality_str
-                        else Modality.IMAGE
-                    )
-                except ValueError:
-                    modality = Modality.IMAGE
+                modality = Modality.IMAGE
+                if modality_str:
+                    try:
+                        modality = Modality.from_str(modality_str)
+                    except ValueError:
+                        pass
+
             if modality:
                 # Create item if needed
                 if modality not in items:
                     items[modality] = MultimodalDataItem(modality=modality)
 
-                # Set attribute
-                setattr(items[modality], attr_name, value)
+                if attr_name in self.FEATURE_NAMES:
+                    attr_name = "feature"
+
+                items[modality].set(attr_name, value)
 
         return list(items.values())
 
@@ -548,7 +548,10 @@ class BaseMultimodalProcessor(ABC):
         return collected_items, input_ids, ret
 
     def process_and_combine_mm_data(
-        self, base_output: BaseMultiModalProcessorOutput
+        self,
+        base_output: BaseMultiModalProcessorOutput,
+        mm_tokens: MultimodalSpecialTokens,
+        **kwargs,
    ) -> Tuple[List[MultimodalDataItem], torch.Tensor, dict]:
         """
         Process multimodal data and return the combined multimodal items and input_ids.
@@ -581,7 +584,7 @@ class BaseMultimodalProcessor(ABC):
             else:
                 raise ValueError(f"Unknown multimodal item type: {type(item)}")
         # Process items and get input_ids
-        all_collected_items = []
+        all_collected_items: list[MultimodalDataItem] = []
         input_ids = None
 
         # Handle dict items (already processed)
@@ -597,6 +600,7 @@ class BaseMultimodalProcessor(ABC):
                 images=raw_images,
                 audios=raw_audios,
                 videos=raw_videos,
+                **kwargs,
             )
             all_collected_items.extend(collected_items)
         else:
@@ -612,22 +616,12 @@ class BaseMultimodalProcessor(ABC):
 
         # Add offsets to all items
         for mm_item in all_collected_items:
-            if mm_item.modality in [Modality.IMAGE, Modality.MULTI_IMAGES]:
-                mm_item.offsets = self.get_mm_items_offset(
-                    input_ids=input_ids,
-                    mm_token_id=self.IM_TOKEN_ID,
-                )
-            elif mm_item.modality == Modality.AUDIO:
-                mm_item.offsets = self.get_mm_items_offset(
-                    input_ids=input_ids,
-                    mm_token_id=self.AUDIO_TOKEN_ID,
-                )
-            elif mm_item.modality == Modality.VIDEO:
-                mm_item.offsets = self.get_mm_items_offset(
-                    input_ids=input_ids,
-                    mm_token_id=self.VIDEO_TOKEN_ID,
-                )
-            else:
-                raise ValueError(f"Unknown modality: {mm_item.modality}")
+            mm_token_id = mm_tokens.get_token_id_by_modality(mm_item.modality)
+            if mm_token_id is None:
+                raise ValueError(f"No token id found for modality: {mm_item.modality}")
+            mm_item.offsets = self.get_mm_items_offset(
+                input_ids=input_ids,
+                mm_token_id=mm_token_id,
+            )
 
         return all_collected_items, input_ids, ret
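
The offset computation that this generalized loop now feeds via mm_tokens.get_token_id_by_modality() is the torch.roll trick from get_mm_items_offset above: a run of multimodal tokens starts where the mask is true but its left neighbor is false, and ends where the mask is true but its right neighbor is false. A quick standalone check of that logic (token id 9 is arbitrary):

    import torch

    input_ids = torch.tensor([1, 9, 9, 9, 2, 9, 9, 3])
    mask = input_ids == 9
    starts = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
    ends = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
    # note: torch.roll wraps around; harmless here since the sequence
    # neither starts nor ends with token 9
    print(list(zip(starts.tolist(), ends.tolist())))  # [(1, 3), (5, 6)]
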
sglang/srt/multimodal/processors/clip.py

@@ -1,9 +1,10 @@
 from typing import List, Union
 
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.clip import CLIPModel
-from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
-from sglang.srt.utils import load_image
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
 
 
 class ClipImageProcessor(BaseMultimodalProcessor):
@@ -11,23 +12,24 @@ class ClipImageProcessor(BaseMultimodalProcessor):
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        self.mm_tokens = MultimodalSpecialTokens(image_token="<image>").build(
+            _processor
+        )
 
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
-        if isinstance(input_text, list):
-            assert len(input_text) and isinstance(input_text[0], int)
-            input_text = self._processor.tokenizer.decode(input_text)
-
-        images = [load_image(image)[0] for image in image_data]
-
-        image_inputs = self.process_mm_data(input_text=input_text, images=images)
-        image_inputs["data_hashes"] = [hash(str(image_data))]
-        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
-        image_inputs["mm_items"] = [
-            MultimodalDataItem(
-                pixel_values=image_inputs["pixel_values"], modality=Modality.IMAGE
-            )
-        ]
-
-        return image_inputs
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=self.mm_tokens,
+            image_data=image_data,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+        }
sglang/srt/multimodal/processors/deepseek_vl_v2.py

@@ -33,7 +33,9 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        self.IMAGE_TOKEN = "<image>"
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<image>", image_token_id=self._processor.image_token_id
+        ).build(_processor)
 
     async def process_mm_data_async(
         self,
@@ -47,37 +49,17 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         base_output = self.load_mm_data(
             input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMAGE_TOKEN),
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
         )
-        res = self.process_mm_data(
-            input_text=base_output.input_text,
-            images=base_output.images,
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output,
+            self.mm_tokens,
             max_req_input_len=max_req_input_len,
             conversations=base_output.input_text,
         )
-        images_seq_mask = res["images_seq_mask"]
-        images_spatial_crop = res["images_spatial_crop"]
-        batched_images_spatial_crop = []
-        batched_images_spatial_crop.append(images_spatial_crop)
-        batched_images_spatial_crop = torch.stack(batched_images_spatial_crop, dim=0)
-
-        items = []
-        input_ids = res["input_ids"]
-        image_offsets = self.get_mm_items_offset(
-            input_ids=input_ids, mm_token_id=self._processor.image_token_id
-        )
-        item = MultimodalDataItem(
-            pixel_values=res["images"],
-            offsets=image_offsets,
-            modality=Modality.IMAGE,
-            image_emb_mask=images_seq_mask,
-            image_spatial_crop=batched_images_spatial_crop,
-        )
-        items += [item]
 
         return {
-            "mm_items": items,
+            "mm_items": mm_items,
             "input_ids": input_ids.tolist(),
             "im_token_id": self._processor.image_token_id,
         }
sglang/srt/multimodal/processors/gemma3.py

@@ -4,7 +4,6 @@ from typing import Dict, List, Union
 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
@@ -17,39 +16,36 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        # The single, pre-expanded image token.
-        self.IMAGE_TOKEN = "<start_of_image>"
-        # The regex that matches expanded image tokens.
-        self.IMAGE_TOKEN_REGEX = re.compile(
-            r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
-        )
         self.IM_START_TOKEN_ID = hf_config.boi_token_index
         self.IM_END_TOKEN_ID = hf_config.eoi_token_index
-        self.IM_TOKEN_ID = hf_config.image_token_index
+        self.mm_tokens = MultimodalSpecialTokens(
+            # The single, pre-expanded image token.
+            image_token="<start_of_image>",
+            image_token_id=hf_config.image_token_index,
+            # The regex that matches expanded image tokens.
+            image_token_regex=re.compile(
+                r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+            ),
+        ).build(_processor)
 
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
-        max_req_input_len,
         *args,
         **kwargs,
     ):
-        print(f"{image_data=}")
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
-            ),
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
             discard_alpha_channel=True,
         )
 
-        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
-        print(f"{base_output=}")
-        print(f"{mm_items=}")
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
         return {
             "input_ids": input_ids.tolist(),
             "mm_items": mm_items,
sglang/srt/multimodal/processors/gemma3n.py

@@ -30,23 +30,23 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
 
-        self.IMAGE_TOKEN = "<image_soft_token>"
-        self.IMAGE_TOKEN_REGEX = re.compile(
-            r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
-        )
-
-        self.AUDIO_TOKEN = "<audio_soft_token>"
-        self.AUDIO_TOKEN_REGEX = re.compile(
-            r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
-        )
-
-        self.IM_TOKEN_ID = hf_config.image_token_id
         self.IM_START_TOKEN_ID = hf_config.boi_token_id
         self.IM_END_TOKEN_ID = hf_config.eoi_token_id
 
-        self.AUDIO_TOKEN_ID = hf_config.audio_token_id
         self.AUDIO_START_TOKEN_ID = hf_config.boa_token_id
         self.AUDIO_END_TOKEN_ID = hf_config.eoa_token_id
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<image_soft_token>",
+            image_token_id=hf_config.image_token_id,
+            image_token_regex=re.compile(
+                r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+            ),
+            audio_token="<audio_soft_token>",
+            audio_token_id=hf_config.audio_token_id,
+            audio_token_regex=re.compile(
+                r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
+            ),
+        ).build(_processor)
 
     async def process_mm_data_async(
         self,
@@ -54,7 +54,6 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
         audio_data: Optional[List[Union[str, bytes, Dict]]] = None,
         input_text: str = "",
         request_obj=None,
-        max_req_input_len: int = 0,
         *args,
         **kwargs,
     ):
@@ -63,20 +62,17 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
             prompt=input_text,
             image_data=image_data,
             audio_data=audio_data,
-            max_req_input_len=max_req_input_len,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token=self.IMAGE_TOKEN,
-                image_token_regex=self.IMAGE_TOKEN_REGEX,
-                audio_token=self.AUDIO_TOKEN,
-                audio_token_regex=self.AUDIO_TOKEN_REGEX,
-            ),
+            multimodal_tokens=self.mm_tokens,
         )
 
-        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
 
         return {
            "input_ids": input_ids.tolist(),
            "mm_items": mm_items,
-           "im_token_id": self.IM_TOKEN_ID,
-           "audio_token_id": self.AUDIO_TOKEN_ID,
+           # TODO(mick): could we return MultimodalSpecialTokens directly?
+           "im_token_id": self.mm_tokens.image_token_id,
+           "audio_token_id": self.mm_tokens.audio_token_id,
         }