sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/_custom_ops.py +29 -1
  4. sglang/srt/configs/deepseekvl2.py +11 -2
  5. sglang/srt/configs/internvl.py +3 -0
  6. sglang/srt/configs/janus_pro.py +3 -0
  7. sglang/srt/configs/model_config.py +10 -8
  8. sglang/srt/configs/update_config.py +3 -1
  9. sglang/srt/conversation.py +2 -1
  10. sglang/srt/custom_op.py +5 -2
  11. sglang/srt/disaggregation/common/conn.py +34 -6
  12. sglang/srt/disaggregation/decode.py +9 -1
  13. sglang/srt/disaggregation/mini_lb.py +3 -2
  14. sglang/srt/disaggregation/mooncake/conn.py +93 -76
  15. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  16. sglang/srt/disaggregation/nixl/conn.py +17 -13
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  18. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  19. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  20. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  21. sglang/srt/distributed/parallel_state.py +103 -15
  22. sglang/srt/entrypoints/engine.py +31 -33
  23. sglang/srt/entrypoints/http_server.py +20 -32
  24. sglang/srt/entrypoints/openai/protocol.py +3 -3
  25. sglang/srt/entrypoints/openai/serving_chat.py +48 -6
  26. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  27. sglang/srt/function_call/base_format_detector.py +74 -12
  28. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  29. sglang/srt/function_call/ebnf_composer.py +95 -63
  30. sglang/srt/function_call/function_call_parser.py +4 -2
  31. sglang/srt/function_call/kimik2_detector.py +41 -16
  32. sglang/srt/function_call/llama32_detector.py +6 -3
  33. sglang/srt/function_call/mistral_detector.py +11 -3
  34. sglang/srt/function_call/pythonic_detector.py +16 -14
  35. sglang/srt/function_call/qwen25_detector.py +12 -3
  36. sglang/srt/function_call/qwen3_coder_detector.py +151 -0
  37. sglang/srt/hf_transformers_utils.py +0 -1
  38. sglang/srt/layers/activation.py +24 -3
  39. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  41. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  42. sglang/srt/layers/communicator.py +12 -12
  43. sglang/srt/layers/dp_attention.py +72 -24
  44. sglang/srt/layers/linear.py +13 -102
  45. sglang/srt/layers/logits_processor.py +34 -24
  46. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  47. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  49. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
  57. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  58. sglang/srt/layers/moe/topk.py +190 -23
  59. sglang/srt/layers/quantization/__init__.py +20 -134
  60. sglang/srt/layers/quantization/awq.py +578 -11
  61. sglang/srt/layers/quantization/awq_triton.py +339 -0
  62. sglang/srt/layers/quantization/base_config.py +85 -10
  63. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  64. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  65. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
  66. sglang/srt/layers/quantization/fp8.py +273 -62
  67. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  68. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  69. sglang/srt/layers/quantization/gptq.py +501 -143
  70. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  71. sglang/srt/layers/quantization/modelopt_quant.py +34 -112
  72. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  73. sglang/srt/layers/quantization/petit.py +252 -0
  74. sglang/srt/layers/quantization/petit_utils.py +104 -0
  75. sglang/srt/layers/quantization/qoq.py +7 -6
  76. sglang/srt/layers/quantization/scalar_type.py +352 -0
  77. sglang/srt/layers/quantization/unquant.py +422 -0
  78. sglang/srt/layers/quantization/utils.py +340 -9
  79. sglang/srt/layers/quantization/w4afp8.py +8 -4
  80. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  81. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  82. sglang/srt/layers/radix_attention.py +5 -3
  83. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  84. sglang/srt/lora/lora.py +0 -4
  85. sglang/srt/lora/lora_manager.py +162 -164
  86. sglang/srt/lora/lora_registry.py +124 -0
  87. sglang/srt/lora/mem_pool.py +83 -35
  88. sglang/srt/lora/utils.py +12 -5
  89. sglang/srt/managers/cache_controller.py +288 -0
  90. sglang/srt/managers/io_struct.py +60 -30
  91. sglang/srt/managers/mm_utils.py +7 -8
  92. sglang/srt/managers/schedule_batch.py +163 -113
  93. sglang/srt/managers/schedule_policy.py +68 -27
  94. sglang/srt/managers/scheduler.py +256 -86
  95. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  96. sglang/srt/managers/tokenizer_manager.py +38 -27
  97. sglang/srt/managers/tp_worker.py +16 -4
  98. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  99. sglang/srt/mem_cache/allocator.py +74 -23
  100. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  101. sglang/srt/mem_cache/chunk_cache.py +5 -2
  102. sglang/srt/mem_cache/hicache_storage.py +168 -0
  103. sglang/srt/mem_cache/hiradix_cache.py +194 -5
  104. sglang/srt/mem_cache/memory_pool.py +16 -1
  105. sglang/srt/mem_cache/memory_pool_host.py +44 -2
  106. sglang/srt/mem_cache/radix_cache.py +26 -0
  107. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  108. sglang/srt/metrics/collector.py +9 -0
  109. sglang/srt/model_executor/cuda_graph_runner.py +66 -31
  110. sglang/srt/model_executor/forward_batch_info.py +210 -25
  111. sglang/srt/model_executor/model_runner.py +147 -42
  112. sglang/srt/model_loader/loader.py +7 -1
  113. sglang/srt/model_loader/utils.py +4 -4
  114. sglang/srt/models/clip.py +1 -1
  115. sglang/srt/models/deepseek.py +9 -6
  116. sglang/srt/models/deepseek_janus_pro.py +1 -1
  117. sglang/srt/models/deepseek_v2.py +192 -173
  118. sglang/srt/models/deepseek_vl2.py +5 -5
  119. sglang/srt/models/gemma.py +48 -0
  120. sglang/srt/models/gemma2.py +52 -0
  121. sglang/srt/models/gemma3_causal.py +63 -0
  122. sglang/srt/models/gemma3_mm.py +1 -1
  123. sglang/srt/models/gemma3n_mm.py +2 -4
  124. sglang/srt/models/granitemoe.py +385 -0
  125. sglang/srt/models/grok.py +9 -3
  126. sglang/srt/models/hunyuan.py +63 -16
  127. sglang/srt/models/internvl.py +1 -1
  128. sglang/srt/models/kimi_vl.py +1 -1
  129. sglang/srt/models/llama.py +41 -0
  130. sglang/srt/models/llama4.py +11 -11
  131. sglang/srt/models/llava.py +2 -2
  132. sglang/srt/models/llavavid.py +1 -1
  133. sglang/srt/models/minicpm.py +0 -2
  134. sglang/srt/models/minicpmo.py +3 -7
  135. sglang/srt/models/minicpmv.py +1 -1
  136. sglang/srt/models/mistral.py +1 -1
  137. sglang/srt/models/mixtral.py +9 -2
  138. sglang/srt/models/mllama.py +3 -5
  139. sglang/srt/models/mllama4.py +13 -6
  140. sglang/srt/models/olmoe.py +8 -5
  141. sglang/srt/models/persimmon.py +330 -0
  142. sglang/srt/models/phi.py +321 -0
  143. sglang/srt/models/phi4mm.py +44 -4
  144. sglang/srt/models/phi4mm_audio.py +1260 -0
  145. sglang/srt/models/phi4mm_utils.py +1917 -0
  146. sglang/srt/models/phimoe.py +9 -3
  147. sglang/srt/models/qwen.py +37 -0
  148. sglang/srt/models/qwen2.py +41 -0
  149. sglang/srt/models/qwen2_5_vl.py +4 -4
  150. sglang/srt/models/qwen2_audio.py +1 -1
  151. sglang/srt/models/qwen2_moe.py +53 -9
  152. sglang/srt/models/qwen2_vl.py +4 -4
  153. sglang/srt/models/qwen3.py +65 -1
  154. sglang/srt/models/qwen3_moe.py +57 -24
  155. sglang/srt/models/vila.py +1 -1
  156. sglang/srt/multimodal/processors/base_processor.py +91 -97
  157. sglang/srt/multimodal/processors/clip.py +21 -19
  158. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  159. sglang/srt/multimodal/processors/gemma3.py +13 -17
  160. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  161. sglang/srt/multimodal/processors/internvl.py +9 -10
  162. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  163. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  164. sglang/srt/multimodal/processors/llava.py +4 -2
  165. sglang/srt/multimodal/processors/minicpm.py +35 -44
  166. sglang/srt/multimodal/processors/mlama.py +21 -18
  167. sglang/srt/multimodal/processors/mllama4.py +4 -5
  168. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  169. sglang/srt/multimodal/processors/pixtral.py +14 -35
  170. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  171. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  172. sglang/srt/multimodal/processors/vila.py +14 -14
  173. sglang/srt/reasoning_parser.py +46 -4
  174. sglang/srt/sampling/sampling_batch_info.py +6 -5
  175. sglang/srt/sampling/sampling_params.py +8 -1
  176. sglang/srt/server_args.py +454 -270
  177. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  178. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
  179. sglang/srt/speculative/eagle_utils.py +51 -23
  180. sglang/srt/speculative/eagle_worker.py +59 -44
  181. sglang/srt/two_batch_overlap.py +10 -5
  182. sglang/srt/utils.py +44 -69
  183. sglang/test/runners.py +14 -3
  184. sglang/test/test_activation.py +50 -1
  185. sglang/test/test_block_fp8.py +8 -3
  186. sglang/test/test_block_fp8_ep.py +1 -1
  187. sglang/test/test_custom_ops.py +12 -7
  188. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  189. sglang/test/test_fp4_moe.py +1 -3
  190. sglang/test/test_marlin_moe.py +286 -0
  191. sglang/test/test_marlin_utils.py +171 -0
  192. sglang/test/test_utils.py +35 -0
  193. sglang/version.py +1 -1
  194. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
  195. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
  196. sglang/srt/layers/quantization/quant_utils.py +0 -166
  197. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  198. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
  199. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
  200. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/internvl.py
@@ -24,7 +24,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         self.IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
         self.IMG_START_TOKEN = "<img>"
         self.IMG_END_TOKEN = "</img>"
-        self.IMG_TOKEN = "<image>"
         self.num_image_token = int(
             (image_size // patch_size) ** 2 * (hf_config.downsample_ratio**2)
         )
@@ -32,9 +31,10 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         tokenizer = self._processor
         self.img_start_token_id = tokenizer.convert_tokens_to_ids(self.IMG_START_TOKEN)
         self.img_end_token_id = tokenizer.convert_tokens_to_ids(self.IMG_END_TOKEN)
-        self.img_context_token_id = tokenizer.convert_tokens_to_ids(
-            self.IMG_CONTEXT_TOKEN
-        )
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<image>",
+            image_token_id=tokenizer.convert_tokens_to_ids(self.IMG_CONTEXT_TOKEN),
+        ).build(_image_processor)

     @staticmethod
     def build_transform(input_size):
@@ -170,13 +170,12 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         return pixel_values, num_patches_list

     async def process_mm_data_async(
-        self, image_data, input_text, request_obj, max_req_input_len, **kwargs
+        self, image_data, input_text, request_obj, **kwargs
     ):
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMG_TOKEN),
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
             discard_alpha_channel=True,
         )

@@ -219,11 +218,11 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].flatten()
         image_offsets = self.get_mm_items_offset(
             input_ids=input_ids,
-            mm_token_id=self.img_context_token_id,
+            mm_token_id=self.mm_tokens.image_token_id,
         )
         items = [
             MultimodalDataItem(
-                pixel_values=pixel_values,
+                feature=pixel_values,
                 modality=Modality.IMAGE,
                 offsets=image_offsets,
             )
@@ -234,5 +233,5 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             "mm_items": items,
             "im_start_id": self.img_start_token_id,
             "im_end_id": self.img_end_token_id,
-            "im_token_id": self.img_context_token_id,
+            "im_token_id": self.mm_tokens.image_token_id,
         }
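
Note: the InternVL hunks above follow the refactor pattern that recurs throughout this release: the special token and its id are resolved once in __init__ via MultimodalSpecialTokens(...).build(...), max_req_input_len is dropped from process_mm_data_async, and the combined output comes from process_and_combine_mm_data. Below is a minimal sketch of a processor written against the helpers used in these hunks; it is an illustration, not code from the package, and the class name, token string, and token-id lookup are placeholders.

from sglang.srt.multimodal.processors.base_processor import (
    BaseMultimodalProcessor,
    MultimodalSpecialTokens,
)


class MyVLProcessor(BaseMultimodalProcessor):
    # models would list the model class(es) this processor serves.
    models = []

    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)
        # Resolve the token string and id once, up front.
        self.mm_tokens = MultimodalSpecialTokens(
            image_token="<image>",
            image_token_id=_processor.tokenizer.convert_tokens_to_ids("<image>"),
        ).build(_processor)

    async def process_mm_data_async(self, image_data, input_text, request_obj, **kwargs):
        # max_req_input_len is no longer part of the signature.
        base_output = self.load_mm_data(
            prompt=input_text,
            image_data=image_data,
            multimodal_tokens=self.mm_tokens,
        )
        # Combine processor output into items plus expanded input ids.
        mm_items, input_ids, _ = self.process_and_combine_mm_data(
            base_output, self.mm_tokens
        )
        return {
            "mm_items": mm_items,
            "input_ids": input_ids.tolist(),
            "im_token_id": self.mm_tokens.image_token_id,
        }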
sglang/srt/multimodal/processors/janus_pro.py
@@ -14,47 +14,32 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)

+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token,
+            image_token_id=_processor.image_id,
+        ).build(_processor)
+
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
         input_text,
         request_obj,
-        max_req_input_len,
         **kwargs,
     ):
-        processor = self._processor
-
         base_out = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token=processor.image_token
-            ),
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
         )

-        images = base_out.images
-        res = self.process_mm_data(
-            input_text=base_out.input_text,
-            prompt=base_out.input_text,
-            images=images,
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_out, self.mm_tokens, prompt=base_out.input_text
         )

-        input_ids = res["input_ids"].flatten()
-        image_offsets = self.get_mm_items_offset(
-            input_ids=input_ids, mm_token_id=processor.image_id
-        )
         return {
-            "mm_items": [
-                MultimodalDataItem(
-                    pixel_values=res["pixel_values"],
-                    image_emb_mask=res["images_emb_mask"],
-                    offsets=image_offsets,
-                    modality=Modality.IMAGE,
-                )
-            ],
+            "mm_items": mm_items,
             "input_ids": input_ids.tolist(),
-            "im_start_id": processor.image_start_id,
-            "im_end_id": processor.image_end_id,
-            "im_token_id": processor.image_id,
+            "im_start_id": self._processor.image_start_id,
+            "im_end_id": self._processor.image_end_id,
+            "im_token_id": self.mm_tokens.image_token_id,
         }
sglang/srt/multimodal/processors/kimi_vl.py
@@ -1,9 +1,6 @@
 import re
-from typing import Any, Dict, List, Optional, Union
+from typing import Dict, List, Union

-import torch
-
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -17,32 +14,33 @@ class KimiVLImageProcessor(SGLangBaseProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        self.IMAGE_TOKEN = "<|media_pad|>"
-        self.IMAGE_TOKEN_REGEX = re.compile(r"(?:<\|media_pad\|>)+")
-        self.IM_TOKEN_ID = _processor.tokenizer.convert_tokens_to_ids(self.IMAGE_TOKEN)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<|media_pad|>",
+            # TODO: could we convert in MultimodalSpecialTokens?
+            image_token_id=hf_config.media_placeholder_token_id,
+            image_token_regex=re.compile(r"(?:<\|media_pad\|>)+"),
+        ).build(_processor)

     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
-        max_req_input_len,
         *args,
         **kwargs,
     ):
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
-            ),
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
         )

-        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )

         return {
             "input_ids": input_ids.tolist(),
             "mm_items": mm_items,
-            "im_token_id": self.IM_TOKEN_ID,
+            "im_token_id": self.mm_tokens.image_token_id,
         }
sglang/srt/multimodal/processors/llava.py
@@ -158,8 +158,10 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         return {
             "mm_items": [
                 MultimodalDataItem(
-                    pixel_values=pixel_values,
-                    image_sizes=image_sizes,
+                    feature=pixel_values,
+                    model_specific_data={
+                        "image_sizes": image_sizes,
+                    },
                     modality=modality,
                 )
             ],
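
Note: the Llava hunk shows the second recurring change in this release: MultimodalDataItem now carries its main tensor in a generic feature field, and model-specific tensors move into a model_specific_data dict instead of dedicated keyword arguments. A hedged sketch of building such an item from a processor output dict follows; the helper name build_image_item and the processor_output variable are illustrative, only the MultimodalDataItem fields come from the hunks above.

from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem


def build_image_item(processor_output, image_offsets):
    # feature replaces the old pixel_values kwarg; extra tensors ride along
    # in model_specific_data rather than as their own constructor arguments.
    return MultimodalDataItem(
        feature=processor_output["pixel_values"],
        model_specific_data={
            "image_sizes": processor_output["image_sizes"],
        },
        modality=Modality.IMAGE,
        offsets=image_offsets,
    )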
sglang/srt/multimodal/processors/minicpm.py
@@ -17,9 +17,22 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        self.image_token = "(<image>./</image>)"
-        self.audio_token = "(<audio>./</audio>)"
-        self.video_token = "(<video>./</video>)"
+        # Collect special token ids
+        tokenizer = self._processor.tokenizer
+        self.slice_start_id = getattr(tokenizer, "slice_start_id", None)
+        self.slice_end_id = getattr(tokenizer, "slice_end_id", None)
+        self.audio_start_id = getattr(tokenizer, "audio_start_id", None)
+        self.audio_end_id = getattr(tokenizer, "audio_end_id", None)
+        self.im_start_id = getattr(tokenizer, "im_start_id", None)
+        self.im_end_id = getattr(tokenizer, "im_end_id", None)
+        self.im_token_id = getattr(tokenizer, "unk_id", None)
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="(<image>./</image>)",
+            audio_token="(<audio>./</audio>)",
+            video_token="(<video>./</video>)",
+            image_token_id=self.im_token_id,
+        ).build(_processor)

     async def process_mm_data_async(
         self,
@@ -27,19 +40,13 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         audio_data: List[Union[str, bytes]],
         input_text,
         request_obj,
-        max_req_input_len,
         **kwargs,
     ):
         base_output = self.load_mm_data(
             prompt=input_text,
-            max_req_input_len=max_req_input_len,
             audio_data=audio_data,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token=self.image_token,
-                video_token=self.video_token,
-                audio_token=self.audio_token,
-            ),
+            multimodal_tokens=self.mm_tokens,
         )
         if base_output is None:
             return None
@@ -50,24 +57,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             audios=base_output.audios,
         )

-        # Collect special token ids
-        tokenizer = self._processor.tokenizer
-        slice_start_id, slice_end_id, audio_start_id, audio_end_id = (
-            None,
-            None,
-            None,
-            None,
-        )
-        if tokenizer.slice_start_id:
-            slice_start_id = tokenizer.slice_start_id
-            slice_end_id = tokenizer.slice_end_id
-        if hasattr(tokenizer, "audio_start_id"):
-            audio_start_id = tokenizer.audio_start_id
-            audio_end_id = tokenizer.audio_end_id
-
-        im_start_id = tokenizer.im_start_id
-        im_end_id = tokenizer.im_end_id
-        im_token_id = tokenizer.unk_id
         pixel_values = res["pixel_values"]
         tgt_sizes = res["tgt_sizes"]

@@ -104,19 +93,21 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         items = []
         input_ids = res["input_ids"].flatten()
         image_offsets = self.get_mm_items_offset_by_pair(
-            input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+            input_ids=input_ids, mm_start_id=self.im_start_id, mm_end_id=self.im_end_id
         )
         slice_offsets = self.get_mm_items_offset_by_pair(
-            input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
+            input_ids=input_ids,
+            mm_start_id=self.slice_start_id,
+            mm_end_id=self.slice_end_id,
         )
         image_offsets.extend(slice_offsets)
         image_offsets = sorted(image_offsets)

         if len(pixel_values) != 0:
             item = MultimodalDataItem(
-                pixel_values=pixel_values,
+                feature=pixel_values,
                 offsets=image_offsets,
-                tgt_size=tgt_sizes_flat,
+                model_specific_data={"tgt_size": tgt_sizes_flat},
                 modality=Modality.IMAGE,
             )
             items += [item]
@@ -126,17 +117,17 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             and res["audio_features"] is not None
             and len(res["audio_features"]) != 0
         ):
-            if audio_start_id is not None and audio_end_id is not None:
+            if self.audio_start_id is not None and self.audio_end_id is not None:
                 audio_offsets = self.get_mm_items_offset_by_pair(
                     input_ids=input_ids,
-                    mm_start_id=audio_start_id,
-                    mm_end_id=audio_end_id,
+                    mm_start_id=self.audio_start_id,
+                    mm_end_id=self.audio_end_id,
                 )
             else:
                 audio_offsets = None
             item = MultimodalDataItem(
-                audio_features=[res["audio_features"]],
-                audio_feature_lens=res["audio_feature_lens"],
+                feature=[res["audio_features"]],
+                model_specific_data={"audio_feature_lens": res["audio_feature_lens"]},
                 offsets=audio_offsets,
                 modality=Modality.AUDIO,
             )
@@ -144,11 +135,11 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         return {
             "mm_items": items,
             "input_ids": input_ids.tolist(),
-            "audio_start_id": audio_start_id,
-            "audio_end_id": audio_end_id,
-            "im_token_id": im_token_id,
-            "im_start_id": im_start_id,
-            "im_end_id": im_end_id,
-            "slice_start_id": slice_start_id,
-            "slice_end_id": slice_end_id,
+            "audio_start_id": self.audio_start_id,
+            "audio_end_id": self.audio_end_id,
+            "im_token_id": self.im_token_id,
+            "im_start_id": self.im_start_id,
+            "im_end_id": self.im_end_id,
+            "slice_start_id": self.slice_start_id,
+            "slice_end_id": self.slice_end_id,
         }
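
Note: the MiniCPM hunks locate image, slice, and audio spans with get_mm_items_offset_by_pair, using start/end token ids that are now collected once in __init__ with getattr defaults. The snippet below is a standalone approximation of what a pair-based offset helper computes, added purely for illustration; the real sglang helper's exact return format may differ.

from typing import List, Tuple


def offsets_by_pair(input_ids: List[int], start_id: int, end_id: int) -> List[Tuple[int, int]]:
    """Return (start, end) index pairs for spans bracketed by start_id/end_id."""
    offsets = []
    start = None
    for i, tok in enumerate(input_ids):
        if tok == start_id:
            start = i
        elif tok == end_id and start is not None:
            offsets.append((start, i))
            start = None
    return offsets


# Example: ids 7 and 8 bracket one multimodal span at positions 1..4.
print(offsets_by_pair([1, 7, 5, 5, 8, 2], start_id=7, end_id=8))  # [(1, 4)]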
sglang/srt/multimodal/processors/mlama.py
@@ -1,9 +1,10 @@
 from typing import List, Union

-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
-from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
-from sglang.srt.utils import load_image
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)


 class MllamaImageProcessor(BaseMultimodalProcessor):
@@ -11,24 +12,26 @@ class MllamaImageProcessor(BaseMultimodalProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self._processor.image_token,
+            image_token_id=self._processor.image_token_id,
+        ).build(_processor)

     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
-        if isinstance(input_text, list):
-            assert len(input_text) and isinstance(input_text[0], int)
-            input_text = self._processor.tokenizer.decode(input_text)
+        base_out = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )

-        images = [load_image(image)[0] for image in image_data]
-        image_inputs = self.process_mm_data(input_text=input_text, images=images)
-        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
-        image_inputs["mm_items"] = [
-            MultimodalDataItem(
-                pixel_values=image_inputs["pixel_values"],
-                aspect_ratio_id=image_inputs["aspect_ratio_ids"],
-                aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
-                modality=Modality.IMAGE,
-            )
-        ]
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_out, self.mm_tokens
+        )

-        return image_inputs
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
sglang/srt/multimodal/processors/mllama4.py
@@ -26,14 +26,14 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         self.eoi_token_index = hf_config.eoi_token_index
         self.image_token_index = hf_config.image_token_index
         self.multimodal_tokens = MultimodalSpecialTokens(
-            image_token=_processor.image_token
-        )
+            image_token=_processor.image_token,
+            image_token_id=self.image_token_index,
+        ).build(_processor)

     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
         input_text,
-        max_req_input_len=None,
         *args,
         **kwargs,
     ):
@@ -45,7 +45,6 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         processed_data = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.multimodal_tokens,
-            max_req_input_len=max_req_input_len or 4096,
             image_data=image_data,
             return_text=True,
         )
@@ -142,7 +141,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         # Add metadata for image processing
         processor_output["mm_items"] = [
             MultimodalDataItem(
-                pixel_values=processor_output["pixel_values"],
+                feature=processor_output["pixel_values"],
                 modality=Modality.IMAGE,
                 offsets=image_offsets,
             )
sglang/srt/multimodal/processors/phi4mm.py
@@ -1,6 +1,8 @@
 import logging
 from typing import List, Union

+from transformers.processing_utils import ProcessorMixin
+
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.phi4mm import Phi4MMForCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
@@ -10,18 +12,59 @@ from sglang.srt.multimodal.processors.base_processor import (

 logger = logging.getLogger(__name__)

-_IMAGE_SPECIAL_TOKEN = "<|endoftext10|>"
-_IMAGE_SPECIAL_TOKEN_ID = 200010
+
+# It is an adapter of hf phi4 mm processor to make it work for sglang
+# Ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py#L693
+class Phi4MMProcessorAdapter(ProcessorMixin):
+    def __init__(self, _processor) -> None:
+        self._processor = _processor
+
+    def __call__(self, **kwargs):
+        result = self._processor(**kwargs)
+
+        # Map HuggingFace output keys to sglang standard keys
+        key_mapping = {
+            "input_image_embeds": "pixel_values",
+            "input_audio_embeds": "audio_features",
+            "audio_embed_sizes": "audio_feature_lens",
+        }
+        for hf_key, sglang_key in key_mapping.items():
+            if hf_key in result:
+                result[sglang_key] = result[hf_key]
+                del result[hf_key]
+
+        # Filter out None or empty tensors from the result.
+        # This prevents the sglang function base_processor.collect_mm_items_from_processor_output()
+        # from misclassifying audio content as image content, and vice versa.
+        filtered_result = {
+            k: v
+            for k, v in result.items()
+            if v is not None and (not hasattr(v, "numel") or v.numel() > 0)
+        }
+        return filtered_result


-class Phi4MMImageProcessor(BaseMultimodalProcessor):
+class Phi4MMMultimodalProcessor(BaseMultimodalProcessor):
     models = [Phi4MMForCausalLM]

     def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
-        self.multimodal_tokens = MultimodalSpecialTokens(
-            image_token=_IMAGE_SPECIAL_TOKEN,
-        )
+        self.processor = Phi4MMProcessorAdapter(_processor)
+        super().__init__(hf_config, server_args, self.processor)
+
+        # the following CONSTANTS come from hugging-face microsoft/Phi-4-multimodal-instruct's processing_phi4mm.py file
+        # ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
+        self.IMAGE_TOKEN = "<|endoftext10|>"
+        self.AUDIO_TOKEN = "<|endoftext11|>"
+        self.IM_TOKEN_ID = 200010
+        self.AUDIO_TOKEN_ID = 200011
+        self.AUDIO_SAMPLE_RATE = 16000
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_id=self.IM_TOKEN_ID,
+            audio_token=self.AUDIO_TOKEN,
+            audio_token_id=self.AUDIO_TOKEN_ID,
+        ).build(self.processor)

     async def process_mm_data_async(
         self,
@@ -29,49 +72,30 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
         audio_data,
         input_text,
         request_obj,
-        max_req_input_len,
         **kwargs,
     ):
-        if audio_data:
-            logger.warning(
-                "Currently SGLang does not support audio data for Phi4MM. We are working on it. You can file an issue to help us prioritize."
-            )
-            audio_data = []
-
         base_output = self.load_mm_data(
             prompt=input_text,
-            max_req_input_len=max_req_input_len,
             audio_data=audio_data,
             image_data=image_data,
-            multimodal_tokens=self.multimodal_tokens,
+            multimodal_tokens=self.mm_tokens,
+            audio_sample_rate=self.AUDIO_SAMPLE_RATE,
         )
-        if base_output is None:
-            return None

-        res = self.process_mm_data(
-            input_text=base_output.input_text,
-            images=base_output.images,
-            audios=base_output.audios,
-        )
+        if base_output.audios is not None:
+            # hugging-face microsoft/Phi-4-multimodal-instruct's processing_phi4mm.py file requires the audio input to be tuple of (audio, sample_rate)
+            # ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
+            base_output.audios = [
+                (audio, self.AUDIO_SAMPLE_RATE) for audio in base_output.audios
+            ]

-        input_ids = res["input_ids"].flatten()
-        image_offsets = self.get_mm_items_offset(
-            input_ids=input_ids,
-            mm_token_id=_IMAGE_SPECIAL_TOKEN_ID,
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
         )

-        items = [
-            MultimodalDataItem(
-                pixel_values=res["input_image_embeds"],
-                image_sizes=res["image_sizes"],
-                image_emb_mask=res["image_attention_mask"],
-                offsets=image_offsets,
-                modality=Modality.IMAGE,
-            )
-        ]
-
         return {
-            "mm_items": items,
             "input_ids": input_ids.tolist(),
-            "im_token_id": _IMAGE_SPECIAL_TOKEN_ID,
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+            "audio_token_id": self.mm_tokens.audio_token_id,
         }
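
Note: the Phi4MMProcessorAdapter above remaps HuggingFace output keys to the names sglang expects and drops None values and zero-element tensors, so that the downstream item collection does not misclassify audio content as image content. The snippet below is a small self-contained check of that filtering rule only; the example dict and its contents are made up for illustration.

import torch


def keep(v):
    # Keep non-None values; for tensor-like objects (anything with .numel),
    # additionally require that they contain at least one element.
    return v is not None and (not hasattr(v, "numel") or v.numel() > 0)


result = {
    "pixel_values": torch.zeros(1, 3, 336, 336),  # kept: non-empty tensor
    "audio_features": torch.empty(0),             # dropped: numel() == 0
    "input_ids": [[1, 2, 3]],                     # kept: plain list, no .numel
    "audio_feature_lens": None,                   # dropped: None
}
filtered = {k: v for k, v in result.items() if keep(v)}
print(sorted(filtered))  # ['input_ids', 'pixel_values']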
sglang/srt/multimodal/processors/pixtral.py
@@ -6,7 +6,6 @@ from transformers.models.pixtral.image_processing_pixtral import (
     _num_image_tokens as _get_pixtral_hf_num_image_tokens,
 )

-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.pixtral import PixtralVisionModel
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -45,7 +44,7 @@ class PixtralProcessor(BaseMultimodalProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        self.image_token_id = getattr(
+        self.IM_TOKEN_ID = getattr(
             hf_config, "image_token_index", PixtralVisionModel.DEFAULT_IMAGE_TOKEN_ID
         )
         # Instantiate the patcher logic helper using the class defined above
@@ -53,9 +52,10 @@ class PixtralProcessor(BaseMultimodalProcessor):
         self.vision_config = hf_config.vision_config
         self.image_size = self.vision_config.image_size
         self.patch_size = self.vision_config.patch_size
-        self.multimodal_tokens = MultimodalSpecialTokens(
-            image_token=_processor.image_token
-        )
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token,
+            image_token_id=self.IM_TOKEN_ID,
+        ).build(_processor)
         _processor.tokenizer.add_special_tokens(
             {
                 "pad_token": getattr(hf_config, "pad_token", self.PAD_TOKEN),
@@ -80,42 +80,21 @@ class PixtralProcessor(BaseMultimodalProcessor):
     ):
         mm_data = self.load_mm_data(
             prompt=input_text,
-            multimodal_tokens=self.multimodal_tokens,
-            max_req_input_len=kwargs.get("max_req_input_len", 4096),
+            multimodal_tokens=self.mm_tokens,
             image_data=image_data,
             return_text=True,
         )
-
         if mm_data.images:
             resize_tasks = [self._resize(image) for image in mm_data.images]
             mm_data.images = await asyncio.gather(*resize_tasks)

-        processor_output = self.process_mm_data(
-            input_text=mm_data.input_text,
-            images=mm_data.images,
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            mm_data, self.mm_tokens
         )

-        if "pixel_values" in processor_output:
-            input_ids = processor_output["input_ids"].view(-1)
-            image_offsets = self.get_mm_items_offset(
-                input_ids=input_ids,
-                mm_token_id=self.image_token_id,
-            )
-            mm_items = [
-                MultimodalDataItem(
-                    pixel_values=processor_output["pixel_values"],
-                    image_sizes=processor_output["image_sizes"],
-                    modality=Modality.IMAGE,
-                    offsets=image_offsets,
-                )
-            ]
-
-            input_ids = input_ids.tolist()
-            processor_output.update(
-                input_ids=input_ids,
-                mm_items=mm_items,
-                # there's no im_start_id for pixtral, only im_token and im_end_token
-                im_end_id=self.IMG_END_TOKEN_ID,
-                im_token_id=self.image_token_id,
-            )
-        return processor_output
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self.IM_TOKEN_ID,
+            "im_token": self._processor.image_token,
+        }