sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/_custom_ops.py +29 -1
  4. sglang/srt/configs/deepseekvl2.py +11 -2
  5. sglang/srt/configs/internvl.py +3 -0
  6. sglang/srt/configs/janus_pro.py +3 -0
  7. sglang/srt/configs/model_config.py +10 -8
  8. sglang/srt/configs/update_config.py +3 -1
  9. sglang/srt/conversation.py +2 -1
  10. sglang/srt/custom_op.py +5 -2
  11. sglang/srt/disaggregation/common/conn.py +34 -6
  12. sglang/srt/disaggregation/decode.py +9 -1
  13. sglang/srt/disaggregation/mini_lb.py +3 -2
  14. sglang/srt/disaggregation/mooncake/conn.py +93 -76
  15. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  16. sglang/srt/disaggregation/nixl/conn.py +17 -13
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  18. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  19. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  20. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  21. sglang/srt/distributed/parallel_state.py +103 -15
  22. sglang/srt/entrypoints/engine.py +31 -33
  23. sglang/srt/entrypoints/http_server.py +20 -32
  24. sglang/srt/entrypoints/openai/protocol.py +3 -3
  25. sglang/srt/entrypoints/openai/serving_chat.py +48 -6
  26. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  27. sglang/srt/function_call/base_format_detector.py +74 -12
  28. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  29. sglang/srt/function_call/ebnf_composer.py +95 -63
  30. sglang/srt/function_call/function_call_parser.py +4 -2
  31. sglang/srt/function_call/kimik2_detector.py +41 -16
  32. sglang/srt/function_call/llama32_detector.py +6 -3
  33. sglang/srt/function_call/mistral_detector.py +11 -3
  34. sglang/srt/function_call/pythonic_detector.py +16 -14
  35. sglang/srt/function_call/qwen25_detector.py +12 -3
  36. sglang/srt/function_call/qwen3_coder_detector.py +151 -0
  37. sglang/srt/hf_transformers_utils.py +0 -1
  38. sglang/srt/layers/activation.py +24 -3
  39. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  41. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  42. sglang/srt/layers/communicator.py +12 -12
  43. sglang/srt/layers/dp_attention.py +72 -24
  44. sglang/srt/layers/linear.py +13 -102
  45. sglang/srt/layers/logits_processor.py +34 -24
  46. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  47. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  49. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
  57. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  58. sglang/srt/layers/moe/topk.py +190 -23
  59. sglang/srt/layers/quantization/__init__.py +20 -134
  60. sglang/srt/layers/quantization/awq.py +578 -11
  61. sglang/srt/layers/quantization/awq_triton.py +339 -0
  62. sglang/srt/layers/quantization/base_config.py +85 -10
  63. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  64. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  65. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
  66. sglang/srt/layers/quantization/fp8.py +273 -62
  67. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  68. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  69. sglang/srt/layers/quantization/gptq.py +501 -143
  70. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  71. sglang/srt/layers/quantization/modelopt_quant.py +34 -112
  72. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  73. sglang/srt/layers/quantization/petit.py +252 -0
  74. sglang/srt/layers/quantization/petit_utils.py +104 -0
  75. sglang/srt/layers/quantization/qoq.py +7 -6
  76. sglang/srt/layers/quantization/scalar_type.py +352 -0
  77. sglang/srt/layers/quantization/unquant.py +422 -0
  78. sglang/srt/layers/quantization/utils.py +340 -9
  79. sglang/srt/layers/quantization/w4afp8.py +8 -4
  80. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  81. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  82. sglang/srt/layers/radix_attention.py +5 -3
  83. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  84. sglang/srt/lora/lora.py +0 -4
  85. sglang/srt/lora/lora_manager.py +162 -164
  86. sglang/srt/lora/lora_registry.py +124 -0
  87. sglang/srt/lora/mem_pool.py +83 -35
  88. sglang/srt/lora/utils.py +12 -5
  89. sglang/srt/managers/cache_controller.py +288 -0
  90. sglang/srt/managers/io_struct.py +60 -30
  91. sglang/srt/managers/mm_utils.py +7 -8
  92. sglang/srt/managers/schedule_batch.py +163 -113
  93. sglang/srt/managers/schedule_policy.py +68 -27
  94. sglang/srt/managers/scheduler.py +256 -86
  95. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  96. sglang/srt/managers/tokenizer_manager.py +38 -27
  97. sglang/srt/managers/tp_worker.py +16 -4
  98. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  99. sglang/srt/mem_cache/allocator.py +74 -23
  100. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  101. sglang/srt/mem_cache/chunk_cache.py +5 -2
  102. sglang/srt/mem_cache/hicache_storage.py +168 -0
  103. sglang/srt/mem_cache/hiradix_cache.py +194 -5
  104. sglang/srt/mem_cache/memory_pool.py +16 -1
  105. sglang/srt/mem_cache/memory_pool_host.py +44 -2
  106. sglang/srt/mem_cache/radix_cache.py +26 -0
  107. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  108. sglang/srt/metrics/collector.py +9 -0
  109. sglang/srt/model_executor/cuda_graph_runner.py +66 -31
  110. sglang/srt/model_executor/forward_batch_info.py +210 -25
  111. sglang/srt/model_executor/model_runner.py +147 -42
  112. sglang/srt/model_loader/loader.py +7 -1
  113. sglang/srt/model_loader/utils.py +4 -4
  114. sglang/srt/models/clip.py +1 -1
  115. sglang/srt/models/deepseek.py +9 -6
  116. sglang/srt/models/deepseek_janus_pro.py +1 -1
  117. sglang/srt/models/deepseek_v2.py +192 -173
  118. sglang/srt/models/deepseek_vl2.py +5 -5
  119. sglang/srt/models/gemma.py +48 -0
  120. sglang/srt/models/gemma2.py +52 -0
  121. sglang/srt/models/gemma3_causal.py +63 -0
  122. sglang/srt/models/gemma3_mm.py +1 -1
  123. sglang/srt/models/gemma3n_mm.py +2 -4
  124. sglang/srt/models/granitemoe.py +385 -0
  125. sglang/srt/models/grok.py +9 -3
  126. sglang/srt/models/hunyuan.py +63 -16
  127. sglang/srt/models/internvl.py +1 -1
  128. sglang/srt/models/kimi_vl.py +1 -1
  129. sglang/srt/models/llama.py +41 -0
  130. sglang/srt/models/llama4.py +11 -11
  131. sglang/srt/models/llava.py +2 -2
  132. sglang/srt/models/llavavid.py +1 -1
  133. sglang/srt/models/minicpm.py +0 -2
  134. sglang/srt/models/minicpmo.py +3 -7
  135. sglang/srt/models/minicpmv.py +1 -1
  136. sglang/srt/models/mistral.py +1 -1
  137. sglang/srt/models/mixtral.py +9 -2
  138. sglang/srt/models/mllama.py +3 -5
  139. sglang/srt/models/mllama4.py +13 -6
  140. sglang/srt/models/olmoe.py +8 -5
  141. sglang/srt/models/persimmon.py +330 -0
  142. sglang/srt/models/phi.py +321 -0
  143. sglang/srt/models/phi4mm.py +44 -4
  144. sglang/srt/models/phi4mm_audio.py +1260 -0
  145. sglang/srt/models/phi4mm_utils.py +1917 -0
  146. sglang/srt/models/phimoe.py +9 -3
  147. sglang/srt/models/qwen.py +37 -0
  148. sglang/srt/models/qwen2.py +41 -0
  149. sglang/srt/models/qwen2_5_vl.py +4 -4
  150. sglang/srt/models/qwen2_audio.py +1 -1
  151. sglang/srt/models/qwen2_moe.py +53 -9
  152. sglang/srt/models/qwen2_vl.py +4 -4
  153. sglang/srt/models/qwen3.py +65 -1
  154. sglang/srt/models/qwen3_moe.py +57 -24
  155. sglang/srt/models/vila.py +1 -1
  156. sglang/srt/multimodal/processors/base_processor.py +91 -97
  157. sglang/srt/multimodal/processors/clip.py +21 -19
  158. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  159. sglang/srt/multimodal/processors/gemma3.py +13 -17
  160. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  161. sglang/srt/multimodal/processors/internvl.py +9 -10
  162. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  163. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  164. sglang/srt/multimodal/processors/llava.py +4 -2
  165. sglang/srt/multimodal/processors/minicpm.py +35 -44
  166. sglang/srt/multimodal/processors/mlama.py +21 -18
  167. sglang/srt/multimodal/processors/mllama4.py +4 -5
  168. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  169. sglang/srt/multimodal/processors/pixtral.py +14 -35
  170. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  171. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  172. sglang/srt/multimodal/processors/vila.py +14 -14
  173. sglang/srt/reasoning_parser.py +46 -4
  174. sglang/srt/sampling/sampling_batch_info.py +6 -5
  175. sglang/srt/sampling/sampling_params.py +8 -1
  176. sglang/srt/server_args.py +454 -270
  177. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  178. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
  179. sglang/srt/speculative/eagle_utils.py +51 -23
  180. sglang/srt/speculative/eagle_worker.py +59 -44
  181. sglang/srt/two_batch_overlap.py +10 -5
  182. sglang/srt/utils.py +44 -69
  183. sglang/test/runners.py +14 -3
  184. sglang/test/test_activation.py +50 -1
  185. sglang/test/test_block_fp8.py +8 -3
  186. sglang/test/test_block_fp8_ep.py +1 -1
  187. sglang/test/test_custom_ops.py +12 -7
  188. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  189. sglang/test/test_fp4_moe.py +1 -3
  190. sglang/test/test_marlin_moe.py +286 -0
  191. sglang/test/test_marlin_utils.py +171 -0
  192. sglang/test/test_utils.py +35 -0
  193. sglang/version.py +1 -1
  194. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
  195. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
  196. sglang/srt/layers/quantization/quant_utils.py +0 -166
  197. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  198. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
  199. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
  200. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/internvl.py
@@ -24,7 +24,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         self.IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
         self.IMG_START_TOKEN = "<img>"
         self.IMG_END_TOKEN = "</img>"
-        self.IMG_TOKEN = "<image>"
         self.num_image_token = int(
             (image_size // patch_size) ** 2 * (hf_config.downsample_ratio**2)
         )
@@ -32,9 +31,10 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         tokenizer = self._processor
         self.img_start_token_id = tokenizer.convert_tokens_to_ids(self.IMG_START_TOKEN)
         self.img_end_token_id = tokenizer.convert_tokens_to_ids(self.IMG_END_TOKEN)
-        self.img_context_token_id = tokenizer.convert_tokens_to_ids(
-            self.IMG_CONTEXT_TOKEN
-        )
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<image>",
+            image_token_id=tokenizer.convert_tokens_to_ids(self.IMG_CONTEXT_TOKEN),
+        ).build(_image_processor)

     @staticmethod
     def build_transform(input_size):
@@ -170,13 +170,12 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         return pixel_values, num_patches_list

     async def process_mm_data_async(
-        self, image_data, input_text, request_obj, max_req_input_len, **kwargs
+        self, image_data, input_text, request_obj, **kwargs
     ):
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMG_TOKEN),
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
             discard_alpha_channel=True,
         )

@@ -219,11 +218,11 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].flatten()
         image_offsets = self.get_mm_items_offset(
             input_ids=input_ids,
-            mm_token_id=self.img_context_token_id,
+            mm_token_id=self.mm_tokens.image_token_id,
         )
         items = [
             MultimodalDataItem(
-                pixel_values=pixel_values,
+                feature=pixel_values,
                 modality=Modality.IMAGE,
                 offsets=image_offsets,
             )
@@ -234,5 +233,5 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             "mm_items": items,
             "im_start_id": self.img_start_token_id,
             "im_end_id": self.img_end_token_id,
-            "im_token_id": self.img_context_token_id,
+            "im_token_id": self.mm_tokens.image_token_id,
         }
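
Note: the InternVL hunks above follow the refactor pattern that recurs throughout this release: the special token and its id are resolved once in __init__ via MultimodalSpecialTokens(...).build(...), max_req_input_len is dropped from process_mm_data_async, and the combined output comes from process_and_combine_mm_data. Below is a minimal sketch of a processor written against the helpers used in these hunks; it is an illustration, not code from the package, and the class name, token string, and token-id lookup are placeholders.

from sglang.srt.multimodal.processors.base_processor import (
    BaseMultimodalProcessor,
    MultimodalSpecialTokens,
)


class MyVLProcessor(BaseMultimodalProcessor):
    # models would list the model class(es) this processor serves.
    models = []

    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)
        # Resolve the token string and id once, up front.
        self.mm_tokens = MultimodalSpecialTokens(
            image_token="<image>",
            image_token_id=_processor.tokenizer.convert_tokens_to_ids("<image>"),
        ).build(_processor)

    async def process_mm_data_async(self, image_data, input_text, request_obj, **kwargs):
        # max_req_input_len is no longer part of the signature.
        base_output = self.load_mm_data(
            prompt=input_text,
            image_data=image_data,
            multimodal_tokens=self.mm_tokens,
        )
        # Combine processor output into items plus expanded input ids.
        mm_items, input_ids, _ = self.process_and_combine_mm_data(
            base_output, self.mm_tokens
        )
        return {
            "mm_items": mm_items,
            "input_ids": input_ids.tolist(),
            "im_token_id": self.mm_tokens.image_token_id,
        }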
sglang/srt/multimodal/processors/janus_pro.py
@@ -14,47 +14,32 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)

+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token,
+            image_token_id=_processor.image_id,
+        ).build(_processor)
+
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
         input_text,
         request_obj,
-        max_req_input_len,
         **kwargs,
     ):
-        processor = self._processor
-
         base_out = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token=processor.image_token
-            ),
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
         )

-        images = base_out.images
-        res = self.process_mm_data(
-            input_text=base_out.input_text,
-            prompt=base_out.input_text,
-            images=images,
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_out, self.mm_tokens, prompt=base_out.input_text
         )

-        input_ids = res["input_ids"].flatten()
-        image_offsets = self.get_mm_items_offset(
-            input_ids=input_ids, mm_token_id=processor.image_id
-        )
         return {
-            "mm_items": [
-                MultimodalDataItem(
-                    pixel_values=res["pixel_values"],
-                    image_emb_mask=res["images_emb_mask"],
-                    offsets=image_offsets,
-                    modality=Modality.IMAGE,
-                )
-            ],
+            "mm_items": mm_items,
             "input_ids": input_ids.tolist(),
-            "im_start_id": processor.image_start_id,
-            "im_end_id": processor.image_end_id,
-            "im_token_id": processor.image_id,
+            "im_start_id": self._processor.image_start_id,
+            "im_end_id": self._processor.image_end_id,
+            "im_token_id": self.mm_tokens.image_token_id,
         }
sglang/srt/multimodal/processors/kimi_vl.py
@@ -1,9 +1,6 @@
 import re
-from typing import Any, Dict, List, Optional, Union
+from typing import Dict, List, Union

-import torch
-
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -17,32 +14,33 @@ class KimiVLImageProcessor(SGLangBaseProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        self.IMAGE_TOKEN = "<|media_pad|>"
-        self.IMAGE_TOKEN_REGEX = re.compile(r"(?:<\|media_pad\|>)+")
-        self.IM_TOKEN_ID = _processor.tokenizer.convert_tokens_to_ids(self.IMAGE_TOKEN)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<|media_pad|>",
+            # TODO: could we convert in MultimodalSpecialTokens?
+            image_token_id=hf_config.media_placeholder_token_id,
+            image_token_regex=re.compile(r"(?:<\|media_pad\|>)+"),
+        ).build(_processor)

     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
-        max_req_input_len,
         *args,
         **kwargs,
     ):
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
-            ),
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
         )

-        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )

         return {
             "input_ids": input_ids.tolist(),
             "mm_items": mm_items,
-            "im_token_id": self.IM_TOKEN_ID,
+            "im_token_id": self.mm_tokens.image_token_id,
         }
sglang/srt/multimodal/processors/llava.py
@@ -158,8 +158,10 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         return {
             "mm_items": [
                 MultimodalDataItem(
-                    pixel_values=pixel_values,
-                    image_sizes=image_sizes,
+                    feature=pixel_values,
+                    model_specific_data={
+                        "image_sizes": image_sizes,
+                    },
                     modality=modality,
                 )
             ],
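
Note: the Llava hunk shows the second recurring change in this release: MultimodalDataItem now carries its main tensor in a generic feature field, and model-specific tensors move into a model_specific_data dict instead of dedicated keyword arguments. A hedged sketch of building such an item from a processor output dict follows; the helper name build_image_item and the processor_output variable are illustrative, only the MultimodalDataItem fields come from the hunks above.

from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem


def build_image_item(processor_output, image_offsets):
    # feature replaces the old pixel_values kwarg; extra tensors ride along
    # in model_specific_data rather than as their own constructor arguments.
    return MultimodalDataItem(
        feature=processor_output["pixel_values"],
        model_specific_data={
            "image_sizes": processor_output["image_sizes"],
        },
        modality=Modality.IMAGE,
        offsets=image_offsets,
    )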
sglang/srt/multimodal/processors/minicpm.py
@@ -17,9 +17,22 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        self.image_token = "(<image>./</image>)"
-        self.audio_token = "(<audio>./</audio>)"
-        self.video_token = "(<video>./</video>)"
+        # Collect special token ids
+        tokenizer = self._processor.tokenizer
+        self.slice_start_id = getattr(tokenizer, "slice_start_id", None)
+        self.slice_end_id = getattr(tokenizer, "slice_end_id", None)
+        self.audio_start_id = getattr(tokenizer, "audio_start_id", None)
+        self.audio_end_id = getattr(tokenizer, "audio_end_id", None)
+        self.im_start_id = getattr(tokenizer, "im_start_id", None)
+        self.im_end_id = getattr(tokenizer, "im_end_id", None)
+        self.im_token_id = getattr(tokenizer, "unk_id", None)
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="(<image>./</image>)",
+            audio_token="(<audio>./</audio>)",
+            video_token="(<video>./</video>)",
+            image_token_id=self.im_token_id,
+        ).build(_processor)

     async def process_mm_data_async(
         self,
@@ -27,19 +40,13 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         audio_data: List[Union[str, bytes]],
         input_text,
         request_obj,
-        max_req_input_len,
         **kwargs,
     ):
         base_output = self.load_mm_data(
             prompt=input_text,
-            max_req_input_len=max_req_input_len,
             audio_data=audio_data,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token=self.image_token,
-                video_token=self.video_token,
-                audio_token=self.audio_token,
-            ),
+            multimodal_tokens=self.mm_tokens,
         )
         if base_output is None:
             return None
@@ -50,24 +57,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             audios=base_output.audios,
         )

-        # Collect special token ids
-        tokenizer = self._processor.tokenizer
-        slice_start_id, slice_end_id, audio_start_id, audio_end_id = (
-            None,
-            None,
-            None,
-            None,
-        )
-        if tokenizer.slice_start_id:
-            slice_start_id = tokenizer.slice_start_id
-            slice_end_id = tokenizer.slice_end_id
-        if hasattr(tokenizer, "audio_start_id"):
-            audio_start_id = tokenizer.audio_start_id
-            audio_end_id = tokenizer.audio_end_id
-
-        im_start_id = tokenizer.im_start_id
-        im_end_id = tokenizer.im_end_id
-        im_token_id = tokenizer.unk_id
         pixel_values = res["pixel_values"]
         tgt_sizes = res["tgt_sizes"]

@@ -104,19 +93,21 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         items = []
         input_ids = res["input_ids"].flatten()
         image_offsets = self.get_mm_items_offset_by_pair(
-            input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+            input_ids=input_ids, mm_start_id=self.im_start_id, mm_end_id=self.im_end_id
         )
         slice_offsets = self.get_mm_items_offset_by_pair(
-            input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
+            input_ids=input_ids,
+            mm_start_id=self.slice_start_id,
+            mm_end_id=self.slice_end_id,
         )
         image_offsets.extend(slice_offsets)
         image_offsets = sorted(image_offsets)

         if len(pixel_values) != 0:
             item = MultimodalDataItem(
-                pixel_values=pixel_values,
+                feature=pixel_values,
                 offsets=image_offsets,
-                tgt_size=tgt_sizes_flat,
+                model_specific_data={"tgt_size": tgt_sizes_flat},
                 modality=Modality.IMAGE,
             )
             items += [item]
@@ -126,17 +117,17 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             and res["audio_features"] is not None
             and len(res["audio_features"]) != 0
         ):
-            if audio_start_id is not None and audio_end_id is not None:
+            if self.audio_start_id is not None and self.audio_end_id is not None:
                 audio_offsets = self.get_mm_items_offset_by_pair(
                     input_ids=input_ids,
-                    mm_start_id=audio_start_id,
-                    mm_end_id=audio_end_id,
+                    mm_start_id=self.audio_start_id,
+                    mm_end_id=self.audio_end_id,
                 )
             else:
                 audio_offsets = None
             item = MultimodalDataItem(
-                audio_features=[res["audio_features"]],
-                audio_feature_lens=res["audio_feature_lens"],
+                feature=[res["audio_features"]],
+                model_specific_data={"audio_feature_lens": res["audio_feature_lens"]},
                 offsets=audio_offsets,
                 modality=Modality.AUDIO,
             )
@@ -144,11 +135,11 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         return {
             "mm_items": items,
             "input_ids": input_ids.tolist(),
-            "audio_start_id": audio_start_id,
-            "audio_end_id": audio_end_id,
-            "im_token_id": im_token_id,
-            "im_start_id": im_start_id,
-            "im_end_id": im_end_id,
-            "slice_start_id": slice_start_id,
-            "slice_end_id": slice_end_id,
+            "audio_start_id": self.audio_start_id,
+            "audio_end_id": self.audio_end_id,
+            "im_token_id": self.im_token_id,
+            "im_start_id": self.im_start_id,
+            "im_end_id": self.im_end_id,
+            "slice_start_id": self.slice_start_id,
+            "slice_end_id": self.slice_end_id,
         }
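
Note: the MiniCPM hunks locate image, slice, and audio spans with get_mm_items_offset_by_pair, using start/end token ids that are now collected once in __init__ with getattr defaults. The snippet below is a standalone approximation of what a pair-based offset helper computes, added purely for illustration; the real sglang helper's exact return format may differ.

from typing import List, Tuple


def offsets_by_pair(input_ids: List[int], start_id: int, end_id: int) -> List[Tuple[int, int]]:
    """Return (start, end) index pairs for spans bracketed by start_id/end_id."""
    offsets = []
    start = None
    for i, tok in enumerate(input_ids):
        if tok == start_id:
            start = i
        elif tok == end_id and start is not None:
            offsets.append((start, i))
            start = None
    return offsets


# Example: ids 7 and 8 bracket one multimodal span at positions 1..4.
print(offsets_by_pair([1, 7, 5, 5, 8, 2], start_id=7, end_id=8))  # [(1, 4)]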
sglang/srt/multimodal/processors/mlama.py
@@ -1,9 +1,10 @@
 from typing import List, Union

-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
-from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
-from sglang.srt.utils import load_image
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)


 class MllamaImageProcessor(BaseMultimodalProcessor):
@@ -11,24 +12,26 @@ class MllamaImageProcessor(BaseMultimodalProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self._processor.image_token,
+            image_token_id=self._processor.image_token_id,
+        ).build(_processor)

     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
-        if isinstance(input_text, list):
-            assert len(input_text) and isinstance(input_text[0], int)
-            input_text = self._processor.tokenizer.decode(input_text)
+        base_out = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )

-        images = [load_image(image)[0] for image in image_data]
-        image_inputs = self.process_mm_data(input_text=input_text, images=images)
-        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
-        image_inputs["mm_items"] = [
-            MultimodalDataItem(
-                pixel_values=image_inputs["pixel_values"],
-                aspect_ratio_id=image_inputs["aspect_ratio_ids"],
-                aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
-                modality=Modality.IMAGE,
-            )
-        ]
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_out, self.mm_tokens
+        )

-        return image_inputs
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
sglang/srt/multimodal/processors/mllama4.py
@@ -26,14 +26,14 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         self.eoi_token_index = hf_config.eoi_token_index
         self.image_token_index = hf_config.image_token_index
         self.multimodal_tokens = MultimodalSpecialTokens(
-            image_token=_processor.image_token
-        )
+            image_token=_processor.image_token,
+            image_token_id=self.image_token_index,
+        ).build(_processor)

     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
         input_text,
-        max_req_input_len=None,
         *args,
         **kwargs,
     ):
@@ -45,7 +45,6 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         processed_data = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.multimodal_tokens,
-            max_req_input_len=max_req_input_len or 4096,
             image_data=image_data,
             return_text=True,
         )
@@ -142,7 +141,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         # Add metadata for image processing
         processor_output["mm_items"] = [
             MultimodalDataItem(
-                pixel_values=processor_output["pixel_values"],
+                feature=processor_output["pixel_values"],
                 modality=Modality.IMAGE,
                 offsets=image_offsets,
             )
sglang/srt/multimodal/processors/phi4mm.py
@@ -1,6 +1,8 @@
 import logging
 from typing import List, Union

+from transformers.processing_utils import ProcessorMixin
+
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.phi4mm import Phi4MMForCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
@@ -10,18 +12,59 @@ from sglang.srt.multimodal.processors.base_processor import (

 logger = logging.getLogger(__name__)

-_IMAGE_SPECIAL_TOKEN = "<|endoftext10|>"
-_IMAGE_SPECIAL_TOKEN_ID = 200010
+
+# It is an adapter of hf phi4 mm processor to make it work for sglang
+# Ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py#L693
+class Phi4MMProcessorAdapter(ProcessorMixin):
+    def __init__(self, _processor) -> None:
+        self._processor = _processor
+
+    def __call__(self, **kwargs):
+        result = self._processor(**kwargs)
+
+        # Map HuggingFace output keys to sglang standard keys
+        key_mapping = {
+            "input_image_embeds": "pixel_values",
+            "input_audio_embeds": "audio_features",
+            "audio_embed_sizes": "audio_feature_lens",
+        }
+        for hf_key, sglang_key in key_mapping.items():
+            if hf_key in result:
+                result[sglang_key] = result[hf_key]
+                del result[hf_key]
+
+        # Filter out None or empty tensors from the result.
+        # This prevents the sglang function base_processor.collect_mm_items_from_processor_output()
+        # from misclassifying audio content as image content, and vice versa.
+        filtered_result = {
+            k: v
+            for k, v in result.items()
+            if v is not None and (not hasattr(v, "numel") or v.numel() > 0)
+        }
+        return filtered_result


-class Phi4MMImageProcessor(BaseMultimodalProcessor):
+class Phi4MMMultimodalProcessor(BaseMultimodalProcessor):
     models = [Phi4MMForCausalLM]

     def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
-        self.multimodal_tokens = MultimodalSpecialTokens(
-            image_token=_IMAGE_SPECIAL_TOKEN,
-        )
+        self.processor = Phi4MMProcessorAdapter(_processor)
+        super().__init__(hf_config, server_args, self.processor)
+
+        # the following CONSTANTS come from hugging-face microsoft/Phi-4-multimodal-instruct's processing_phi4mm.py file
+        # ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
+        self.IMAGE_TOKEN = "<|endoftext10|>"
+        self.AUDIO_TOKEN = "<|endoftext11|>"
+        self.IM_TOKEN_ID = 200010
+        self.AUDIO_TOKEN_ID = 200011
+        self.AUDIO_SAMPLE_RATE = 16000
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_id=self.IM_TOKEN_ID,
+            audio_token=self.AUDIO_TOKEN,
+            audio_token_id=self.AUDIO_TOKEN_ID,
+        ).build(self.processor)

     async def process_mm_data_async(
         self,
@@ -29,49 +72,30 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
         audio_data,
         input_text,
         request_obj,
-        max_req_input_len,
         **kwargs,
     ):
-        if audio_data:
-            logger.warning(
-                "Currently SGLang does not support audio data for Phi4MM. We are working on it. You can file an issue to help us prioritize."
-            )
-            audio_data = []
-
         base_output = self.load_mm_data(
             prompt=input_text,
-            max_req_input_len=max_req_input_len,
             audio_data=audio_data,
             image_data=image_data,
-            multimodal_tokens=self.multimodal_tokens,
+            multimodal_tokens=self.mm_tokens,
+            audio_sample_rate=self.AUDIO_SAMPLE_RATE,
         )
-        if base_output is None:
-            return None

-        res = self.process_mm_data(
-            input_text=base_output.input_text,
-            images=base_output.images,
-            audios=base_output.audios,
-        )
+        if base_output.audios is not None:
+            # hugging-face microsoft/Phi-4-multimodal-instruct's processing_phi4mm.py file requires the audio input to be tuple of (audio, sample_rate)
+            # ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
+            base_output.audios = [
+                (audio, self.AUDIO_SAMPLE_RATE) for audio in base_output.audios
+            ]

-        input_ids = res["input_ids"].flatten()
-        image_offsets = self.get_mm_items_offset(
-            input_ids=input_ids,
-            mm_token_id=_IMAGE_SPECIAL_TOKEN_ID,
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
         )

-        items = [
-            MultimodalDataItem(
-                pixel_values=res["input_image_embeds"],
-                image_sizes=res["image_sizes"],
-                image_emb_mask=res["image_attention_mask"],
-                offsets=image_offsets,
-                modality=Modality.IMAGE,
-            )
-        ]
-
         return {
-            "mm_items": items,
             "input_ids": input_ids.tolist(),
-            "im_token_id": _IMAGE_SPECIAL_TOKEN_ID,
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+            "audio_token_id": self.mm_tokens.audio_token_id,
         }
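
Note: the Phi4MMProcessorAdapter above remaps HuggingFace output keys to the names sglang expects and drops None values and zero-element tensors, so that the downstream item collection does not misclassify audio content as image content. The snippet below is a small self-contained check of that filtering rule only; the example dict and its contents are made up for illustration.

import torch


def keep(v):
    # Keep non-None values; for tensor-like objects (anything with .numel),
    # additionally require that they contain at least one element.
    return v is not None and (not hasattr(v, "numel") or v.numel() > 0)


result = {
    "pixel_values": torch.zeros(1, 3, 336, 336),  # kept: non-empty tensor
    "audio_features": torch.empty(0),             # dropped: numel() == 0
    "input_ids": [[1, 2, 3]],                     # kept: plain list, no .numel
    "audio_feature_lens": None,                   # dropped: None
}
filtered = {k: v for k, v in result.items() if keep(v)}
print(sorted(filtered))  # ['input_ids', 'pixel_values']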
sglang/srt/multimodal/processors/pixtral.py
@@ -6,7 +6,6 @@ from transformers.models.pixtral.image_processing_pixtral import (
     _num_image_tokens as _get_pixtral_hf_num_image_tokens,
 )

-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.pixtral import PixtralVisionModel
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -45,7 +44,7 @@ class PixtralProcessor(BaseMultimodalProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        self.image_token_id = getattr(
+        self.IM_TOKEN_ID = getattr(
             hf_config, "image_token_index", PixtralVisionModel.DEFAULT_IMAGE_TOKEN_ID
         )
         # Instantiate the patcher logic helper using the class defined above
@@ -53,9 +52,10 @@ class PixtralProcessor(BaseMultimodalProcessor):
         self.vision_config = hf_config.vision_config
         self.image_size = self.vision_config.image_size
         self.patch_size = self.vision_config.patch_size
-        self.multimodal_tokens = MultimodalSpecialTokens(
-            image_token=_processor.image_token
-        )
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token,
+            image_token_id=self.IM_TOKEN_ID,
+        ).build(_processor)
         _processor.tokenizer.add_special_tokens(
             {
                 "pad_token": getattr(hf_config, "pad_token", self.PAD_TOKEN),
@@ -80,42 +80,21 @@ class PixtralProcessor(BaseMultimodalProcessor):
     ):
         mm_data = self.load_mm_data(
             prompt=input_text,
-            multimodal_tokens=self.multimodal_tokens,
-            max_req_input_len=kwargs.get("max_req_input_len", 4096),
+            multimodal_tokens=self.mm_tokens,
             image_data=image_data,
             return_text=True,
         )
-
         if mm_data.images:
             resize_tasks = [self._resize(image) for image in mm_data.images]
             mm_data.images = await asyncio.gather(*resize_tasks)

-        processor_output = self.process_mm_data(
-            input_text=mm_data.input_text,
-            images=mm_data.images,
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            mm_data, self.mm_tokens
         )

-        if "pixel_values" in processor_output:
-            input_ids = processor_output["input_ids"].view(-1)
-            image_offsets = self.get_mm_items_offset(
-                input_ids=input_ids,
-                mm_token_id=self.image_token_id,
-            )
-            mm_items = [
-                MultimodalDataItem(
-                    pixel_values=processor_output["pixel_values"],
-                    image_sizes=processor_output["image_sizes"],
-                    modality=Modality.IMAGE,
-                    offsets=image_offsets,
-                )
-            ]
-
-            input_ids = input_ids.tolist()
-            processor_output.update(
-                input_ids=input_ids,
-                mm_items=mm_items,
-                # there's no im_start_id for pixtral, only im_token and im_end_token
-                im_end_id=self.IMG_END_TOKEN_ID,
-                im_token_id=self.image_token_id,
-            )
-        return processor_output
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self.IM_TOKEN_ID,
+            "im_token": self._processor.image_token,
+        }