sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (180)
  1. sglang/bench_offline_throughput.py +10 -8
  2. sglang/bench_one_batch.py +7 -6
  3. sglang/bench_one_batch_server.py +157 -21
  4. sglang/bench_serving.py +137 -59
  5. sglang/compile_deep_gemm.py +5 -5
  6. sglang/eval/loogle_eval.py +157 -0
  7. sglang/lang/chat_template.py +78 -78
  8. sglang/lang/tracer.py +1 -1
  9. sglang/srt/code_completion_parser.py +1 -1
  10. sglang/srt/configs/deepseekvl2.py +2 -2
  11. sglang/srt/configs/model_config.py +40 -28
  12. sglang/srt/constrained/base_grammar_backend.py +55 -72
  13. sglang/srt/constrained/llguidance_backend.py +25 -21
  14. sglang/srt/constrained/outlines_backend.py +27 -26
  15. sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
  16. sglang/srt/constrained/xgrammar_backend.py +69 -43
  17. sglang/srt/conversation.py +49 -44
  18. sglang/srt/disaggregation/base/conn.py +1 -0
  19. sglang/srt/disaggregation/decode.py +129 -135
  20. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  21. sglang/srt/disaggregation/fake/conn.py +3 -13
  22. sglang/srt/disaggregation/kv_events.py +357 -0
  23. sglang/srt/disaggregation/mini_lb.py +57 -24
  24. sglang/srt/disaggregation/mooncake/conn.py +238 -122
  25. sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
  26. sglang/srt/disaggregation/nixl/conn.py +10 -19
  27. sglang/srt/disaggregation/prefill.py +132 -47
  28. sglang/srt/disaggregation/utils.py +123 -6
  29. sglang/srt/distributed/utils.py +3 -3
  30. sglang/srt/entrypoints/EngineBase.py +5 -0
  31. sglang/srt/entrypoints/engine.py +44 -9
  32. sglang/srt/entrypoints/http_server.py +23 -6
  33. sglang/srt/entrypoints/http_server_engine.py +5 -2
  34. sglang/srt/function_call/base_format_detector.py +250 -0
  35. sglang/srt/function_call/core_types.py +34 -0
  36. sglang/srt/function_call/deepseekv3_detector.py +157 -0
  37. sglang/srt/function_call/ebnf_composer.py +234 -0
  38. sglang/srt/function_call/function_call_parser.py +175 -0
  39. sglang/srt/function_call/llama32_detector.py +74 -0
  40. sglang/srt/function_call/mistral_detector.py +84 -0
  41. sglang/srt/function_call/pythonic_detector.py +163 -0
  42. sglang/srt/function_call/qwen25_detector.py +67 -0
  43. sglang/srt/function_call/utils.py +35 -0
  44. sglang/srt/hf_transformers_utils.py +46 -7
  45. sglang/srt/layers/attention/aiter_backend.py +513 -0
  46. sglang/srt/layers/attention/flashattention_backend.py +64 -18
  47. sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
  48. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  49. sglang/srt/layers/attention/triton_backend.py +3 -0
  50. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
  51. sglang/srt/layers/attention/utils.py +6 -4
  52. sglang/srt/layers/attention/vision.py +1 -1
  53. sglang/srt/layers/communicator.py +451 -0
  54. sglang/srt/layers/dp_attention.py +61 -21
  55. sglang/srt/layers/layernorm.py +1 -1
  56. sglang/srt/layers/logits_processor.py +46 -11
  57. sglang/srt/layers/moe/cutlass_moe.py +207 -0
  58. sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
  59. sglang/srt/layers/moe/ep_moe/layer.py +105 -51
  60. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
  63. sglang/srt/layers/moe/topk.py +67 -10
  64. sglang/srt/layers/multimodal.py +70 -0
  65. sglang/srt/layers/quantization/__init__.py +8 -3
  66. sglang/srt/layers/quantization/blockwise_int8.py +2 -2
  67. sglang/srt/layers/quantization/deep_gemm.py +77 -74
  68. sglang/srt/layers/quantization/fp8.py +92 -2
  69. sglang/srt/layers/quantization/fp8_kernel.py +3 -3
  70. sglang/srt/layers/quantization/fp8_utils.py +6 -0
  71. sglang/srt/layers/quantization/gptq.py +298 -6
  72. sglang/srt/layers/quantization/int8_kernel.py +20 -7
  73. sglang/srt/layers/quantization/qoq.py +244 -0
  74. sglang/srt/layers/sampler.py +0 -4
  75. sglang/srt/layers/vocab_parallel_embedding.py +18 -7
  76. sglang/srt/lora/lora_manager.py +2 -4
  77. sglang/srt/lora/mem_pool.py +4 -4
  78. sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
  79. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  80. sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
  81. sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
  82. sglang/srt/lora/utils.py +1 -1
  83. sglang/srt/managers/data_parallel_controller.py +3 -3
  84. sglang/srt/managers/deepseek_eplb.py +278 -0
  85. sglang/srt/managers/detokenizer_manager.py +21 -8
  86. sglang/srt/managers/eplb_manager.py +55 -0
  87. sglang/srt/managers/expert_distribution.py +704 -56
  88. sglang/srt/managers/expert_location.py +394 -0
  89. sglang/srt/managers/expert_location_dispatch.py +91 -0
  90. sglang/srt/managers/io_struct.py +19 -4
  91. sglang/srt/managers/mm_utils.py +294 -140
  92. sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
  93. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  94. sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
  95. sglang/srt/managers/multimodal_processors/internvl.py +14 -5
  96. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  97. sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
  98. sglang/srt/managers/multimodal_processors/llava.py +46 -0
  99. sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
  100. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  101. sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
  102. sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
  103. sglang/srt/managers/schedule_batch.py +122 -42
  104. sglang/srt/managers/schedule_policy.py +1 -5
  105. sglang/srt/managers/scheduler.py +205 -138
  106. sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
  107. sglang/srt/managers/session_controller.py +1 -1
  108. sglang/srt/managers/tokenizer_manager.py +232 -58
  109. sglang/srt/managers/tp_worker.py +12 -9
  110. sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
  111. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  112. sglang/srt/mem_cache/chunk_cache.py +3 -1
  113. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  114. sglang/srt/mem_cache/memory_pool.py +76 -52
  115. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  116. sglang/srt/mem_cache/radix_cache.py +58 -5
  117. sglang/srt/metrics/collector.py +314 -39
  118. sglang/srt/mm_utils.py +10 -0
  119. sglang/srt/model_executor/cuda_graph_runner.py +29 -19
  120. sglang/srt/model_executor/expert_location_updater.py +422 -0
  121. sglang/srt/model_executor/forward_batch_info.py +5 -1
  122. sglang/srt/model_executor/model_runner.py +163 -68
  123. sglang/srt/model_loader/loader.py +10 -6
  124. sglang/srt/models/clip.py +5 -1
  125. sglang/srt/models/deepseek_janus_pro.py +2 -2
  126. sglang/srt/models/deepseek_v2.py +308 -351
  127. sglang/srt/models/exaone.py +8 -3
  128. sglang/srt/models/gemma3_mm.py +70 -33
  129. sglang/srt/models/llama.py +2 -0
  130. sglang/srt/models/llama4.py +15 -8
  131. sglang/srt/models/llava.py +258 -7
  132. sglang/srt/models/mimo_mtp.py +220 -0
  133. sglang/srt/models/minicpmo.py +5 -12
  134. sglang/srt/models/mistral.py +71 -1
  135. sglang/srt/models/mixtral.py +98 -34
  136. sglang/srt/models/mllama.py +3 -3
  137. sglang/srt/models/pixtral.py +467 -0
  138. sglang/srt/models/qwen2.py +95 -26
  139. sglang/srt/models/qwen2_5_vl.py +8 -0
  140. sglang/srt/models/qwen2_moe.py +330 -60
  141. sglang/srt/models/qwen2_vl.py +6 -0
  142. sglang/srt/models/qwen3.py +52 -10
  143. sglang/srt/models/qwen3_moe.py +411 -48
  144. sglang/srt/models/roberta.py +1 -1
  145. sglang/srt/models/siglip.py +294 -0
  146. sglang/srt/models/torch_native_llama.py +1 -1
  147. sglang/srt/openai_api/adapter.py +58 -20
  148. sglang/srt/openai_api/protocol.py +6 -8
  149. sglang/srt/operations.py +154 -0
  150. sglang/srt/operations_strategy.py +31 -0
  151. sglang/srt/reasoning_parser.py +3 -3
  152. sglang/srt/sampling/custom_logit_processor.py +18 -3
  153. sglang/srt/sampling/sampling_batch_info.py +4 -56
  154. sglang/srt/sampling/sampling_params.py +2 -2
  155. sglang/srt/server_args.py +162 -22
  156. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  157. sglang/srt/speculative/eagle_utils.py +138 -7
  158. sglang/srt/speculative/eagle_worker.py +69 -21
  159. sglang/srt/utils.py +74 -17
  160. sglang/test/few_shot_gsm8k.py +2 -2
  161. sglang/test/few_shot_gsm8k_engine.py +2 -2
  162. sglang/test/run_eval.py +2 -2
  163. sglang/test/runners.py +8 -1
  164. sglang/test/send_one.py +13 -3
  165. sglang/test/simple_eval_common.py +1 -1
  166. sglang/test/simple_eval_humaneval.py +1 -1
  167. sglang/test/test_cutlass_moe.py +278 -0
  168. sglang/test/test_programs.py +5 -5
  169. sglang/test/test_utils.py +55 -14
  170. sglang/utils.py +3 -3
  171. sglang/version.py +1 -1
  172. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
  173. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
  174. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
  175. sglang/srt/function_call_parser.py +0 -858
  176. sglang/srt/platforms/interface.py +0 -371
  177. /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
  178. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  179. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
  180. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/base_processor.py

@@ -3,16 +3,16 @@ import concurrent.futures
  import dataclasses
  import multiprocessing as mp
  import os
+ import re
  from abc import ABC, abstractmethod
- from typing import List, Optional
+ from typing import List, Optional, Tuple, Union

  import numpy as np
- import PIL
  import torch
  from PIL import Image
  from transformers import BaseImageProcessorFast

- from sglang.srt.managers.schedule_batch import Modality
+ from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
  from sglang.srt.utils import encode_video, load_audio, load_image


@@ -22,13 +22,13 @@ class BaseMultiModalProcessorOutput:
  input_text: str

  # frames loaded from image and video, in given order
- images: Optional[list[PIL.Image]] = None
+ images: Optional[list[Union[Image.Image, MultimodalDataItem]]] = None

  # audios
- audios: Optional[list[np.ndarray]] = None
+ audios: Optional[list[Union[np.ndarray, MultimodalDataItem]]] = None

  def normalize(self):
- for field_name in ["image_sizes", "images", "audios"]:
+ for field_name in ["images", "audios"]:
  field = getattr(self, field_name, None)
  if field is not None and isinstance(field, list) and len(field) == 0:
  setattr(self, field_name, None)
@@ -36,16 +36,48 @@ class BaseMultiModalProcessorOutput:

  @dataclasses.dataclass
  class MultimodalSpecialTokens:
- image_token: Optional[str] = None
- video_token: Optional[str] = None
- audio_token: Optional[str] = None
-
- def collect(self) -> list[str]:
- return [
- token
- for token in [self.image_token, self.video_token, self.audio_token]
- if token
+ image_token: Optional[Union[int, str, List[str]]] = None
+ video_token: Optional[Union[int, str, List[str]]] = None
+ audio_token: Optional[Union[int, str, List[str]]] = None
+
+ def convert_to_str(self, token: Union[str, int], processor) -> str:
+ if token is None:
+ return token
+ if isinstance(token, str):
+ return token
+ return processor.tokenizer.convert_ids_to_tokens([token])[0]
+
+ def convert_to_strs(self, processor):
+ self.image_token = self.convert_to_str(self.image_token, processor)
+ self.video_token = self.convert_to_str(self.video_token, processor)
+ self.audio_token = self.convert_to_str(self.audio_token, processor)
+
+ image_token_regex: Optional[re.Pattern] = None
+ video_token_regex: Optional[re.Pattern] = None
+ audio_token_regex: Optional[re.Pattern] = None
+
+ def __post_init__(self):
+ if self.image_token_regex is None and self.image_token is not None:
+ self.image_token_regex = re.compile(re.escape(self.image_token))
+ if self.video_token_regex is None and self.video_token is not None:
+ self.video_token_regex = re.compile(re.escape(self.video_token))
+ if self.audio_token_regex is None and self.audio_token is not None:
+ self.audio_token_regex = re.compile(re.escape(self.audio_token))
+
+ def collect(self) -> re.Pattern:
+ tokens = [
+ self.image_token_regex,
+ self.video_token_regex,
+ self.audio_token_regex,
  ]
+ patterns = []
+ flags = 0
+ for t in tokens:
+ if t is not None:
+ patterns.append(t.pattern)
+ flags |= t.flags
+ combined = "(" + "|".join(f"(?:{p})" for p in patterns) + ")"
+ return re.compile(combined, flags)


  class BaseMultimodalProcessor(ABC):
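
Note: the reworked MultimodalSpecialTokens above carries optional pre-compiled regexes, and collect() now returns one combined re.Pattern instead of a list of literal tokens; load_mm_data later feeds that pattern to re.split so matched special tokens are kept as delimiters. A minimal sketch of the same idea with the stdlib re module and hypothetical placeholder tokens (the token strings below are illustrative, not any model's real tokens):

import re

# Hypothetical special tokens, for illustration only.
image_token = "<image>"
audio_token = "<audio>"

# Mirror of collect(): escape each token, wrap each in a non-capturing group,
# and join them inside one capturing group so re.split keeps the delimiters.
combined = "(" + "|".join(f"(?:{re.escape(t)})" for t in (image_token, audio_token)) + ")"
pattern = re.compile(combined)

parts = re.split(pattern, "describe <image> then transcribe <audio>")
# parts == ['describe ', '<image>', ' then transcribe ', '<audio>', '']
# Each delimiter part can later be re-checked with pattern.match(part) while iterating.
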
@@ -54,6 +86,7 @@ class BaseMultimodalProcessor(ABC):
  def __init__(self, hf_config, server_args, _processor):
  self.hf_config = hf_config
  self._processor = _processor
+ self.arch = hf_config.architectures[0]
  self.server_args = server_args
  # FIXME: not accurate, model and image specific
  self.NUM_TOKEN_PER_FRAME = 330
@@ -136,6 +169,10 @@ class BaseMultimodalProcessor(ABC):
  data, is_video, is_audio, frame_count_limit=None, discard_alpha_channel=True
  ):
  """Static method that can be pickled for multiprocessing"""
+ if isinstance(data, dict):
+ return MultimodalDataItem.from_dict(data)
+ if isinstance(data, MultimodalDataItem):
+ return data
  try:
  if is_audio:
  return load_audio(data)
@@ -175,7 +212,10 @@ class BaseMultimodalProcessor(ABC):
  image_index, audio_index = 0, 0

  for text_part in text_parts:
- if text_part == multimodal_tokens.image_token:
+ if (
+ multimodal_tokens.image_token_regex
+ and multimodal_tokens.image_token_regex.match(text_part)
+ ):
  data = image_data[image_index]
  is_video = isinstance(data, str) and data.startswith("video:")
  estimated_frames = estimated_frames_list[image_index]
@@ -192,7 +232,10 @@ class BaseMultimodalProcessor(ABC):
  )
  task_info.append((Modality.IMAGE, data, frame_count_limit))
  image_index += 1
- elif text_part == multimodal_tokens.audio_token:
+ elif (
+ multimodal_tokens.audio_token_regex
+ and multimodal_tokens.audio_token_regex.match(text_part)
+ ):
  data = audio_data[audio_index]
  futures.append(
  self.io_executor.submit(
@@ -228,17 +271,13 @@ class BaseMultimodalProcessor(ABC):
  discard_alpha_channel: if True, discards the alpha channel in the returned images

  """
-
+ if not return_text:
+ raise NotImplementedError()
  if image_data is None:
  image_data = []
- if isinstance(multimodal_tokens.image_token, int):
- multimodal_tokens.image_token = (
- self._processor.tokenizer.convert_ids_to_tokens(
- multimodal_tokens.image_token
- )
- )
- else:
- multimodal_tokens.image_token = multimodal_tokens.image_token
+
+ multimodal_tokens.convert_to_strs(self._processor)
+ multimodal_tokens_pattern = multimodal_tokens.collect()

  if isinstance(prompt, list) and return_text:
  assert len(prompt) and isinstance(prompt[0], int)
@@ -247,16 +286,8 @@ class BaseMultimodalProcessor(ABC):
  prompt = prompt

  assert isinstance(prompt, str)
- if return_text:
- import re
-
- pattern = (
- "("
- + "|".join(re.escape(sep) for sep in multimodal_tokens.collect())
- + ")"
- )
- # split text into list of normal text and special tokens
- text_parts = re.split(pattern, prompt)
+ # split text into list of normal text and special tokens
+ text_parts = re.split(multimodal_tokens_pattern, prompt)

  futures, task_info = self.submit_data_loading_tasks(
  text_parts=text_parts,
@@ -266,34 +297,88 @@ class BaseMultimodalProcessor(ABC):
  discard_alpha_channel=discard_alpha_channel,
  )
  # Process results
- image_sizes, images, audios = [], [], []
+ images, audios = [], []
  new_text = ""
  task_ptr = 0

  for text_part in text_parts:
- if text_part in multimodal_tokens.collect():
+ if multimodal_tokens_pattern.match(text_part):
  task_type, data, frame_limit = task_info[task_ptr]
  result = futures[task_ptr].result()
  task_ptr += 1

  if task_type == Modality.IMAGE:
+ # If data is already processed it will be a
+ # dictionary. In this case we want to keep the
+ # expanded tokens in text_part. Otherwise, we will
+ # call the processor code, so keep only a single image
+ # token.
+ mm_tokens = (
+ text_part
+ if isinstance(data, dict)
+ else multimodal_tokens.image_token
+ )
  frames = [result] if not isinstance(result, list) else result
  if frames:
- image_sizes += frames[0].size * len(frames)
  images += frames
- new_text += multimodal_tokens.image_token * len(frames)
+ new_text += mm_tokens * len(frames)
  elif task_type == Modality.AUDIO:
  # audio
+ mm_tokens = (
+ text_part
+ if isinstance(data, dict)
+ else multimodal_tokens.audio_token
+ )
  audios.append(result)
- new_text += multimodal_tokens.audio_token
+ new_text += mm_tokens
  # TODO: handle video
  else:
  new_text += text_part

  out = BaseMultiModalProcessorOutput(
+ input_text=new_text,
  images=images,
  audios=audios,
- input_text=new_text,
  )
  out.normalize()
  return out
+
+ @staticmethod
+ def get_mm_items_offset(
+ input_ids: torch.Tensor, mm_token_id: int
+ ) -> List[Tuple[int, int]]:
+ """
+ Get a set of range for mm_items from input_ids
+ Example:
+ input_ids = [1, 2, 3, 3, 3, 4, 3, 3]
+ mm_token_id = 3
+ return result = [(2,4),(6,7)]
+ """
+ mask = input_ids == mm_token_id
+
+ start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
+ end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
+
+ return list(zip(start_positions.tolist(), end_positions.tolist()))
+
+ @staticmethod
+ def get_mm_items_offset_by_pair(
+ input_ids: torch.Tensor, mm_start_id: int, mm_end_id: int
+ ) -> List[Tuple[int, int]]:
+ indices_start = (input_ids == mm_start_id).nonzero(as_tuple=True)[0] + 1
+ indices_end = (input_ids == mm_end_id).nonzero(as_tuple=True)[0] - 1
+
+ return list(zip(indices_start.tolist(), indices_end.tolist()))
+
+ def mm_inputs_are_preprocessed(self, mm_inputs: Optional[list]):
+ """Returns true if all images are preprocessed, false if all are not, and error otherwise."""
+ if not mm_inputs:
+ return True
+ ret = any(isinstance(mm_input, MultimodalDataItem) for mm_input in mm_inputs)
+ if ret and not all(
+ isinstance(mm_input, MultimodalDataItem) for mm_input in mm_inputs
+ ):
+ raise ValueError(
+ "Unsupported: mixture of multimodal inputs where some but not all are preprocessed."
+ )
+ return ret
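
Note: get_mm_items_offset above locates contiguous runs of a multimodal placeholder token and returns inclusive (start, end) index pairs; the per-model processors below attach these as image_offsets on each MultimodalDataItem. A small self-contained check of the mask-and-roll trick on the example from its docstring (plain PyTorch, outside any processor class):

import torch

def runs_of_token(input_ids: torch.Tensor, mm_token_id: int):
    # Boolean mask of positions that hold the placeholder token.
    mask = input_ids == mm_token_id
    # A run starts where the mask turns on and ends where it turns off.
    # torch.roll wraps around, so this assumes the sequence does not both
    # begin and end with the placeholder token (true for normal chat prompts).
    starts = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
    ends = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
    return list(zip(starts.tolist(), ends.tolist()))

print(runs_of_token(torch.tensor([1, 2, 3, 3, 3, 4, 3, 3]), mm_token_id=3))
# [(2, 4), (6, 7)]
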

sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py

@@ -70,8 +70,13 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
  batched_images_spatial_crop = torch.stack(batched_images_spatial_crop, dim=0)

  items = []
+ input_ids = res["input_ids"]
+ image_offsets = self.get_mm_items_offset(
+ input_ids=input_ids, mm_token_id=self._processor.image_token_id
+ )
  item = MultimodalDataItem(
  pixel_values=res["images"],
+ image_offsets=image_offsets,
  modality=Modality.IMAGE,
  image_emb_mask=images_seq_mask,
  image_spatial_crop=batched_images_spatial_crop,
@@ -80,6 +85,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):

  return {
  "mm_items": items,
- "input_ids": res["input_ids"].tolist(),
+ "input_ids": input_ids.tolist(),
  "im_token_id": self._processor.image_token_id,
  }

sglang/srt/managers/multimodal_processors/gemma3.py

@@ -1,4 +1,5 @@
- from typing import List, Union
+ import re
+ from typing import Dict, List, Union

  from sglang.srt.managers.multimodal_processor import (
  BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -18,13 +19,18 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):

  def __init__(self, hf_config, server_args, _processor):
  super().__init__(hf_config, server_args, _processor)
+ # The single, pre-expanded image token.
  self.IMAGE_TOKEN = "<start_of_image>"
+ # The regex that matches expanded image tokens.
+ self.IMAGE_TOKEN_REGEX = re.compile(
+ r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+ )
  self.IM_START_TOKEN_ID = hf_config.boi_token_index
  self.IM_END_TOKEN_ID = hf_config.eoi_token_index

  async def process_mm_data_async(
  self,
- image_data: List[Union[str, bytes]],
+ image_data: List[Union[str, bytes, Dict]],
  input_text,
  request_obj,
  max_req_input_len,
@@ -37,29 +43,48 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
  image_data = [image_data]

  image_token = self.IMAGE_TOKEN
+ image_token_regex = self.IMAGE_TOKEN_REGEX
  base_output = self.load_mm_data(
  prompt=input_text,
  image_data=image_data,
- multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
+ multimodal_tokens=MultimodalSpecialTokens(
+ image_token=image_token, image_token_regex=image_token_regex
+ ),
  max_req_input_len=max_req_input_len,
  discard_alpha_channel=True,
  )

+ images_are_preprocessed = self.mm_inputs_are_preprocessed(base_output.images)
  ret = self.process_mm_data(
- input_text=base_output.input_text, images=base_output.images
+ input_text=base_output.input_text,
+ images=None if images_are_preprocessed else base_output.images,
  )

  items = []
+ input_ids = ret["input_ids"].flatten()
+ image_offsets = self.get_mm_items_offset(
+ input_ids=input_ids,
+ mm_token_id=self.hf_config.image_token_index,
+ )
  for i, image in enumerate(base_output.images):
+ if images_are_preprocessed:
+ pixel_values = image.pixel_values
+ precomputed_features = image.precomputed_features
+ else:
+ pixel_values = ret["pixel_values"][i]
+ precomputed_features = None
+
  item = MultimodalDataItem(
- pixel_values=ret["pixel_values"][i],
+ pixel_values=pixel_values,
+ precomputed_features=precomputed_features,
  modality=Modality.IMAGE,
+ image_offsets=image_offsets[i],
  )
  items += [item]

  return {
  "mm_items": items,
- "input_ids": ret["input_ids"].flatten().tolist(),
+ "input_ids": input_ids.tolist(),
  "im_start_id": self.IM_START_TOKEN_ID,
  "im_end_id": self.IM_END_TOKEN_ID,
  }
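
Note: the IMAGE_TOKEN_REGEX introduced above lets the Gemma3 processor recognize both the bare placeholder and a span that was already expanded by an earlier preprocessing pass, which is what allows precomputed image inputs to keep their expanded tokens. A quick illustration with the regex copied from the diff (pure stdlib re):

import re

IMAGE_TOKEN_REGEX = re.compile(
    r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
)

# The bare, not-yet-expanded placeholder matches...
assert IMAGE_TOKEN_REGEX.fullmatch("<start_of_image>")
# ...and so does an already-expanded span of soft tokens.
expanded = "<start_of_image>" + "<image_soft_token>" * 4 + "<end_of_image>"
assert IMAGE_TOKEN_REGEX.fullmatch(expanded)
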

sglang/srt/managers/multimodal_processors/internvl.py

@@ -3,7 +3,6 @@
  import numpy as np
  import torch
  from decord import VideoReader, cpu
- from numpy.distutils.cpuinfo import cpu
  from PIL import Image

  from sglang.srt.managers.multimodal_processors.base_processor import (
@@ -210,7 +209,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
  return None

  pixel_values = torch.cat(pixel_values, dim=0)
- items = [MultimodalDataItem(pixel_values=pixel_values, modality=Modality.IMAGE)]

  for idx, num_patches in enumerate(num_patches_list):
  image_tokens = (
@@ -221,10 +219,21 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
  input_text = input_text.replace("<image>", image_tokens, 1)

  tokenizer = self._processor
+ input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].flatten()
+ image_offsets = self.get_mm_items_offset(
+ input_ids=input_ids,
+ mm_token_id=self.img_context_token_id,
+ )
+ items = [
+ MultimodalDataItem(
+ pixel_values=pixel_values,
+ modality=Modality.IMAGE,
+ image_offsets=image_offsets,
+ )
+ ]
+
  return {
- "input_ids": tokenizer(input_text, return_tensors="pt")["input_ids"]
- .flatten()
- .tolist(),
+ "input_ids": input_ids.tolist(),
  "mm_items": items,
  "im_start_id": self.img_start_token_id,
  "im_end_id": self.img_end_token_id,

sglang/srt/managers/multimodal_processors/janus_pro.py

@@ -45,15 +45,21 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
  prompt=base_out.input_text,
  images=images,
  )
+
+ input_ids = res["input_ids"].flatten()
+ image_offsets = self.get_mm_items_offset(
+ input_ids=input_ids, mm_token_id=processor.image_id
+ )
  return {
  "mm_items": [
  MultimodalDataItem(
  pixel_values=res["pixel_values"],
  image_emb_mask=res["images_emb_mask"],
+ image_offsets=image_offsets,
  modality=Modality.IMAGE,
  )
  ],
- "input_ids": res["input_ids"].flatten().tolist(),
+ "input_ids": input_ids.tolist(),
  "im_start_id": processor.image_start_id,
  "im_end_id": processor.image_end_id,
  "im_token_id": processor.image_id,

sglang/srt/managers/multimodal_processors/kimi_vl.py

@@ -1,10 +1,5 @@
- import asyncio
- import math
  from typing import List, Union

- import torch
- from PIL import Image
-
  from sglang.srt.managers.multimodal_processors.base_processor import (
  BaseMultimodalProcessor as SGLangBaseProcessor,
  )
@@ -57,13 +52,19 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
  input_text=base_output.input_text,
  images=base_output.images,
  )
+ input_ids = ret["input_ids"].flatten()
+ image_offsets = self.get_mm_items_offset(
+ input_ids=input_ids,
+ mm_token_id=self.im_token_id,
+ )
  return {
- "input_ids": ret["input_ids"].flatten().tolist(),
+ "input_ids": input_ids.tolist(),
  "mm_items": [
  MultimodalDataItem(
  pixel_values=ret["pixel_values"],
  image_grid_thws=ret["image_grid_hws"],
  modality=Modality.IMAGE,
+ image_offsets=image_offsets,
  )
  ],
  "im_token_id": self.im_token_id,

sglang/srt/managers/multimodal_processors/llava.py

@@ -2,18 +2,24 @@ import asyncio
  from typing import List, Optional, Union

  import numpy as np
+ from transformers.models.auto.processing_auto import (
+ PROCESSOR_MAPPING_NAMES as HF_MAPPING_NAMES,
+ )

+ import sglang.srt.managers.multimodal_processor as sgl_mm_processor_utils
  from sglang.srt.managers.multimodal_processors.base_processor import (
  BaseMultimodalProcessor,
  )
  from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
  from sglang.srt.mm_utils import expand2square, process_anyres_image
  from sglang.srt.models.llava import (
+ LlavaForConditionalGeneration,
  LlavaLlamaForCausalLM,
  LlavaMistralForCausalLM,
  LlavaQwenForCausalLM,
  )
  from sglang.srt.models.llavavid import LlavaVidForCausalLM
+ from sglang.srt.models.mistral import Mistral3ForConditionalGeneration
  from sglang.srt.utils import load_image, logger
  from sglang.utils import get_exception_traceback

@@ -133,6 +139,7 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
  img_data, aspect_ratio, grid_pinpoints
  )
  )
+
  res = await asyncio.gather(*res)
  for pixel_v, image_h, image_s in res:
  pixel_values.append(pixel_v)
@@ -165,3 +172,42 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
  )
  ],
  }
+
+
+ class LlavaMultimodalProcessor(BaseMultimodalProcessor):
+ """
+ This is a wrapper class used to identify the multimodal processor for Llava architectures' vision model.
+ """
+
+ models = [LlavaForConditionalGeneration, Mistral3ForConditionalGeneration]
+
+ def _get_sgl_processor_cls(self, model_type: str):
+ if hf_name := HF_MAPPING_NAMES.get(model_type):
+ sgl_mm_processor_set = sgl_mm_processor_utils.PROCESSOR_MAPPING.values()
+ sgl_processor_cls = list(
+ filter(lambda p: p.__name__ == hf_name, sgl_mm_processor_set)
+ )
+ if sgl_processor_cls:
+ return sgl_processor_cls[0]
+ raise ValueError(
+ f"Cannot find corresponding multimodal processor registered in sglang for model type `{model_type}`"
+ )
+
+ def __init__(self, hf_config, server_args, _processor):
+ assert hasattr(hf_config, "vision_config")
+ assert hasattr(hf_config, "text_config")
+ self.vision_config = hf_config.vision_config
+ self.text_config = hf_config.text_config
+ self.hf_config = hf_config
+
+ if vision_type := getattr(self.vision_config, "model_type"):
+ self.inner = self._get_sgl_processor_cls(vision_type)(
+ hf_config, server_args, _processor
+ )
+ else:
+ raise ValueError(
+ f"Required `vision_config.model_type` is not found in hf_config: `{hf_config}`"
+ )
+
+ async def process_mm_data_async(self, *args, **kwargs):
+ return await self.inner.process_mm_data_async(*args, **kwargs)
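
Note: the new LlavaMultimodalProcessor is a thin router: it resolves a concrete processor class from the vision config's model_type once, then forwards every call to it. A stripped-down sketch of that resolve-and-delegate pattern (the registry and class names here are hypothetical stand-ins, not sglang's actual PROCESSOR_MAPPING):

from typing import Dict, Type

class DelegatingProcessor:
    """Pick an inner processor from a registry at construction time and forward calls to it."""

    def __init__(self, model_type: str, registry: Dict[str, Type], *args, **kwargs):
        inner_cls = registry.get(model_type)
        if inner_cls is None:
            raise ValueError(f"no processor registered for model type `{model_type}`")
        self.inner = inner_cls(*args, **kwargs)

    async def process_mm_data_async(self, *args, **kwargs):
        # Pure delegation: the wrapper adds no behavior of its own.
        return await self.inner.process_mm_data_async(*args, **kwargs)

Resolving once in the constructor keeps the per-request path free of lookups, which matches how the diff's wrapper constructs self.inner eagerly and only awaits the inner processor afterwards.
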

sglang/srt/managers/multimodal_processors/minicpm.py

@@ -1,7 +1,6 @@
  from typing import List, Union

  import torch
- from transformers import BaseImageProcessorFast

  from sglang.srt.managers.multimodal_processors.base_processor import (
  BaseMultimodalProcessor,
@@ -21,33 +20,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
  self.image_token = "(<image>./</image>)"
  self.audio_token = "(<audio>./</audio>)"

- def process_data_task(self, input_text, images=None, audios=None):
-
- if isinstance(images, list) and len(images) == 0:
- images = None
- if isinstance(audios, list) and len(audios) == 0:
- audios = None
- processor = self._processor
- args = {}
- if isinstance(processor, BaseImageProcessorFast):
- args["device"] = "cuda"
- result = self._processor.__call__(
- text=input_text,
- images=images,
- audios=audios,
- return_tensors="pt",
- chunk_input=True,
- **args,
- )
- return {
- "input_ids": result.input_ids,
- "pixel_values": getattr(result, "pixel_values", None),
- "tgt_sizes": getattr(result, "tgt_sizes", None),
- "audio_features": getattr(result, "audio_features", None),
- "audio_feature_lens": getattr(result, "audio_feature_lens", None),
- "audio_bounds": getattr(result, "audio_bounds", None),
- }
-
  async def process_mm_data_async(
  self,
  image_data: List[Union[str, bytes]],
@@ -97,6 +69,8 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
  audio_start_id = tokenizer.audio_start_id
  audio_end_id = tokenizer.audio_end_id

+ im_start_id = tokenizer.im_start_id
+ im_end_id = tokenizer.im_end_id
  im_token_id = tokenizer.unk_id
  pixel_values = res["pixel_values"]
  tgt_sizes = res["tgt_sizes"]
@@ -132,9 +106,20 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
  pixel_values = pixel_values_flat

  items = []
+ input_ids = res["input_ids"].flatten()
+ image_offsets = self.get_mm_items_offset_by_pair(
+ input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+ )
+ slice_offsets = self.get_mm_items_offset_by_pair(
+ input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
+ )
+ image_offsets.extend(slice_offsets)
+ image_offsets = sorted(image_offsets)
+
  if len(pixel_values) != 0:
  item = MultimodalDataItem(
  pixel_values=pixel_values,
+ image_offsets=image_offsets,
  tgt_size=tgt_sizes_flat,
  modality=Modality.IMAGE,
  )
@@ -145,21 +130,30 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
  and res["audio_features"] is not None
  and len(res["audio_features"]) != 0
  ):
+ if audio_start_id is not None and audio_end_id is not None:
+ audio_offsets = self.get_mm_items_offset_by_pair(
+ input_ids=input_ids,
+ mm_start_id=audio_start_id,
+ mm_end_id=audio_end_id,
+ )
+ else:
+ audio_offsets = None
  item = MultimodalDataItem(
  audio_features=[res["audio_features"]],
  audio_feature_lens=res["audio_feature_lens"],
+ audio_offsets=audio_offsets,
  modality=Modality.AUDIO,
  )
  items += [item]

  return {
  "mm_items": items,
- "input_ids": res["input_ids"].flatten().tolist(),
+ "input_ids": input_ids.tolist(),
  "audio_start_id": audio_start_id,
  "audio_end_id": audio_end_id,
  "im_token_id": im_token_id,
- "im_start_id": tokenizer.im_start_id,
- "im_end_id": tokenizer.im_end_id,
+ "im_start_id": im_start_id,
+ "im_end_id": im_end_id,
  "slice_start_id": slice_start_id,
  "slice_end_id": slice_end_id,
  }
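
Note: MiniCPM brackets each image (and each image slice) with explicit start/end marker tokens, so the offsets above come from get_mm_items_offset_by_pair rather than from runs of a single token id: the position just after each start marker is paired with the position just before each end marker. A small sketch on made-up token ids (the ids below are placeholders, not MiniCPM's real vocabulary):

import torch

def offsets_by_pair(input_ids: torch.Tensor, start_id: int, end_id: int):
    # First content position after each start marker, last content position
    # before each end marker; zip assumes the markers are balanced and in order.
    starts = (input_ids == start_id).nonzero(as_tuple=True)[0] + 1
    ends = (input_ids == end_id).nonzero(as_tuple=True)[0] - 1
    return list(zip(starts.tolist(), ends.tolist()))

# Hypothetical ids: 7 = image start, 8 = image end, 0 = image placeholder.
ids = torch.tensor([1, 7, 0, 0, 0, 8, 2, 7, 0, 0, 8])
print(offsets_by_pair(ids, start_id=7, end_id=8))  # [(2, 4), (8, 9)]

In the diff above, the image-pair offsets and slice-pair offsets are computed separately, merged, and sorted before being attached to the image MultimodalDataItem.
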

sglang/srt/managers/multimodal_processors/mllama4.py

@@ -135,11 +135,17 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
  processor_output["im_end_id"] = self.eoi_token_index
  processor_output["im_token_id"] = self.image_token_index

+ image_offsets = self.get_mm_items_offset(
+ input_ids=torch.tensor(processor_output["input_ids"]),
+ mm_token_id=self.image_token_index,
+ )
+
  # Add metadata for image processing
  processor_output["mm_items"] = [
  MultimodalDataItem(
  pixel_values=processor_output["pixel_values"],
  modality=Modality.IMAGE,
+ image_offsets=image_offsets,
  )
  ]