sglang 0.4.9.post1__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. sglang/srt/configs/model_config.py +24 -1
  2. sglang/srt/conversation.py +21 -2
  3. sglang/srt/disaggregation/ascend/__init__.py +6 -0
  4. sglang/srt/disaggregation/ascend/conn.py +44 -0
  5. sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
  6. sglang/srt/disaggregation/mooncake/conn.py +15 -14
  7. sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
  8. sglang/srt/disaggregation/utils.py +25 -3
  9. sglang/srt/entrypoints/engine.py +1 -1
  10. sglang/srt/entrypoints/http_server.py +1 -0
  11. sglang/srt/entrypoints/openai/protocol.py +11 -0
  12. sglang/srt/entrypoints/openai/serving_chat.py +7 -0
  13. sglang/srt/function_call/function_call_parser.py +2 -0
  14. sglang/srt/function_call/kimik2_detector.py +220 -0
  15. sglang/srt/hf_transformers_utils.py +18 -0
  16. sglang/srt/jinja_template_utils.py +8 -0
  17. sglang/srt/layers/communicator.py +17 -4
  18. sglang/srt/layers/linear.py +12 -2
  19. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  20. sglang/srt/layers/moe/ep_moe/layer.py +2 -1
  21. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -2
  22. sglang/srt/layers/moe/topk.py +8 -2
  23. sglang/srt/layers/parameter.py +19 -3
  24. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  25. sglang/srt/layers/quantization/moe_wna16.py +1 -2
  26. sglang/srt/layers/quantization/w8a8_int8.py +738 -14
  27. sglang/srt/managers/io_struct.py +27 -2
  28. sglang/srt/managers/mm_utils.py +55 -94
  29. sglang/srt/managers/schedule_batch.py +16 -5
  30. sglang/srt/managers/scheduler.py +21 -1
  31. sglang/srt/managers/tokenizer_manager.py +16 -0
  32. sglang/srt/mem_cache/memory_pool.py +65 -40
  33. sglang/srt/model_executor/forward_batch_info.py +13 -1
  34. sglang/srt/model_loader/loader.py +23 -12
  35. sglang/srt/models/deepseek_janus_pro.py +1 -1
  36. sglang/srt/models/deepseek_v2.py +62 -17
  37. sglang/srt/models/deepseek_vl2.py +1 -1
  38. sglang/srt/models/gemma3_mm.py +1 -1
  39. sglang/srt/models/gemma3n_mm.py +6 -3
  40. sglang/srt/models/internvl.py +8 -2
  41. sglang/srt/models/kimi_vl.py +8 -2
  42. sglang/srt/models/llama.py +2 -0
  43. sglang/srt/models/llava.py +3 -1
  44. sglang/srt/models/llavavid.py +1 -1
  45. sglang/srt/models/minicpmo.py +1 -2
  46. sglang/srt/models/minicpmv.py +1 -1
  47. sglang/srt/models/mixtral_quant.py +4 -0
  48. sglang/srt/models/mllama4.py +13 -4
  49. sglang/srt/models/phi4mm.py +8 -2
  50. sglang/srt/models/phimoe.py +553 -0
  51. sglang/srt/models/qwen2.py +2 -0
  52. sglang/srt/models/qwen2_5_vl.py +10 -7
  53. sglang/srt/models/qwen2_vl.py +12 -1
  54. sglang/srt/models/vila.py +8 -2
  55. sglang/srt/multimodal/processors/base_processor.py +197 -137
  56. sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
  57. sglang/srt/multimodal/processors/gemma3.py +4 -2
  58. sglang/srt/multimodal/processors/gemma3n.py +1 -1
  59. sglang/srt/multimodal/processors/internvl.py +1 -1
  60. sglang/srt/multimodal/processors/janus_pro.py +1 -1
  61. sglang/srt/multimodal/processors/kimi_vl.py +1 -1
  62. sglang/srt/multimodal/processors/minicpm.py +4 -3
  63. sglang/srt/multimodal/processors/mllama4.py +1 -1
  64. sglang/srt/multimodal/processors/phi4mm.py +1 -1
  65. sglang/srt/multimodal/processors/pixtral.py +1 -1
  66. sglang/srt/multimodal/processors/qwen_vl.py +203 -80
  67. sglang/srt/multimodal/processors/vila.py +1 -1
  68. sglang/srt/server_args.py +11 -4
  69. sglang/srt/utils.py +154 -31
  70. sglang/version.py +1 -1
  71. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +4 -3
  72. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +75 -70
  73. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
  74. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
  75. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0

sglang/srt/multimodal/processors/base_processor.py

@@ -5,7 +5,7 @@ import multiprocessing as mp
 import os
 import re
 from abc import ABC, abstractmethod
-from enum import Enum
+from functools import lru_cache
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
@@ -14,7 +14,7 @@ from PIL import Image
 from transformers import BaseImageProcessorFast
 
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.utils import encode_video, load_audio, load_image
+from sglang.srt.utils import load_audio, load_image, load_video, logger
 
 
 @dataclasses.dataclass
@@ -25,14 +25,22 @@ class BaseMultiModalProcessorOutput:
     # frames loaded from image and video, in given order
     images: Optional[list[Union[Image.Image, dict]]] = None
 
+    # videos
+    videos: Optional[list[Union[torch.Tensor, dict]]] = None
+
     # audios
     audios: Optional[list[Union[np.ndarray, dict]]] = None
 
-    def normalize(self):
-        for field_name in ["images", "audios"]:
-            field = getattr(self, field_name, None)
-            if field is not None and isinstance(field, list) and len(field) == 0:
-                setattr(self, field_name, None)
+    def organize_results(self) -> List[Tuple[Modality, Any]]:
+        """
+
+        :return: a list of results, with their corresponding modalities
+        """
+        return (
+            [(Modality.IMAGE, data) for data in self.images]
+            + [(Modality.VIDEO, data) for data in self.videos]
+            + [(Modality.AUDIO, data) for data in self.audios]
+        )
 
 
 @dataclasses.dataclass
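
Illustrative sketch (editorial, not part of the diff): organize_results() replaces the old normalize() pass with a flat, modality-tagged list, so downstream code no longer infers modality from each item's Python type. Assuming the dataclass above is in scope:

    from PIL import Image
    import numpy as np

    out = BaseMultiModalProcessorOutput(
        input_text="<image><audio>",
        images=[Image.new("RGB", (8, 8))],
        videos=[],
        audios=[np.zeros(16000)],
    )
    # -> [(Modality.IMAGE, <PIL image>), (Modality.AUDIO, <ndarray>)]
    print(out.organize_results())
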
@@ -41,6 +49,10 @@ class MultimodalSpecialTokens:
     video_token: Optional[Union[int, str, List[str]]] = None
     audio_token: Optional[Union[int, str, List[str]]] = None
 
+    image_token_regex: Optional[re.Pattern] = None
+    video_token_regex: Optional[re.Pattern] = None
+    audio_token_regex: Optional[re.Pattern] = None
+
     def convert_to_str(self, token: Union[str, int], processor) -> str:
         if token is None:
             return token
@@ -53,11 +65,29 @@ class MultimodalSpecialTokens:
         self.video_token = self.convert_to_str(self.video_token, processor)
         self.audio_token = self.convert_to_str(self.audio_token, processor)
 
-    image_token_regex: Optional[re.Pattern] = None
-    video_token_regex: Optional[re.Pattern] = None
-    audio_token_regex: Optional[re.Pattern] = None
-
-    def __post_init__(self):
+    def get_modality_of_token(self, token) -> Optional[Modality]:
+        """
+        :return: the modality associated with the given token, if the token is a special_token or matches with the multimodal token regex
+        """
+        modality = {
+            self.image_token: Modality.IMAGE,
+            self.video_token: Modality.VIDEO,
+            self.audio_token: Modality.AUDIO,
+        }.get(token)
+        if modality:
+            return modality
+
+        for regex, modality in [
+            (self.image_token_regex, Modality.IMAGE),
+            (self.video_token_regex, Modality.VIDEO),
+            (self.audio_token_regex, Modality.AUDIO),
+        ]:
+            if regex and regex.match(token):
+                return modality
+
+        return None
+
+    def parse_regex(self):
         if self.image_token_regex is None and self.image_token is not None:
             self.image_token_regex = re.compile(re.escape(self.image_token))
         if self.video_token_regex is None and self.video_token is not None:
@@ -65,7 +95,7 @@ class MultimodalSpecialTokens:
         if self.audio_token_regex is None and self.audio_token is not None:
             self.audio_token_regex = re.compile(re.escape(self.audio_token))
 
-    def collect(self) -> re.Pattern:
+    def combine_regex(self) -> re.Pattern:
         tokens = [
             self.image_token_regex,
             self.video_token_regex,
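
Illustrative sketch (editorial, not part of the diff) of how the renamed helpers compose; the token strings are invented, and combine_regex() is assumed to wrap the union in a capturing group, which the re.split() call in load_mm_data relies on to keep the matched tokens:

    import re

    tokens = MultimodalSpecialTokens(image_token="<image>", audio_token="<audio>")
    tokens.parse_regex()               # compile per-modality patterns
    pattern = tokens.combine_regex()   # one alternation over all modalities

    parts = re.split(pattern, "describe <image> then transcribe <audio>")
    # ['describe ', '<image>', ' then transcribe ', '<audio>', '']
    for part in parts:
        print(repr(part), tokens.get_modality_of_token(part))
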
@@ -105,6 +135,7 @@ class BaseMultimodalProcessor(ABC):
         self.ATTR_NAME_TO_MODALITY = {
             # Image-related attributes
             "pixel_values": Modality.IMAGE,
+            "pixel_values_videos": Modality.VIDEO,
             "image_sizes": Modality.IMAGE,
             "image_grid_thw": Modality.IMAGE,
             "image_emb_mask": Modality.IMAGE,
@@ -120,7 +151,7 @@ class BaseMultimodalProcessor(ABC):
             "input_features": Modality.AUDIO,
             "input_features_mask": Modality.AUDIO,
             # Video-related attributes
-            "video_grid_thws": Modality.VIDEO,
+            "video_grid_thw": Modality.VIDEO,
             # Generic attributes that could apply to multiple modalities
             # "precomputed_features" - handled specially as it can be any modality
         }
@@ -196,20 +227,25 @@ class BaseMultimodalProcessor(ABC):
 
     @staticmethod
     def _load_single_item(
-        data, is_video, is_audio, frame_count_limit=None, discard_alpha_channel=True
+        data, modality: Modality, frame_count_limit=None, discard_alpha_channel=True
     ):
-        """Static method that can be pickled for multiprocessing"""
+        """
+        Load a single multimodal data.
+
+        If data is precomputed, returns directly.
+
+        Static method that can be pickled for multiprocessing"""
         if isinstance(data, dict):
             return data
         try:
-            if is_audio:
-                return load_audio(data)
-            elif is_video:
-                path = data[len("video:") :]
-                return encode_video(path, frame_count_limit)
-            else:
+            if modality == Modality.IMAGE:
                 img, _ = load_image(data)
                 return img.convert("RGB") if discard_alpha_channel else img
+            elif modality == Modality.VIDEO:
+                return load_video(data, frame_count_limit)
+            elif modality == Modality.AUDIO:
+                return load_audio(data)
+
         except Exception as e:
             raise RuntimeError(f"Error while loading data {data}: {e}")
 
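
The two boolean flags collapse into a single Modality argument that selects the loader branch explicitly. Hypothetical calls (editorial; file names invented, and precomputed dict inputs still short-circuit and return unchanged):

    audio = BaseMultimodalProcessor._load_single_item("speech.wav", Modality.AUDIO)
    frames = BaseMultimodalProcessor._load_single_item(
        "clip.mp4", Modality.VIDEO, frame_count_limit=30
    )
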
@@ -217,75 +253,78 @@ class BaseMultimodalProcessor(ABC):
         self,
         text_parts: List[str],
         multimodal_tokens: MultimodalSpecialTokens,
-        image_data: Optional[list] = None,
-        audio_data: Optional[list] = None,
+        data_iterators: dict,
         discard_alpha_channel: bool = True,
-    ):
+        image_estimated_frames_iter: Optional[iter] = None,
+        image_scaling_factor: float = 1.0,
+        max_image_frames: int = 30,
+    ) -> Tuple[List, List]:
         """
-        load multimodal data parallelly
+        load multimodal data parallelly using iterators.
         """
-
-        # TODO(mick): load from server_args, env, or sampling_params
-        MAX_NUM_FRAMES = 30
-        estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
-        total_frame_count = sum(estimated_frames_list)
-        # a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
-        # e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
-        scaling_factor = min(1.0, MAX_NUM_FRAMES / max(1, total_frame_count))
-
-        assert len(image_data) == len(estimated_frames_list)
-        # Submit all tasks
         futures = []
         task_info = []
-        image_index, audio_index = 0, 0
 
         for text_part in text_parts:
-            if (
-                multimodal_tokens.image_token_regex
-                and multimodal_tokens.image_token_regex.match(text_part)
-            ):
-                data = image_data[image_index]
-                is_video = isinstance(data, str) and data.startswith("video:")
-                estimated_frames = estimated_frames_list[image_index]
-                frame_count_limit = max(1, int(estimated_frames * scaling_factor))
+            modality = multimodal_tokens.get_modality_of_token(text_part)
+            if modality is not None:
+                data_iterator = data_iterators.get(modality)
+                if data_iterator is None:
+                    raise ValueError(f"No data iterator found for token: {text_part}")
+
+                try:
+                    data = next(data_iterator)
+                except StopIteration:
+                    raise ValueError(
+                        f"Mismatch: More '{text_part}' tokens found than corresponding data items provided."
+                    )
+
+                frame_count_limit = None
+                if modality == Modality.IMAGE and image_estimated_frames_iter:
+                    try:
+                        estimated_frames = next(image_estimated_frames_iter)
+                        # Use the pre-calculated scaling factor and max frames
+                        frame_count_limit = max(
+                            1, int(estimated_frames * image_scaling_factor)
+                        )
+                        # Ensure we don't exceed the absolute max (redundant if scaling_factor handles it)
+                        # frame_count_limit = min(frame_count_limit, max_image_frames)
+                    except StopIteration:
+                        raise ValueError(
+                            "Mismatch between image tokens and estimated frame counts."
+                        )
+
                 futures.append(
                     self.io_executor.submit(
                         BaseMultimodalProcessor._load_single_item,
                         data,
-                        is_video,
-                        False,
+                        modality,
                         frame_count_limit,
                         discard_alpha_channel,
                     )
                 )
-                task_info.append((Modality.IMAGE, data, frame_count_limit))
-                image_index += 1
-            elif (
-                multimodal_tokens.audio_token_regex
-                and multimodal_tokens.audio_token_regex.match(text_part)
-            ):
-                data = audio_data[audio_index]
-                futures.append(
-                    self.io_executor.submit(
-                        BaseMultimodalProcessor._load_single_item,
-                        data,
-                        False,
-                        True,
-                        None,
-                        discard_alpha_channel,
-                    )
-                )
-                task_info.append((Modality.AUDIO, data, None))
-                audio_index += 1
+                task_info.append((modality, data, frame_count_limit))
+
+        for modality, iterator in data_iterators.items():
+            try:
+                next(iterator)
+                logger.warning(
+                    f"Warning: More {modality.name.lower()} data items provided than corresponding tokens found in the prompt."
+                )
+            except StopIteration:
+                pass
+            except Exception:
+                pass
 
         return futures, task_info
 
     def load_mm_data(
         self,
-        prompt: str | List[int],
+        prompt: str,
         multimodal_tokens: MultimodalSpecialTokens,
         max_req_input_len: int,
         image_data: Optional[list] = None,
+        video_data: Optional[list] = None,
         audio_data: Optional[list] = None,
         return_text: Optional[bool] = True,
         discard_alpha_channel: bool = True,
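
The per-modality index counters become iterators, which makes count mismatches explicit: a surplus token raises, surplus data only logs a warning. A standalone sketch of that handshake (editorial; token strings stand in for the Modality keys):

    from typing import Dict, Iterator, List

    def match_tokens_to_data(
        text_parts: List[str], data_iterators: Dict[str, Iterator]
    ) -> List[tuple]:
        matched = []
        for part in text_parts:
            it = data_iterators.get(part)
            if it is None:
                continue  # plain text between multimodal tokens
            try:
                matched.append((part, next(it)))
            except StopIteration:
                raise ValueError(f"more '{part}' tokens than data items")
        # leftover data items mean fewer tokens than data: warn only
        for token, it in data_iterators.items():
            if next(it, None) is not None:
                print(f"warning: extra data for {token} ignored")
        return matched

    print(match_tokens_to_data(["hi ", "<img>", ""], {"<img>": iter(["a.jpg"])}))
    # -> [('<img>', 'a.jpg')]
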
@@ -299,14 +338,9 @@ class BaseMultimodalProcessor(ABC):
         discard_alpha_channel: if True, discards the alpha channel in the returned images
 
         """
-        if not return_text:
-            raise NotImplementedError()
-        if image_data is None:
-            image_data = []
-
         multimodal_tokens.convert_to_strs(self._processor)
-        multimodal_tokens_pattern = multimodal_tokens.collect()
-
+        multimodal_tokens.parse_regex()
+        multimodal_tokens_pattern = multimodal_tokens.combine_regex()
         if isinstance(prompt, list) and return_text:
             assert len(prompt) and isinstance(prompt[0], int)
             prompt = self._processor.tokenizer.decode(prompt)
@@ -317,59 +351,84 @@ class BaseMultimodalProcessor(ABC):
         # split text into list of normal text and special tokens
         text_parts = re.split(multimodal_tokens_pattern, prompt)
 
+        # collect all data
+        data_iterators = {}
+        if multimodal_tokens.image_token and image_data:
+            data_iterators[Modality.IMAGE] = iter(image_data)
+        if multimodal_tokens.video_token and video_data:
+            data_iterators[Modality.VIDEO] = iter(video_data)
+        if multimodal_tokens.audio_token and audio_data:
+            data_iterators[Modality.AUDIO] = iter(audio_data)
+
+        # futures: the futures of loaded data
+        # task_info: modality, raw_data, and other metadata of each data
         futures, task_info = self.submit_data_loading_tasks(
             text_parts=text_parts,
             multimodal_tokens=multimodal_tokens,
-            image_data=image_data,
-            audio_data=audio_data,
+            data_iterators=data_iterators,
             discard_alpha_channel=discard_alpha_channel,
         )
-        # Process results
-        images, audios = [], []
-        new_text = ""
-        task_ptr = 0
+        task_info_iter = iter(task_info)
+        futures_iter = iter(futures)
 
+        # Process results
+        images, videos, audios = [], [], []
+        new_text_parts = []
         for text_part in text_parts:
-            if multimodal_tokens_pattern.match(text_part):
-                task_type, data, frame_limit = task_info[task_ptr]
-                result = futures[task_ptr].result()
-                task_ptr += 1
-
-                if task_type == Modality.IMAGE:
-                    # If data is already processed it will be a
-                    # dictionary. In this case we want to keep the
-                    # expanded tokens in text_part. Otherwise, we will
-                    # call the processor code, so keep only a single image
-                    # token.
-                    mm_tokens = (
-                        text_part
-                        if isinstance(data, dict)
-                        else multimodal_tokens.image_token
-                    )
-                    frames = [result] if not isinstance(result, list) else result
-                    if frames:
-                        images += frames
-                        new_text += mm_tokens * len(frames)
-                elif task_type == Modality.AUDIO:
-                    # audio
-                    mm_tokens = (
-                        text_part
-                        if isinstance(data, dict)
-                        else multimodal_tokens.audio_token
-                    )
-                    audios.append(result)
-                    new_text += mm_tokens
-                # TODO: handle video
-            else:
-                new_text += text_part
-
-        out = BaseMultiModalProcessorOutput(
-            input_text=new_text,
+            try:
+                if multimodal_tokens_pattern.match(text_part):
+                    modality, raw_data, frame_limit = next(task_info_iter)
+                    is_precomputed = isinstance(raw_data, dict)
+                    result = next(futures_iter).result()
+
+                    if modality == Modality.IMAGE:
+                        # If data is already processed it will be a
+                        # dictionary(precomputed). In this case we want to keep the
+                        # expanded tokens in text_part. Otherwise, we will
+                        # call the processor code, so keep only a single image
+                        # token.
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.image_token
+                        )
+                        frames = [result] if not isinstance(result, list) else result
+                        if frames:
+                            # only for minicpmv
+                            images += frames
+                            new_text_parts += mm_tokens * len(frames)
+                    elif modality == Modality.VIDEO:
+                        # load as video
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.video_token
+                        )
+                        videos += [result]
+                        new_text_parts += mm_tokens
+                    elif modality == Modality.AUDIO:
+                        # audio
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.audio_token
+                        )
+                        audios += [result]
+                        new_text_parts += mm_tokens
+                else:
+                    # normal text
+                    new_text_parts += [text_part]
+            except Exception as e:
+                raise RuntimeError(
+                    f"An exception occurred while loading multimodal data: {e}"
+                )
+        return BaseMultiModalProcessorOutput(
             images=images,
             audios=audios,
+            videos=videos,
+            input_text="".join(new_text_parts),
         )
-        out.normalize()
-        return out
 
     @staticmethod
     def get_mm_items_offset(
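
Caller-side effect of the rework (editorial sketch; the processor instance, tokens, and paths are invented): video inputs now travel through their own video_data argument and come back in a populated videos field instead of falling into the old "# TODO: handle video" branch:

    base_output = processor.load_mm_data(
        prompt="compare <image> with <video>",
        multimodal_tokens=MultimodalSpecialTokens(
            image_token="<image>", video_token="<video>"
        ),
        max_req_input_len=4096,
        image_data=["photo.jpg"],
        video_data=["clip.mp4"],
    )
    # base_output.images -> [PIL image], base_output.videos -> [loaded video]
    # base_output.input_text is rebuilt via "".join(new_text_parts)
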
@@ -460,21 +519,19 @@ class BaseMultimodalProcessor(ABC):
                 )
             except ValueError:
                 modality = Modality.IMAGE
-
             if modality:
                 # Create item if needed
                 if modality not in items:
                     items[modality] = MultimodalDataItem(modality=modality)
 
                 # Set attribute
-                if hasattr(items[modality], attr_name):
-                    setattr(items[modality], attr_name, value)
+                setattr(items[modality], attr_name, value)
 
         return list(items.values())
 
     def _process_and_collect_mm_items(
         self, input_text: str, images=None, audios=None, videos=None, **kwargs
-    ) -> Tuple[List[MultimodalDataItem], torch.Tensor]:
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor, dict]:
         """
         Helper method to process multimodal data and create mm_items in one step.
 
@@ -488,11 +545,11 @@ class BaseMultimodalProcessor(ABC):
         input_ids = ret["input_ids"].flatten()
         collected_items = self.collect_mm_items_from_processor_output(ret)
 
-        return collected_items, input_ids
+        return collected_items, input_ids, ret
 
     def process_and_combine_mm_data(
         self, base_output: BaseMultiModalProcessorOutput
-    ) -> Tuple[List[MultimodalDataItem], torch.Tensor]:
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor, dict]:
         """
         Process multimodal data and return the combined multimodal items and input_ids.
         Supports mixed modalities (images and audio in the same request).
@@ -501,8 +558,7 @@ class BaseMultimodalProcessor(ABC):
            Tuple of (list of mm_items, input_ids)
         """
         # Collect all items and categorize them
-        all_items = (base_output.images or []) + (base_output.audios or [])
-
+        all_items = base_output.organize_results()
         # Handle text-only case
         if not all_items:
             input_ids = self._processor.tokenizer(
@@ -510,19 +566,20 @@ class BaseMultimodalProcessor(ABC):
                 return_tensors="pt",
                 add_special_tokens=True,
             ).input_ids.flatten()
-            return [], input_ids
+            return [], input_ids, {}
 
-        dict_items, raw_images, raw_audios = [], [], []
-        for item in all_items:
+        dict_items, raw_images, raw_audios, raw_videos = [], [], [], []
+        for modality, item in all_items:
             if isinstance(item, dict):
                 dict_items.append(item)
-            elif isinstance(item, Image.Image):
+            elif modality == Modality.IMAGE:
                 raw_images.append(item)
-            elif isinstance(item, np.ndarray):
+            elif modality == Modality.AUDIO:
                 raw_audios.append(item)
+            elif modality == Modality.VIDEO:
+                raw_videos.append(item)
             else:
                 raise ValueError(f"Unknown multimodal item type: {type(item)}")
-
         # Process items and get input_ids
         all_collected_items = []
         input_ids = None
@@ -534,13 +591,16 @@ class BaseMultimodalProcessor(ABC):
             )
 
         # Handle raw items (need processing)
-        if raw_images or raw_audios:
-            collected_items, input_ids = self._process_and_collect_mm_items(
+        if raw_images or raw_audios or raw_videos:
+            collected_items, input_ids, ret = self._process_and_collect_mm_items(
                 input_text=base_output.input_text,
                 images=raw_images,
                 audios=raw_audios,
+                videos=raw_videos,
             )
             all_collected_items.extend(collected_items)
+        else:
+            ret = None
 
         # Fallback tokenization if no raw items were processed
         if input_ids is None:
@@ -553,21 +613,21 @@ class BaseMultimodalProcessor(ABC):
         # Add offsets to all items
         for mm_item in all_collected_items:
             if mm_item.modality in [Modality.IMAGE, Modality.MULTI_IMAGES]:
-                mm_item.image_offsets = self.get_mm_items_offset(
+                mm_item.offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.IM_TOKEN_ID,
                 )
             elif mm_item.modality == Modality.AUDIO:
-                mm_item.audio_offsets = self.get_mm_items_offset(
+                mm_item.offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.AUDIO_TOKEN_ID,
                 )
             elif mm_item.modality == Modality.VIDEO:
-                mm_item.video_offsets = self.get_mm_items_offset(
+                mm_item.offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.VIDEO_TOKEN_ID,
                 )
             else:
                 raise ValueError(f"Unknown modality: {mm_item.modality}")
 
-        return all_collected_items, input_ids
+        return all_collected_items, input_ids, ret
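
The offset hunks above, and every per-model hunk below, follow from one schema change: MultimodalDataItem's image_offsets / audio_offsets / video_offsets fields merge into a single offsets field, with the modality recorded separately. A sketch (editorial; the tensor and the (start, end) span format are assumptions):

    import torch

    image_features = torch.zeros(1, 3, 224, 224)  # stand-in processor output
    image_offsets = [(5, 261)]                    # assumed (start, end) spans
    item = MultimodalDataItem(
        pixel_values=image_features,
        modality=Modality.IMAGE,
        offsets=image_offsets,  # the value image_offsets used to carry
    )
    # Consumers branch on item.modality rather than on which offsets
    # attribute happens to be set.
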

sglang/srt/multimodal/processors/deepseek_vl_v2.py

@@ -69,7 +69,7 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         )
         item = MultimodalDataItem(
             pixel_values=res["images"],
-            image_offsets=image_offsets,
+            offsets=image_offsets,
             modality=Modality.IMAGE,
             image_emb_mask=images_seq_mask,
             image_spatial_crop=batched_images_spatial_crop,

sglang/srt/multimodal/processors/gemma3.py

@@ -36,6 +36,7 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
+        print(f"{image_data=}")
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -46,8 +47,9 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
             discard_alpha_channel=True,
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
-
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
+        print(f"{base_output=}")
+        print(f"{mm_items=}")
         return {
             "input_ids": input_ids.tolist(),
             "mm_items": mm_items,

sglang/srt/multimodal/processors/gemma3n.py

@@ -72,7 +72,7 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
             ),
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),

sglang/srt/multimodal/processors/internvl.py

@@ -225,7 +225,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             MultimodalDataItem(
                 pixel_values=pixel_values,
                 modality=Modality.IMAGE,
-                image_offsets=image_offsets,
+                offsets=image_offsets,
             )
         ]
 

sglang/srt/multimodal/processors/janus_pro.py

@@ -49,7 +49,7 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
             MultimodalDataItem(
                 pixel_values=res["pixel_values"],
                 image_emb_mask=res["images_emb_mask"],
-                image_offsets=image_offsets,
+                offsets=image_offsets,
                 modality=Modality.IMAGE,
             )
         ],

sglang/srt/multimodal/processors/kimi_vl.py

@@ -39,7 +39,7 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
             max_req_input_len=max_req_input_len,
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),

sglang/srt/multimodal/processors/minicpm.py

@@ -19,6 +19,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         super().__init__(hf_config, server_args, _processor)
         self.image_token = "(<image>./</image>)"
         self.audio_token = "(<audio>./</audio>)"
+        self.video_token = "(<video>./</video>)"
 
     async def process_mm_data_async(
         self,
@@ -36,6 +37,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(
                 image_token=self.image_token,
+                video_token=self.video_token,
                 audio_token=self.audio_token,
             ),
         )
@@ -113,7 +115,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         if len(pixel_values) != 0:
             item = MultimodalDataItem(
                 pixel_values=pixel_values,
-                image_offsets=image_offsets,
+                offsets=image_offsets,
                 tgt_size=tgt_sizes_flat,
                 modality=Modality.IMAGE,
             )
@@ -135,11 +137,10 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             item = MultimodalDataItem(
                 audio_features=[res["audio_features"]],
                 audio_feature_lens=res["audio_feature_lens"],
-                audio_offsets=audio_offsets,
+                offsets=audio_offsets,
                 modality=Modality.AUDIO,
             )
             items += [item]
-
         return {
             "mm_items": items,
             "input_ids": input_ids.tolist(),

sglang/srt/multimodal/processors/mllama4.py

@@ -144,7 +144,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
             MultimodalDataItem(
                 pixel_values=processor_output["pixel_values"],
                 modality=Modality.IMAGE,
-                image_offsets=image_offsets,
+                offsets=image_offsets,
             )
         ]
 

sglang/srt/multimodal/processors/phi4mm.py

@@ -65,7 +65,7 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
                 pixel_values=res["input_image_embeds"],
                 image_sizes=res["image_sizes"],
                 image_emb_mask=res["image_attention_mask"],
-                image_offsets=image_offsets,
+                offsets=image_offsets,
                 modality=Modality.IMAGE,
             )
         ]