sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. sglang/bench_one_batch_server.py +17 -2
  2. sglang/bench_serving.py +170 -24
  3. sglang/srt/configs/internvl.py +4 -2
  4. sglang/srt/configs/janus_pro.py +1 -1
  5. sglang/srt/configs/model_config.py +60 -1
  6. sglang/srt/configs/update_config.py +119 -0
  7. sglang/srt/conversation.py +69 -1
  8. sglang/srt/disaggregation/decode.py +21 -5
  9. sglang/srt/disaggregation/mooncake/conn.py +35 -4
  10. sglang/srt/disaggregation/nixl/conn.py +6 -6
  11. sglang/srt/disaggregation/prefill.py +2 -2
  12. sglang/srt/disaggregation/utils.py +1 -1
  13. sglang/srt/distributed/parallel_state.py +44 -17
  14. sglang/srt/entrypoints/EngineBase.py +8 -0
  15. sglang/srt/entrypoints/engine.py +40 -6
  16. sglang/srt/entrypoints/http_server.py +111 -24
  17. sglang/srt/entrypoints/http_server_engine.py +1 -1
  18. sglang/srt/entrypoints/openai/protocol.py +4 -2
  19. sglang/srt/eplb/__init__.py +0 -0
  20. sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
  21. sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
  22. sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
  23. sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
  24. sglang/srt/{managers → eplb}/expert_location.py +1 -1
  25. sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
  26. sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
  27. sglang/srt/hf_transformers_utils.py +2 -1
  28. sglang/srt/layers/activation.py +2 -2
  29. sglang/srt/layers/amx_utils.py +86 -0
  30. sglang/srt/layers/attention/ascend_backend.py +219 -0
  31. sglang/srt/layers/attention/flashattention_backend.py +32 -9
  32. sglang/srt/layers/attention/tbo_backend.py +37 -9
  33. sglang/srt/layers/communicator.py +20 -2
  34. sglang/srt/layers/dp_attention.py +9 -3
  35. sglang/srt/layers/elementwise.py +76 -12
  36. sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  37. sglang/srt/layers/layernorm.py +26 -0
  38. sglang/srt/layers/linear.py +84 -14
  39. sglang/srt/layers/logits_processor.py +4 -4
  40. sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
  41. sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
  42. sglang/srt/layers/moe/ep_moe/layer.py +176 -15
  43. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
  44. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
  45. sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
  46. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
  47. sglang/srt/layers/moe/router.py +60 -22
  48. sglang/srt/layers/moe/topk.py +10 -28
  49. sglang/srt/layers/parameter.py +67 -7
  50. sglang/srt/layers/quantization/__init__.py +2 -0
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  52. sglang/srt/layers/quantization/fp8.py +72 -7
  53. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  54. sglang/srt/layers/quantization/fp8_utils.py +1 -2
  55. sglang/srt/layers/quantization/gptq.py +5 -1
  56. sglang/srt/layers/quantization/modelopt_quant.py +244 -1
  57. sglang/srt/layers/quantization/moe_wna16.py +1 -1
  58. sglang/srt/layers/quantization/quant_utils.py +166 -0
  59. sglang/srt/layers/quantization/w4afp8.py +264 -0
  60. sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  61. sglang/srt/layers/rotary_embedding.py +2 -2
  62. sglang/srt/layers/vocab_parallel_embedding.py +20 -10
  63. sglang/srt/lora/lora.py +4 -5
  64. sglang/srt/lora/lora_manager.py +73 -20
  65. sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
  66. sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
  67. sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
  68. sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
  69. sglang/srt/managers/cache_controller.py +41 -195
  70. sglang/srt/managers/configure_logging.py +1 -1
  71. sglang/srt/managers/io_struct.py +58 -14
  72. sglang/srt/managers/mm_utils.py +77 -61
  73. sglang/srt/managers/multimodal_processor.py +2 -6
  74. sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  75. sglang/srt/managers/schedule_batch.py +78 -85
  76. sglang/srt/managers/scheduler.py +130 -64
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  78. sglang/srt/managers/session_controller.py +12 -3
  79. sglang/srt/managers/tokenizer_manager.py +314 -103
  80. sglang/srt/managers/tp_worker.py +13 -1
  81. sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  82. sglang/srt/mem_cache/allocator.py +290 -0
  83. sglang/srt/mem_cache/chunk_cache.py +34 -2
  84. sglang/srt/mem_cache/hiradix_cache.py +2 -0
  85. sglang/srt/mem_cache/memory_pool.py +402 -66
  86. sglang/srt/mem_cache/memory_pool_host.py +6 -109
  87. sglang/srt/mem_cache/multimodal_cache.py +3 -0
  88. sglang/srt/mem_cache/radix_cache.py +8 -4
  89. sglang/srt/model_executor/cuda_graph_runner.py +2 -1
  90. sglang/srt/model_executor/forward_batch_info.py +17 -4
  91. sglang/srt/model_executor/model_runner.py +297 -56
  92. sglang/srt/model_loader/loader.py +41 -0
  93. sglang/srt/model_loader/weight_utils.py +72 -4
  94. sglang/srt/models/deepseek_nextn.py +1 -3
  95. sglang/srt/models/deepseek_v2.py +195 -45
  96. sglang/srt/models/deepseek_vl2.py +3 -5
  97. sglang/srt/models/gemma3_causal.py +1 -2
  98. sglang/srt/models/gemma3n_causal.py +4 -3
  99. sglang/srt/models/gemma3n_mm.py +4 -20
  100. sglang/srt/models/hunyuan.py +1 -1
  101. sglang/srt/models/kimi_vl.py +1 -2
  102. sglang/srt/models/llama.py +10 -4
  103. sglang/srt/models/llama4.py +32 -45
  104. sglang/srt/models/llama_eagle3.py +61 -11
  105. sglang/srt/models/llava.py +5 -5
  106. sglang/srt/models/minicpmo.py +2 -2
  107. sglang/srt/models/mistral.py +1 -1
  108. sglang/srt/models/mllama4.py +402 -89
  109. sglang/srt/models/phi4mm.py +1 -3
  110. sglang/srt/models/pixtral.py +3 -7
  111. sglang/srt/models/qwen2.py +31 -3
  112. sglang/srt/models/qwen2_5_vl.py +1 -3
  113. sglang/srt/models/qwen2_audio.py +200 -0
  114. sglang/srt/models/qwen2_moe.py +32 -6
  115. sglang/srt/models/qwen2_vl.py +1 -4
  116. sglang/srt/models/qwen3.py +94 -25
  117. sglang/srt/models/qwen3_moe.py +68 -21
  118. sglang/srt/models/vila.py +3 -8
  119. sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
  120. sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
  121. sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
  122. sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
  123. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
  124. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
  125. sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
  126. sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
  127. sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
  128. sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
  129. sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
  130. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
  131. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
  132. sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
  133. sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
  134. sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
  135. sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
  136. sglang/srt/operations_strategy.py +6 -2
  137. sglang/srt/reasoning_parser.py +26 -0
  138. sglang/srt/sampling/sampling_batch_info.py +39 -1
  139. sglang/srt/server_args.py +84 -22
  140. sglang/srt/speculative/build_eagle_tree.py +57 -18
  141. sglang/srt/speculative/eagle_worker.py +6 -4
  142. sglang/srt/two_batch_overlap.py +203 -27
  143. sglang/srt/utils.py +343 -163
  144. sglang/srt/warmup.py +12 -3
  145. sglang/test/runners.py +10 -1
  146. sglang/test/test_cutlass_w4a8_moe.py +281 -0
  147. sglang/test/test_utils.py +15 -3
  148. sglang/utils.py +5 -5
  149. sglang/version.py +1 -1
  150. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
  151. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
  152. sglang/math_utils.py +0 -8
  153. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
  154. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  155. /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
  156. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
  157. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
  158. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0

sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py
@@ -17,15 +17,6 @@ from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.utils import encode_video, load_audio, load_image
 
 
-class MultimodalInputFormat(Enum):
-    """Enum for different multimodal input formats."""
-
-    RAW_IMAGES = "raw_images"
-    PRECOMPUTED_FEATURES = "precomputed_features"
-    PIXEL_VALUES = "pixel_values"
-    AUDIO = "audio"
-
-
 @dataclasses.dataclass
 class BaseMultiModalProcessorOutput:
     # input_text, with each frame of video/image represented with a image_token
@@ -98,6 +89,7 @@ class BaseMultimodalProcessor(ABC):
         self._processor = _processor
         self.arch = hf_config.architectures[0]
         self.server_args = server_args
+
         # FIXME: not accurate, model and image specific
         self.NUM_TOKEN_PER_FRAME = 330
 
@@ -109,18 +101,45 @@ class BaseMultimodalProcessor(ABC):
             max_workers=int(os.environ.get("SGLANG_CPU_WORKERS", os.cpu_count())),
         )
 
+        # Mapping from attribute names to modality types
+        self.ATTR_NAME_TO_MODALITY = {
+            # Image-related attributes
+            "pixel_values": Modality.IMAGE,
+            "image_sizes": Modality.IMAGE,
+            "image_grid_thw": Modality.IMAGE,
+            "image_emb_mask": Modality.IMAGE,
+            "image_spatial_crop": Modality.IMAGE,
+            "tgt_size": Modality.IMAGE,
+            "image_grid_hws": Modality.IMAGE,
+            "aspect_ratio_id": Modality.IMAGE,
+            "aspect_ratio_mask": Modality.IMAGE,
+            "second_per_grid_ts": Modality.IMAGE,
+            # Audio-related attributes
+            "audio_features": Modality.AUDIO,
+            "audio_feature_lens": Modality.AUDIO,
+            "input_features": Modality.AUDIO,
+            "input_features_mask": Modality.AUDIO,
+            # Video-related attributes
+            "video_grid_thws": Modality.VIDEO,
+            # Generic attributes that could apply to multiple modalities
+            # "precomputed_features" - handled specially as it can be any modality
+        }
+
     def process_mm_data(
         self, input_text, images=None, videos=None, audios=None, **kwargs
     ):
         """
         process multimodal data with transformers AutoProcessor
         """
-        if images is not None:
+        if images:
             kwargs["images"] = images
-        if videos is not None:
+        if videos:
             kwargs["videos"] = videos
-        if audios is not None:
+        if audios:
             kwargs["audios"] = audios
+            if self.__class__.__name__ == "Gemma3nSGLangProcessor":
+                # Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
+                kwargs["audio"] = audios
 
         processor = self._processor
         if hasattr(processor, "image_processor") and isinstance(
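
The new ATTR_NAME_TO_MODALITY table replaces the removed MultimodalInputFormat enum: instead of classifying a whole request into one format, each key of the HF processor output is mapped to a modality on its own, which is what later allows mixed image+audio requests. (The `is not None` checks also become truthiness checks, so empty lists no longer reach the HF processor.) A minimal standalone sketch of that lookup; the Modality stub and the bucket_by_modality helper are illustrative stand-ins, not sglang APIs:

    from enum import Enum

    class Modality(Enum):  # illustrative stand-in for sglang's Modality enum
        IMAGE = "image"
        AUDIO = "audio"
        VIDEO = "video"

    # a few representative keys from the mapping above
    ATTR_NAME_TO_MODALITY = {
        "pixel_values": Modality.IMAGE,
        "image_grid_thw": Modality.IMAGE,
        "input_features": Modality.AUDIO,
        "video_grid_thws": Modality.VIDEO,
    }

    def bucket_by_modality(processor_output: dict) -> dict:
        """Group HF-processor output tensors by the modality their key implies."""
        buckets: dict = {}
        for key, value in processor_output.items():
            if key == "input_ids":  # token ids are not a modality payload
                continue
            modality = ATTR_NAME_TO_MODALITY.get(key)
            if modality is not None:
                buckets.setdefault(modality, {})[key] = value
        return buckets

    # {"pixel_values": ..., "input_features": ..., "input_ids": ...}
    # -> {Modality.IMAGE: {...}, Modality.AUDIO: {...}}: one bucket per modality
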
@@ -143,6 +162,7 @@ class BaseMultimodalProcessor(ABC):
     async def process_mm_data_async(
         self,
         image_data,
+        audio_data,
         input_text,
         request_obj,
         max_req_input_len,
@@ -417,175 +437,137 @@ class BaseMultimodalProcessor(ABC):
                 values[k] = v
         return values
 
+    def collect_mm_items_from_processor_output(
+        self, data_dict: dict
+    ) -> List[MultimodalDataItem]:
+        """Create mm_items directly from processor output."""
+        items = {}  # modality -> MultimodalDataItem
+
+        for attr_name, value in data_dict.items():
+            if attr_name == "input_ids":
+                continue
+
+            # Get modality for this attribute
+            modality = self.ATTR_NAME_TO_MODALITY.get(attr_name)
+
+            if not modality and attr_name == "precomputed_features":
+                modality_str = data_dict.get("modality")
+                try:
+                    modality = (
+                        Modality.from_str(modality_str)
+                        if modality_str
+                        else Modality.IMAGE
+                    )
+                except ValueError:
+                    modality = Modality.IMAGE
+
+            if modality:
+                # Create item if needed
+                if modality not in items:
+                    items[modality] = MultimodalDataItem(modality=modality)
+
+                # Set attribute
+                if hasattr(items[modality], attr_name):
+                    setattr(items[modality], attr_name, value)
+
+        return list(items.values())
+
+    def _process_and_collect_mm_items(
+        self, input_text: str, images=None, audios=None, videos=None, **kwargs
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor]:
+        """
+        Helper method to process multimodal data and create mm_items in one step.
+
+        Returns:
+            Tuple of (created mm_items, input_ids)
+        """
+        ret = self.process_mm_data(
+            input_text=input_text, images=images, audios=audios, videos=videos, **kwargs
+        )
+
+        input_ids = ret["input_ids"].flatten()
+        collected_items = self.collect_mm_items_from_processor_output(ret)
+
+        return collected_items, input_ids
+
     def process_and_combine_mm_data(
         self, base_output: BaseMultiModalProcessorOutput
-    ) -> Tuple[Optional[MultimodalDataItem], torch.Tensor]:
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor]:
         """
-        Process multimodal data and return the combined multimodal item and input_ids.
-        Handles all three input formats at the same abstraction level.
+        Process multimodal data and return the combined multimodal items and input_ids.
+        Supports mixed modalities (images and audio in the same request).
 
         Returns:
-            Tuple of (combined_mm_item, input_ids)
+            Tuple of (list of mm_items, input_ids)
         """
+        # Collect all items and categorize them
+        all_items = (base_output.images or []) + (base_output.audios or [])
 
-        def tokenize_text(input_text: str) -> torch.Tensor:
-            """Tokenize input text."""
-            return self._processor.tokenizer(
-                input_text,
+        # Handle text-only case
+        if not all_items:
+            input_ids = self._processor.tokenizer(
+                base_output.input_text,
                 return_tensors="pt",
                 add_special_tokens=True,
             ).input_ids.flatten()
+            return [], input_ids
+
+        dict_items, raw_images, raw_audios = [], [], []
+        for item in all_items:
+            if isinstance(item, dict):
+                dict_items.append(item)
+            elif isinstance(item, Image.Image):
+                raw_images.append(item)
+            elif isinstance(item, np.ndarray):
+                raw_audios.append(item)
+            else:
+                raise ValueError(f"Unknown multimodal item type: {type(item)}")
 
-        def categorize_mm_inputs(mm_inputs: List) -> MultimodalInputFormat:
-            """Categorize multimodal inputs and validate consistency."""
-            try:
-                has_image = False
-                has_pixel_values = False
-                has_precomputed_features = False
-                has_audio = False
-
-                for mm_input in mm_inputs:
-                    if isinstance(mm_input, Image.Image):
-                        has_image = True
-                    elif isinstance(mm_input, np.ndarray):
-                        has_audio = True
-                    elif isinstance(mm_input, dict):
-                        if mm_input.get("precomputed_features", None) is not None:
-                            has_precomputed_features = True
-                        elif mm_input.get("pixel_values", None) is not None:
-                            has_pixel_values = True
-                        else:
-                            raise ValueError(
-                                f"Invalid multimodal input: {mm_input}, expected dict with pixel_values or precomputed_features"
-                            )
-                    else:
-                        raise ValueError(
-                            f"Invalid multimodal input: {mm_input}, expected Image.Image or dict"
-                        )
+        # Process items and get input_ids
+        all_collected_items = []
+        input_ids = None
 
-                # Validate format consistency
-                format_count = sum(
-                    [has_image, has_pixel_values, has_precomputed_features, has_audio]
-                )
-                if format_count > 1:
-                    raise ValueError(
-                        "Unsupported: mixture of multimodal input formats. "
-                        f"Found formats: image={has_image}, pixel_values={has_pixel_values}, "
-                        f"precomputed_features={has_precomputed_features}, audio={has_audio}"
-                    )
-
-                if has_image:
-                    return MultimodalInputFormat.RAW_IMAGES
-                elif has_precomputed_features:
-                    return MultimodalInputFormat.PRECOMPUTED_FEATURES
-                elif has_pixel_values:
-                    return MultimodalInputFormat.PIXEL_VALUES
-                elif has_audio:
-                    return MultimodalInputFormat.AUDIO
-                else:
-                    raise ValueError("No valid multimodal input format found")
-            except Exception as e:
-                raise ValueError(f"Failed to categorize inputs: {e}")
-
-        def process_raw_images(
-            base_output: BaseMultiModalProcessorOutput,
-        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
-            """Process raw Image.Image objects using transformers processor."""
-            ret = self.process_mm_data(
-                input_text=base_output.input_text,
-                images=base_output.images,
-            )
-            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
-
-            # Copy all fields from processor output except input_ids
-            for key, value in ret.items():
-                if key != "input_ids" and hasattr(combined_mm_item, key):
-                    setattr(combined_mm_item, key, value)
-
-            input_ids = ret["input_ids"].flatten()
-            return combined_mm_item, input_ids
-
-        def process_precomputed_features(
-            base_output: BaseMultiModalProcessorOutput,
-        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
-            """Process inputs with precomputed features."""
-            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
-            combined_mm_item.precomputed_features = self._extract_processor_features(
-                base_output.images, "precomputed_features"
+        # Handle dict items (already processed)
+        for dict_item in dict_items:
+            all_collected_items.extend(
+                self.collect_mm_items_from_processor_output(dict_item)
             )
-            input_ids = tokenize_text(base_output.input_text)
-            return combined_mm_item, input_ids
-
-        def process_pixel_values(
-            base_output: BaseMultiModalProcessorOutput,
-        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
-            """Process inputs with pixel values."""
-            values = self._extract_processor_features_from_all_attributes(
-                base_output.images
-            )
-            combined_mm_item = MultimodalDataItem.from_dict(values)
-            input_ids = tokenize_text(base_output.input_text)
-            return combined_mm_item, input_ids
-
-        def process_audio(
-            base_output: BaseMultiModalProcessorOutput,
-        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
-            """Process inputs with audio."""
-            ret = self.process_mm_data(
+
+        # Handle raw items (need processing)
+        if raw_images or raw_audios:
+            collected_items, input_ids = self._process_and_collect_mm_items(
                 input_text=base_output.input_text,
-                audio=base_output.audios,  # Note: "audio" is for gemma3n only
+                images=raw_images,
+                audios=raw_audios,
             )
-            combined_mm_item = MultimodalDataItem(modality=Modality.AUDIO)
-            for key, value in ret.items():
-                if key != "input_ids" and hasattr(combined_mm_item, key):
-                    setattr(combined_mm_item, key, value)
-            input_ids = ret["input_ids"].flatten()
-            return combined_mm_item, input_ids
-
-        def finalize_mm_item(
-            combined_mm_item: MultimodalDataItem, input_ids: torch.Tensor
-        ) -> MultimodalDataItem:
-            """Apply common post-processing to the multimodal item."""
-            if combined_mm_item.modality in [Modality.IMAGE, Modality.MULTI_IMAGES]:
-                combined_mm_item.image_offsets = self.get_mm_items_offset(
+            all_collected_items.extend(collected_items)
+
+        # Fallback tokenization if no raw items were processed
+        if input_ids is None:
+            input_ids = self._processor.tokenizer(
+                base_output.input_text,
+                return_tensors="pt",
+                add_special_tokens=True,
+            ).input_ids.flatten()
+
+        # Add offsets to all items
+        for mm_item in all_collected_items:
+            if mm_item.modality in [Modality.IMAGE, Modality.MULTI_IMAGES]:
+                mm_item.image_offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.IM_TOKEN_ID,
                 )
-            elif combined_mm_item.modality == Modality.AUDIO:
-                combined_mm_item.audio_offsets = self.get_mm_items_offset(
+            elif mm_item.modality == Modality.AUDIO:
+                mm_item.audio_offsets = self.get_mm_items_offset(
                    input_ids=input_ids,
                     mm_token_id=self.AUDIO_TOKEN_ID,
                 )
-            elif combined_mm_item.modality == Modality.VIDEO:
-                combined_mm_item.video_offsets = self.get_mm_items_offset(
+            elif mm_item.modality == Modality.VIDEO:
+                mm_item.video_offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.VIDEO_TOKEN_ID,
                 )
             else:
-                raise ValueError(f"Unknown modality: {combined_mm_item.modality}")
-            return combined_mm_item
-
-        # Main logic - determine input type and handle text-only case
-        mm_inputs = base_output.images or base_output.audios
-        if not mm_inputs:
-            input_ids = tokenize_text(base_output.input_text)
-            return None, input_ids
-
-        # Categorize input formats
-        input_format = categorize_mm_inputs(mm_inputs)
-
-        # Process based on format
-        if input_format == MultimodalInputFormat.RAW_IMAGES:
-            combined_mm_item, input_ids = process_raw_images(base_output)
-        elif input_format == MultimodalInputFormat.PRECOMPUTED_FEATURES:
-            combined_mm_item, input_ids = process_precomputed_features(base_output)
-        elif input_format == MultimodalInputFormat.PIXEL_VALUES:
-            combined_mm_item, input_ids = process_pixel_values(base_output)
-        elif input_format == MultimodalInputFormat.AUDIO:
-            combined_mm_item, input_ids = process_audio(base_output)
-        else:
-            raise ValueError(f"Unknown input format: {input_format}")
+                raise ValueError(f"Unknown modality: {mm_item.modality}")
 
-        # Finalize with common processing
-        combined_mm_item = finalize_mm_item(combined_mm_item, input_ids)
-        return combined_mm_item, input_ids
+        return all_collected_items, input_ids
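
The refactor above changes the contract of process_and_combine_mm_data from (Optional[MultimodalDataItem], input_ids) to (List[MultimodalDataItem], input_ids), so one request can now carry several modalities at once. A hedged sketch of the new caller side, assuming a constructed processor and a base_output produced by load_mm_data:

    from typing import List

    def run_processor(processor, base_output) -> List:
        """Sketch of the 0.4.9 caller contract for process_and_combine_mm_data."""
        # 0.4.8.post1 returned (Optional[MultimodalDataItem], input_ids) and used
        # None for text-only requests; 0.4.9.post1 returns a possibly empty list
        # with one MultimodalDataItem per modality, so image+audio can coexist.
        mm_items, input_ids = processor.process_and_combine_mm_data(base_output)
        for item in mm_items:
            # per-modality offsets are pre-filled from placeholder token positions
            print(item.modality, getattr(item, "image_offsets", None))
        return mm_items
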

sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py
@@ -1,10 +1,8 @@
 from typing import List, Union
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.clip import CLIPModel
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.utils import load_image
 
 
@@ -17,20 +15,11 @@ class ClipImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)
 
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        if len(image_data) > 0:
-            images = [load_image(image)[0] for image in image_data]
-        else:
-            images = load_image(image_data[0])[0]
+        images = [load_image(image)[0] for image in image_data]
 
         image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["data_hashes"] = [hash(str(image_data))]
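
The per-processor input guards (`if not image_data: return None`, wrapping a bare str in a list) are deleted here and in every processor below; each processor now assumes image_data arrives as a non-empty list. The diff doesn't show where that normalization was centralized, so the helper below is a purely hypothetical illustration of the invariant the processors now rely on:

    from typing import List, Optional, Union

    def normalize_mm_field(data: Union[str, bytes, list, None]) -> Optional[List]:
        """Hypothetical helper: the invariant is that empty inputs never reach a
        processor and single items arrive already wrapped in a list."""
        if not data:
            return None  # caller skips multimodal processing entirely
        return data if isinstance(data, list) else [data]
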

sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py
@@ -20,12 +20,12 @@ from typing import List, Union
 
 import torch
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
 
 
 class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
@@ -44,17 +44,10 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs
     ):
-        if not image_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
             input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
+            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMAGE_TOKEN),
             max_req_input_len=max_req_input_len,
         )
         res = self.process_mm_data(

sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py
@@ -4,11 +4,9 @@ from typing import Dict, List, Union
 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 # Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py
 # will be removed in the future
@@ -38,11 +36,6 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -53,11 +46,11 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
             discard_alpha_channel=True,
         )
 
-        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),
-            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
+            "mm_items": mm_items,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }

sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py
@@ -18,10 +18,8 @@ from typing import Dict, List, Optional, Union
 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
 from sglang.srt.models.gemma3n_mm import Gemma3nForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 
 class Gemma3nSGLangProcessor(SGLangBaseProcessor):
@@ -61,17 +59,6 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
         **kwargs,
     ):
         """Process multimodal data including images and audio."""
-
-        audio_data = request_obj.audio_data
-        if not image_data and not audio_data:
-            return None
-
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
-        if isinstance(audio_data, str):
-            audio_data = [audio_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -85,13 +72,11 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
             ),
         )
 
-        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),
-            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
-            "im_start_id": self.IM_START_TOKEN_ID,
-            "im_end_id": self.IM_END_TOKEN_ID,
-            "audio_start_id": self.AUDIO_START_TOKEN_ID,
-            "audio_end_id": self.AUDIO_END_TOKEN_ID,
+            "mm_items": mm_items,
+            "im_token_id": self.IM_TOKEN_ID,
+            "audio_token_id": self.AUDIO_TOKEN_ID,
         }
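
Gemma3n stops returning start/end marker ids and instead returns the placeholder ids themselves (im_token_id, audio_token_id), matching the offset computation in the refactored base class. A sketch of one way offsets can be derived from a single placeholder id; the name placeholder_offsets is hypothetical and the actual return format of get_mm_items_offset isn't shown in this diff:

    import torch

    def placeholder_offsets(input_ids: torch.Tensor, mm_token_id: int):
        """Return contiguous [start, end] index ranges where mm_token_id occurs."""
        positions = (input_ids == mm_token_id).nonzero(as_tuple=True)[0].tolist()
        ranges, start = [], None
        for prev, cur in zip([None] + positions, positions):
            if start is None:
                start = cur
            elif cur != prev + 1:
                # gap between placeholder runs closes the previous range
                ranges.append((start, prev))
                start = cur
        if start is not None:
            ranges.append((start, positions[-1]))
        return ranges

    # placeholder_offsets(torch.tensor([1, 7, 7, 7, 2, 7]), 7) -> [(1, 3), (5, 5)]
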

sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py
@@ -5,12 +5,12 @@ import torch
 from decord import VideoReader, cpu
 from PIL import Image
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.internvl import InternVLChatModel
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.internvl import InternVLChatModel
 
 
 class InternVLImageProcessor(BaseMultimodalProcessor):
@@ -172,13 +172,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self, image_data, input_text, request_obj, max_req_input_len, **kwargs
     ):
-        if not image_data:
-            return None
-
-        # Ensure image_data is a list
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,

sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py
@@ -1,11 +1,11 @@
 from typing import List, Union
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
 
 
 class JanusProImageProcessor(BaseMultimodalProcessor):
@@ -22,12 +22,6 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
         max_req_input_len,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
         processor = self._processor
 
         base_out = self.load_mm_data(

sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py
@@ -3,14 +3,12 @@ from typing import Any, Dict, List, Optional, Union
 
 import torch
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor as SGLangBaseProcessor,
-)
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 
 # Compatible with KimiVLForConditionalGeneration
@@ -32,11 +30,6 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -46,10 +39,10 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
             max_req_input_len=max_req_input_len,
         )
 
-        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),
-            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
+            "mm_items": mm_items,
             "im_token_id": self.IM_TOKEN_ID,
         }

sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py
@@ -7,11 +7,7 @@ from transformers.models.auto.processing_auto import (
 )
 
 import sglang.srt.managers.multimodal_processor as sgl_mm_processor_utils
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.mm_utils import expand2square, process_anyres_image
 from sglang.srt.models.llava import (
     LlavaForConditionalGeneration,
     LlavaLlamaForCausalLM,
@@ -20,6 +16,8 @@ from sglang.srt.models.llava import (
 )
 from sglang.srt.models.llavavid import LlavaVidForCausalLM
 from sglang.srt.models.mistral import Mistral3ForConditionalGeneration
+from sglang.srt.multimodal.mm_utils import expand2square, process_anyres_image
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.utils import load_image, logger
 from sglang.utils import get_exception_traceback
 
@@ -112,9 +110,6 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
         modalities = request_obj.modalities or ["image"]
         aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
         grid_pinpoints = (
@@ -124,9 +119,6 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
             else None
         )
 
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         if isinstance(image_data, list) and len(image_data) > 0:
             if "multi-images" in modalities or "video" in modalities:
                 # Multiple images