sglang 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch_server.py +17 -2
  2. sglang/bench_serving.py +168 -22
  3. sglang/srt/configs/internvl.py +4 -2
  4. sglang/srt/configs/janus_pro.py +1 -1
  5. sglang/srt/configs/model_config.py +49 -0
  6. sglang/srt/configs/update_config.py +119 -0
  7. sglang/srt/conversation.py +35 -0
  8. sglang/srt/custom_op.py +7 -1
  9. sglang/srt/disaggregation/base/conn.py +2 -0
  10. sglang/srt/disaggregation/decode.py +22 -6
  11. sglang/srt/disaggregation/mooncake/conn.py +289 -48
  12. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
  13. sglang/srt/disaggregation/nixl/conn.py +100 -52
  14. sglang/srt/disaggregation/prefill.py +5 -4
  15. sglang/srt/disaggregation/utils.py +13 -12
  16. sglang/srt/distributed/parallel_state.py +44 -17
  17. sglang/srt/entrypoints/EngineBase.py +8 -0
  18. sglang/srt/entrypoints/engine.py +45 -9
  19. sglang/srt/entrypoints/http_server.py +111 -24
  20. sglang/srt/entrypoints/openai/protocol.py +51 -6
  21. sglang/srt/entrypoints/openai/serving_chat.py +52 -76
  22. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  23. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  24. sglang/srt/eplb/__init__.py +0 -0
  25. sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
  26. sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
  27. sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
  28. sglang/srt/{managers → eplb}/expert_distribution.py +18 -1
  29. sglang/srt/{managers → eplb}/expert_location.py +1 -1
  30. sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
  31. sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
  32. sglang/srt/hf_transformers_utils.py +2 -1
  33. sglang/srt/layers/activation.py +7 -0
  34. sglang/srt/layers/amx_utils.py +86 -0
  35. sglang/srt/layers/attention/ascend_backend.py +219 -0
  36. sglang/srt/layers/attention/flashattention_backend.py +56 -23
  37. sglang/srt/layers/attention/tbo_backend.py +37 -9
  38. sglang/srt/layers/communicator.py +18 -2
  39. sglang/srt/layers/dp_attention.py +9 -3
  40. sglang/srt/layers/elementwise.py +76 -12
  41. sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  42. sglang/srt/layers/layernorm.py +41 -0
  43. sglang/srt/layers/linear.py +99 -12
  44. sglang/srt/layers/logits_processor.py +15 -6
  45. sglang/srt/layers/moe/ep_moe/kernels.py +23 -8
  46. sglang/srt/layers/moe/ep_moe/layer.py +115 -25
  47. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +42 -19
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -0
  49. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -4
  50. sglang/srt/layers/moe/fused_moe_triton/layer.py +129 -10
  51. sglang/srt/layers/moe/router.py +60 -22
  52. sglang/srt/layers/moe/topk.py +36 -28
  53. sglang/srt/layers/parameter.py +67 -7
  54. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  55. sglang/srt/layers/quantization/fp8.py +44 -0
  56. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  57. sglang/srt/layers/quantization/fp8_utils.py +6 -6
  58. sglang/srt/layers/quantization/gptq.py +5 -1
  59. sglang/srt/layers/quantization/moe_wna16.py +1 -1
  60. sglang/srt/layers/quantization/quant_utils.py +166 -0
  61. sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  62. sglang/srt/layers/rotary_embedding.py +105 -13
  63. sglang/srt/layers/vocab_parallel_embedding.py +19 -2
  64. sglang/srt/lora/lora.py +4 -5
  65. sglang/srt/lora/lora_manager.py +73 -20
  66. sglang/srt/managers/configure_logging.py +1 -1
  67. sglang/srt/managers/io_struct.py +60 -15
  68. sglang/srt/managers/mm_utils.py +73 -59
  69. sglang/srt/managers/multimodal_processor.py +2 -6
  70. sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  71. sglang/srt/managers/schedule_batch.py +80 -79
  72. sglang/srt/managers/scheduler.py +153 -63
  73. sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  74. sglang/srt/managers/session_controller.py +12 -3
  75. sglang/srt/managers/tokenizer_manager.py +314 -103
  76. sglang/srt/managers/tp_worker.py +13 -1
  77. sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  78. sglang/srt/mem_cache/allocator.py +290 -0
  79. sglang/srt/mem_cache/chunk_cache.py +34 -2
  80. sglang/srt/mem_cache/memory_pool.py +289 -3
  81. sglang/srt/mem_cache/multimodal_cache.py +3 -0
  82. sglang/srt/model_executor/cuda_graph_runner.py +3 -2
  83. sglang/srt/model_executor/forward_batch_info.py +17 -4
  84. sglang/srt/model_executor/model_runner.py +302 -58
  85. sglang/srt/model_loader/loader.py +86 -10
  86. sglang/srt/model_loader/weight_utils.py +160 -3
  87. sglang/srt/models/deepseek_nextn.py +5 -4
  88. sglang/srt/models/deepseek_v2.py +305 -26
  89. sglang/srt/models/deepseek_vl2.py +3 -5
  90. sglang/srt/models/gemma3_causal.py +1 -2
  91. sglang/srt/models/gemma3n_audio.py +949 -0
  92. sglang/srt/models/gemma3n_causal.py +1010 -0
  93. sglang/srt/models/gemma3n_mm.py +495 -0
  94. sglang/srt/models/hunyuan.py +771 -0
  95. sglang/srt/models/kimi_vl.py +1 -2
  96. sglang/srt/models/llama.py +10 -4
  97. sglang/srt/models/llama4.py +32 -45
  98. sglang/srt/models/llama_eagle3.py +61 -11
  99. sglang/srt/models/llava.py +5 -5
  100. sglang/srt/models/minicpmo.py +2 -2
  101. sglang/srt/models/mistral.py +1 -1
  102. sglang/srt/models/mllama4.py +43 -11
  103. sglang/srt/models/phi4mm.py +1 -3
  104. sglang/srt/models/pixtral.py +3 -7
  105. sglang/srt/models/qwen2.py +31 -3
  106. sglang/srt/models/qwen2_5_vl.py +1 -3
  107. sglang/srt/models/qwen2_audio.py +200 -0
  108. sglang/srt/models/qwen2_moe.py +32 -6
  109. sglang/srt/models/qwen2_vl.py +1 -4
  110. sglang/srt/models/qwen3.py +94 -25
  111. sglang/srt/models/qwen3_moe.py +68 -21
  112. sglang/srt/models/vila.py +3 -8
  113. sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +150 -133
  114. sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
  115. sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
  116. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
  117. sglang/srt/multimodal/processors/gemma3n.py +82 -0
  118. sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
  119. sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
  120. sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
  121. sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
  122. sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
  123. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
  124. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +3 -6
  125. sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
  126. sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
  127. sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
  128. sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
  129. sglang/srt/operations_strategy.py +6 -2
  130. sglang/srt/reasoning_parser.py +26 -0
  131. sglang/srt/sampling/sampling_batch_info.py +39 -1
  132. sglang/srt/server_args.py +85 -24
  133. sglang/srt/speculative/build_eagle_tree.py +57 -18
  134. sglang/srt/speculative/eagle_worker.py +6 -4
  135. sglang/srt/two_batch_overlap.py +204 -28
  136. sglang/srt/utils.py +369 -138
  137. sglang/srt/warmup.py +12 -3
  138. sglang/test/runners.py +10 -1
  139. sglang/test/test_utils.py +15 -3
  140. sglang/version.py +1 -1
  141. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/METADATA +9 -6
  142. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/RECORD +149 -137
  143. sglang/math_utils.py +0 -8
  144. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
  145. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  146. /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
  147. /sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +0 -0
  148. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/WHEEL +0 -0
  149. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/gemma3.py
@@ -4,11 +4,9 @@ from typing import Dict, List, Union
 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 # Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py
 # will be removed in the future
@@ -38,11 +36,6 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -53,11 +46,11 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
             discard_alpha_channel=True,
         )
 
-        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),
-            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
+            "mm_items": mm_items,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }
sglang/srt/multimodal/processors/gemma3n.py (new file)
@@ -0,0 +1,82 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import re
+from typing import Dict, List, Optional, Union
+
+from sglang.srt.managers.multimodal_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.models.gemma3n_mm import Gemma3nForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+
+
+class Gemma3nSGLangProcessor(SGLangBaseProcessor):
+    """Multimodal processor for Gemma3n supporting image and audio inputs."""
+
+    models = [Gemma3nForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+
+        self.IMAGE_TOKEN = "<image_soft_token>"
+        self.IMAGE_TOKEN_REGEX = re.compile(
+            r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+        )
+
+        self.AUDIO_TOKEN = "<audio_soft_token>"
+        self.AUDIO_TOKEN_REGEX = re.compile(
+            r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
+        )
+
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.IM_START_TOKEN_ID = hf_config.boi_token_id
+        self.IM_END_TOKEN_ID = hf_config.eoi_token_id
+
+        self.AUDIO_TOKEN_ID = hf_config.audio_token_id
+        self.AUDIO_START_TOKEN_ID = hf_config.boa_token_id
+        self.AUDIO_END_TOKEN_ID = hf_config.eoa_token_id
+
+    async def process_mm_data_async(
+        self,
+        image_data: Optional[List[Union[str, bytes, Dict]]] = None,
+        audio_data: Optional[List[Union[str, bytes, Dict]]] = None,
+        input_text: str = "",
+        request_obj=None,
+        max_req_input_len: int = 0,
+        *args,
+        **kwargs,
+    ):
+        """Process multimodal data including images and audio."""
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            audio_data=audio_data,
+            max_req_input_len=max_req_input_len,
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN,
+                image_token_regex=self.IMAGE_TOKEN_REGEX,
+                audio_token=self.AUDIO_TOKEN,
+                audio_token_regex=self.AUDIO_TOKEN_REGEX,
+            ),
+        )
+
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.IM_TOKEN_ID,
+            "audio_token_id": self.AUDIO_TOKEN_ID,
+        }
sglang/srt/multimodal/processors/internvl.py
@@ -5,12 +5,12 @@ import torch
 from decord import VideoReader, cpu
 from PIL import Image
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.internvl import InternVLChatModel
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.internvl import InternVLChatModel
 
 
 class InternVLImageProcessor(BaseMultimodalProcessor):
@@ -172,13 +172,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self, image_data, input_text, request_obj, max_req_input_len, **kwargs
     ):
-        if not image_data:
-            return None
-
-        # Ensure image_data is a list
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
sglang/srt/multimodal/processors/janus_pro.py
@@ -1,11 +1,11 @@
 from typing import List, Union
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
 
 
 class JanusProImageProcessor(BaseMultimodalProcessor):
@@ -22,12 +22,6 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
         max_req_input_len,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
         processor = self._processor
 
         base_out = self.load_mm_data(
sglang/srt/multimodal/processors/kimi_vl.py
@@ -3,14 +3,12 @@ from typing import Any, Dict, List, Optional, Union
 
 import torch
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor as SGLangBaseProcessor,
-)
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 
 # Compatible with KimiVLForConditionalGeneration
@@ -32,11 +30,6 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -46,10 +39,10 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
             max_req_input_len=max_req_input_len,
         )
 
-        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),
-            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
+            "mm_items": mm_items,
             "im_token_id": self.IM_TOKEN_ID,
         }
sglang/srt/multimodal/processors/llava.py
@@ -7,11 +7,7 @@ from transformers.models.auto.processing_auto import (
 )
 
 import sglang.srt.managers.multimodal_processor as sgl_mm_processor_utils
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.mm_utils import expand2square, process_anyres_image
 from sglang.srt.models.llava import (
     LlavaForConditionalGeneration,
     LlavaLlamaForCausalLM,
@@ -20,6 +16,8 @@ from sglang.srt.models.llava import (
 )
 from sglang.srt.models.llavavid import LlavaVidForCausalLM
 from sglang.srt.models.mistral import Mistral3ForConditionalGeneration
+from sglang.srt.multimodal.mm_utils import expand2square, process_anyres_image
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.utils import load_image, logger
 from sglang.utils import get_exception_traceback
 
@@ -112,9 +110,6 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
         modalities = request_obj.modalities or ["image"]
         aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
         grid_pinpoints = (
@@ -124,9 +119,6 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
             else None
         )
 
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         if isinstance(image_data, list) and len(image_data) > 0:
             if "multi-images" in modalities or "video" in modalities:
                 # Multiple images
sglang/srt/multimodal/processors/minicpm.py
@@ -2,13 +2,13 @@ from typing import List, Union
 
 import torch
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.minicpmo import MiniCPMO
 from sglang.srt.models.minicpmv import MiniCPMV
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
 
 
 # Compatible with both 'O' and 'V'
@@ -23,19 +23,12 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
+        audio_data: List[Union[str, bytes]],
         input_text,
         request_obj,
         max_req_input_len,
         **kwargs,
     ):
-        audio_data = request_obj.audio_data
-        if not image_data and not audio_data:
-            return None
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-        if not isinstance(audio_data, list):
-            audio_data = [audio_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             max_req_input_len=max_req_input_len,
sglang/srt/multimodal/processors/mlama.py
@@ -1,10 +1,8 @@
 from typing import List, Union
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.utils import load_image
 
 
@@ -17,21 +15,11 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)
 
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        if len(image_data) > 0:
-            images = [load_image(image)[0] for image in image_data]
-        else:
-            images = load_image(image_data[0])[0]
-
+        images = [load_image(image)[0] for image in image_data]
         image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
         image_inputs["mm_items"] = [
sglang/srt/multimodal/processors/mllama4.py
@@ -7,12 +7,12 @@ from transformers.models.llama4.image_processing_llama4_fast import (
     get_best_fit,
 )
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
 
 
 class Mllama4ImageProcessor(BaseMultimodalProcessor):
@@ -37,9 +37,6 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)
sglang/srt/multimodal/processors/phi4mm.py
@@ -1,12 +1,12 @@
 import logging
 from typing import List, Union
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.phi4mm import Phi4MMForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.phi4mm import Phi4MMForCausalLM
 
 logger = logging.getLogger(__name__)
 
@@ -26,22 +26,12 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
+        audio_data,
         input_text,
         request_obj,
         max_req_input_len,
         **kwargs,
     ):
-        audio_data = request_obj.audio_data
-
-        if not image_data and not audio_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        if not isinstance(audio_data, list):
-            audio_data = [audio_data]
-
         if audio_data:
             logger.warning(
                 "Currently SGLang does not support audio data for Phi4MM. We are working on it. You can file an issue to help us prioritize."
sglang/srt/multimodal/processors/pixtral.py
@@ -6,12 +6,12 @@ from transformers.models.pixtral.image_processing_pixtral import (
     _num_image_tokens as _get_pixtral_hf_num_image_tokens,
 )
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.pixtral import PixtralVisionModel
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.pixtral import PixtralVisionModel
 
 
 class PixtralProcessor(BaseMultimodalProcessor):
@@ -78,12 +78,6 @@ class PixtralProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         mm_data = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.multimodal_tokens,
sglang/srt/multimodal/processors/qwen_vl.py
@@ -3,19 +3,15 @@ import math
 import re
 from typing import Dict, List, Union
 
-import torch
 from PIL import Image
 
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor as SGLangBaseProcessor,
-)
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 
 # Compatible with Qwen2VL and Qwen2_5VL
@@ -51,9 +47,6 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -132,12 +125,13 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         video_grid_thw = None  # TODO
 
-        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
-        if combined_mm_item is None:
+        if not mm_items:
             # Note(Xinyuan): This is the case where image loading fails.
             return None
 
+        combined_mm_item = mm_items[0]  # only image is supported for now
         video_grid_thw = None  # TODO
         second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)
 
@@ -159,7 +153,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         return {
             "input_ids": input_ids.tolist(),
-            "mm_items": [combined_mm_item],
+            "mm_items": mm_items,
            "im_start_id": self.IM_START_TOKEN_ID,
            "im_end_id": self.IM_END_TOKEN_ID,
            "im_token_id": self.IM_TOKEN_ID,
sglang/srt/multimodal/processors/vila.py
@@ -10,12 +10,12 @@ from sglang.srt.managers.io_struct import (
     GenerateReqInput,
     ImageDataItem,
 )
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.vila import VILAForConditionalGeneration
 from sglang.srt.server_args import ServerArgs
 
 
@@ -37,6 +37,8 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         _processor: VILAProcessor,
     ) -> None:
         super().__init__(hf_config, server_args, _processor)
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.VIDEO_TOKEN_ID = hf_config.video_token_id
 
     async def process_mm_data_async(
         self,
@@ -46,13 +48,7 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         max_req_input_len: int,
         **kwargs,
     ) -> Optional[Dict[str, Any]]:
-        if not image_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        mm_data = self.load_mm_data(
+        base_output = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=MultimodalSpecialTokens(
                 image_token=self._processor.tokenizer.image_token
@@ -61,25 +57,11 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
         )
 
-        inputs = self.process_mm_data(
-            input_text=mm_data.input_text,
-            images=mm_data.images,
-        )
-
-        image_offsets = self.get_mm_items_offset(
-            input_ids=inputs.input_ids[0],
-            mm_token_id=cast(int, self._processor.tokenizer.image_token_id),
-        )
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
-        mm_items: List[MultimodalDataItem] = [
-            MultimodalDataItem(
-                modality=Modality.IMAGE,
-                image_offsets=image_offsets,
-                pixel_values=inputs.pixel_values,
-            )
-        ]
-
-        return dict(
-            input_ids=inputs.input_ids[0].tolist(),
-            mm_items=mm_items,
-        )
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.IM_TOKEN_ID,
+            "video_token_id": self.VIDEO_TOKEN_ID,
+        }
sglang/srt/operations_strategy.py
@@ -71,7 +71,9 @@ def _compute_moe_deepseek_layer_operations_strategy_tbo(
     assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
     if forward_mode == ForwardMode.EXTEND:
         return _compute_moe_deepseek_blog_prefill(layer)
-    elif forward_mode == ForwardMode.DECODE:
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
         return _compute_moe_deepseek_blog_decode(layer)
     else:
         raise NotImplementedError(f"Unsupported {forward_mode=}")
@@ -146,7 +148,9 @@ def _compute_moe_qwen3_layer_operations_strategy_tbo(
     assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
     if forward_mode == ForwardMode.EXTEND:
         return _compute_moe_qwen3_prefill(layer)
-    elif forward_mode == ForwardMode.DECODE:
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
         return _compute_moe_qwen3_decode(layer)
     else:
         raise NotImplementedError(f"Unsupported {forward_mode=}")
sglang/srt/reasoning_parser.py
@@ -66,6 +66,13 @@ class BaseReasoningFormatDetector:
         self._buffer += new_text
         current_text = self._buffer
 
+        # If the current text is a prefix of the think token, keep buffering
+        if any(
+            token.startswith(current_text) and token != current_text
+            for token in [self.think_start_token, self.think_end_token]
+        ):
+            return StreamingParseResult()
+
         # Strip `<think>` token if present
         if not self.stripped_think_start and self.think_start_token in current_text:
             current_text = current_text.replace(self.think_start_token, "")
@@ -150,6 +157,24 @@ class Qwen3Detector(BaseReasoningFormatDetector):
         )
 
 
+class KimiDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Kimi Thinking model.
+    Assumes reasoning format:
+      ◁think▷*(.*)◁/think▷
+    Returns all the text before the ◁/think▷ tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        super().__init__(
+            "◁think▷",
+            "◁/think▷",
+            force_reasoning=False,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -164,6 +189,7 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
+        "kimi": KimiDetector,
     }
 
     def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
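The new prefix check in BaseReasoningFormatDetector exists so that a streamed chunk ending in the middle of a think tag is held back instead of being emitted as normal text. A standalone sketch of the same predicate, assuming Qwen3-style <think>/</think> tags; the helper name and sample chunks are illustrative, not part of sglang:

# Standalone sketch of the prefix-buffering rule added above.
def should_keep_buffering(buffer: str, start: str = "<think>", end: str = "</think>") -> bool:
    # Hold back output while the buffered text could still grow into a full
    # reasoning tag, e.g. when the stream has only delivered "<thi" so far.
    return any(token.startswith(buffer) and token != buffer for token in (start, end))


assert should_keep_buffering("<thi")          # partial tag: wait for more chunks
assert not should_keep_buffering("<think>")   # complete tag: parse it normally
assert not should_keep_buffering("Hello")     # ordinary text: emit immediately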