sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. sglang/bench_one_batch_server.py +17 -2
  2. sglang/bench_serving.py +170 -24
  3. sglang/srt/configs/internvl.py +4 -2
  4. sglang/srt/configs/janus_pro.py +1 -1
  5. sglang/srt/configs/model_config.py +60 -1
  6. sglang/srt/configs/update_config.py +119 -0
  7. sglang/srt/conversation.py +69 -1
  8. sglang/srt/disaggregation/decode.py +21 -5
  9. sglang/srt/disaggregation/mooncake/conn.py +35 -4
  10. sglang/srt/disaggregation/nixl/conn.py +6 -6
  11. sglang/srt/disaggregation/prefill.py +2 -2
  12. sglang/srt/disaggregation/utils.py +1 -1
  13. sglang/srt/distributed/parallel_state.py +44 -17
  14. sglang/srt/entrypoints/EngineBase.py +8 -0
  15. sglang/srt/entrypoints/engine.py +40 -6
  16. sglang/srt/entrypoints/http_server.py +111 -24
  17. sglang/srt/entrypoints/http_server_engine.py +1 -1
  18. sglang/srt/entrypoints/openai/protocol.py +4 -2
  19. sglang/srt/eplb/__init__.py +0 -0
  20. sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
  21. sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
  22. sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
  23. sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
  24. sglang/srt/{managers → eplb}/expert_location.py +1 -1
  25. sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
  26. sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
  27. sglang/srt/hf_transformers_utils.py +2 -1
  28. sglang/srt/layers/activation.py +2 -2
  29. sglang/srt/layers/amx_utils.py +86 -0
  30. sglang/srt/layers/attention/ascend_backend.py +219 -0
  31. sglang/srt/layers/attention/flashattention_backend.py +32 -9
  32. sglang/srt/layers/attention/tbo_backend.py +37 -9
  33. sglang/srt/layers/communicator.py +20 -2
  34. sglang/srt/layers/dp_attention.py +9 -3
  35. sglang/srt/layers/elementwise.py +76 -12
  36. sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  37. sglang/srt/layers/layernorm.py +26 -0
  38. sglang/srt/layers/linear.py +84 -14
  39. sglang/srt/layers/logits_processor.py +4 -4
  40. sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
  41. sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
  42. sglang/srt/layers/moe/ep_moe/layer.py +176 -15
  43. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
  44. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
  45. sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
  46. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
  47. sglang/srt/layers/moe/router.py +60 -22
  48. sglang/srt/layers/moe/topk.py +10 -28
  49. sglang/srt/layers/parameter.py +67 -7
  50. sglang/srt/layers/quantization/__init__.py +2 -0
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  52. sglang/srt/layers/quantization/fp8.py +72 -7
  53. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  54. sglang/srt/layers/quantization/fp8_utils.py +1 -2
  55. sglang/srt/layers/quantization/gptq.py +5 -1
  56. sglang/srt/layers/quantization/modelopt_quant.py +244 -1
  57. sglang/srt/layers/quantization/moe_wna16.py +1 -1
  58. sglang/srt/layers/quantization/quant_utils.py +166 -0
  59. sglang/srt/layers/quantization/w4afp8.py +264 -0
  60. sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  61. sglang/srt/layers/rotary_embedding.py +2 -2
  62. sglang/srt/layers/vocab_parallel_embedding.py +20 -10
  63. sglang/srt/lora/lora.py +4 -5
  64. sglang/srt/lora/lora_manager.py +73 -20
  65. sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
  66. sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
  67. sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
  68. sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
  69. sglang/srt/managers/cache_controller.py +41 -195
  70. sglang/srt/managers/configure_logging.py +1 -1
  71. sglang/srt/managers/io_struct.py +58 -14
  72. sglang/srt/managers/mm_utils.py +77 -61
  73. sglang/srt/managers/multimodal_processor.py +2 -6
  74. sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  75. sglang/srt/managers/schedule_batch.py +78 -85
  76. sglang/srt/managers/scheduler.py +130 -64
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  78. sglang/srt/managers/session_controller.py +12 -3
  79. sglang/srt/managers/tokenizer_manager.py +314 -103
  80. sglang/srt/managers/tp_worker.py +13 -1
  81. sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  82. sglang/srt/mem_cache/allocator.py +290 -0
  83. sglang/srt/mem_cache/chunk_cache.py +34 -2
  84. sglang/srt/mem_cache/hiradix_cache.py +2 -0
  85. sglang/srt/mem_cache/memory_pool.py +402 -66
  86. sglang/srt/mem_cache/memory_pool_host.py +6 -109
  87. sglang/srt/mem_cache/multimodal_cache.py +3 -0
  88. sglang/srt/mem_cache/radix_cache.py +8 -4
  89. sglang/srt/model_executor/cuda_graph_runner.py +2 -1
  90. sglang/srt/model_executor/forward_batch_info.py +17 -4
  91. sglang/srt/model_executor/model_runner.py +297 -56
  92. sglang/srt/model_loader/loader.py +41 -0
  93. sglang/srt/model_loader/weight_utils.py +72 -4
  94. sglang/srt/models/deepseek_nextn.py +1 -3
  95. sglang/srt/models/deepseek_v2.py +195 -45
  96. sglang/srt/models/deepseek_vl2.py +3 -5
  97. sglang/srt/models/gemma3_causal.py +1 -2
  98. sglang/srt/models/gemma3n_causal.py +4 -3
  99. sglang/srt/models/gemma3n_mm.py +4 -20
  100. sglang/srt/models/hunyuan.py +1 -1
  101. sglang/srt/models/kimi_vl.py +1 -2
  102. sglang/srt/models/llama.py +10 -4
  103. sglang/srt/models/llama4.py +32 -45
  104. sglang/srt/models/llama_eagle3.py +61 -11
  105. sglang/srt/models/llava.py +5 -5
  106. sglang/srt/models/minicpmo.py +2 -2
  107. sglang/srt/models/mistral.py +1 -1
  108. sglang/srt/models/mllama4.py +402 -89
  109. sglang/srt/models/phi4mm.py +1 -3
  110. sglang/srt/models/pixtral.py +3 -7
  111. sglang/srt/models/qwen2.py +31 -3
  112. sglang/srt/models/qwen2_5_vl.py +1 -3
  113. sglang/srt/models/qwen2_audio.py +200 -0
  114. sglang/srt/models/qwen2_moe.py +32 -6
  115. sglang/srt/models/qwen2_vl.py +1 -4
  116. sglang/srt/models/qwen3.py +94 -25
  117. sglang/srt/models/qwen3_moe.py +68 -21
  118. sglang/srt/models/vila.py +3 -8
  119. sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
  120. sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
  121. sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
  122. sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
  123. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
  124. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
  125. sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
  126. sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
  127. sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
  128. sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
  129. sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
  130. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
  131. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
  132. sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
  133. sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
  134. sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
  135. sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
  136. sglang/srt/operations_strategy.py +6 -2
  137. sglang/srt/reasoning_parser.py +26 -0
  138. sglang/srt/sampling/sampling_batch_info.py +39 -1
  139. sglang/srt/server_args.py +84 -22
  140. sglang/srt/speculative/build_eagle_tree.py +57 -18
  141. sglang/srt/speculative/eagle_worker.py +6 -4
  142. sglang/srt/two_batch_overlap.py +203 -27
  143. sglang/srt/utils.py +343 -163
  144. sglang/srt/warmup.py +12 -3
  145. sglang/test/runners.py +10 -1
  146. sglang/test/test_cutlass_w4a8_moe.py +281 -0
  147. sglang/test/test_utils.py +15 -3
  148. sglang/utils.py +5 -5
  149. sglang/version.py +1 -1
  150. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
  151. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
  152. sglang/math_utils.py +0 -8
  153. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
  154. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  155. /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
  156. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
  157. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
  158. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py

@@ -2,13 +2,13 @@ from typing import List, Union
 
 import torch
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.minicpmo import MiniCPMO
 from sglang.srt.models.minicpmv import MiniCPMV
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
 
 
 # Compatible with both 'O' and 'V'
@@ -23,19 +23,12 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
+        audio_data: List[Union[str, bytes]],
         input_text,
         request_obj,
         max_req_input_len,
         **kwargs,
     ):
-        audio_data = request_obj.audio_data
-        if not image_data and not audio_data:
-            return None
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-        if not isinstance(audio_data, list):
-            audio_data = [audio_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             max_req_input_len=max_req_input_len,
sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py

@@ -1,10 +1,8 @@
 from typing import List, Union
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.utils import load_image
 
 
@@ -17,21 +15,11 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)
 
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        if len(image_data) > 0:
-            images = [load_image(image)[0] for image in image_data]
-        else:
-            images = load_image(image_data[0])[0]
-
+        images = [load_image(image)[0] for image in image_data]
         image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
         image_inputs["mm_items"] = [
sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py

@@ -7,12 +7,12 @@ from transformers.models.llama4.image_processing_llama4_fast import (
     get_best_fit,
 )
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
 
 
 class Mllama4ImageProcessor(BaseMultimodalProcessor):
@@ -37,9 +37,6 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)
@@ -63,70 +60,72 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         )
 
         # Handle image resolutions and aspect ratios
-        if "pixel_values" in processor_output:
-            image_processor = processor.image_processor
-            tokenizer = self._processor.tokenizer
+        if "pixel_values" not in processor_output:  # no image processed
+            return None
+
+        image_processor = processor.image_processor
+        tokenizer = self._processor.tokenizer
 
-            # Calculate tile size and find supported resolutions
-            tile_size = self.vision_config.image_size
-            max_num_tiles = getattr(self.vision_config, "max_patches", 1)
+        # Calculate tile size and find supported resolutions
+        tile_size = self.vision_config.image_size
+        max_num_tiles = getattr(self.vision_config, "max_patches", 1)
 
-            possible_resolutions = find_supported_resolutions(
-                max_num_chunks=max_num_tiles,
-                patch_size=SizeDict(height=tile_size, width=tile_size),
+        possible_resolutions = find_supported_resolutions(
+            max_num_chunks=max_num_tiles,
+            patch_size=SizeDict(height=tile_size, width=tile_size),
+        )
+
+        # Find best fit for each image
+        best_fit_sizes = [
+            get_best_fit(
+                (image.size[1], image.size[0]),  # (height, width)
+                torch.tensor(possible_resolutions),
+                resize_to_max_canvas=image_processor.resize_to_max_canvas,
             )
+            for image in processed_data.images
+        ]
+
+        # Calculate aspect ratios and patches per image
+        aspect_ratios = [
+            (image_size[0] // tile_size, image_size[1] // tile_size)
+            for image_size in best_fit_sizes
+        ]
+
+        patches_per_image = [
+            1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
+        ]
+
+        # Add to image_inputs
+        processor_output["aspect_ratios"] = aspect_ratios
+        processor_output["patches_per_image"] = torch.tensor(patches_per_image)
+
+        # Process embed_is_patch
+        vocab = tokenizer.get_vocab()
+        patch_id = vocab.get(processor.img_patch_token, -1)
+        image_end_id = vocab.get(processor.end_of_img_token, -1)
+
+        if patch_id != -1 and image_end_id != -1:
+            input_ids = processor_output["input_ids"].view(-1)
+
+            # Remove BOS token if present
+            if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
+                input_ids = input_ids[1:]
+
+            # Find image end indices and split input_ids
+            image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
+
+            if image_end_indices.size(0) > 0:
+                # Split at image boundaries
+                split_indices = (image_end_indices + 1)[:-1]
+                split_input_ids = torch.tensor_split(input_ids, split_indices)
+                split_input_ids = [x for x in split_input_ids if x.numel() > 0]
+
+                # Create embed_is_patch for each image
+                embed_is_patch = []
+                for per_image_input_ids in split_input_ids:
+                    embed_is_patch.append(per_image_input_ids == patch_id)
 
-            # Find best fit for each image
-            best_fit_sizes = [
-                get_best_fit(
-                    (image.size[1], image.size[0]),  # (height, width)
-                    torch.tensor(possible_resolutions),
-                    resize_to_max_canvas=image_processor.resize_to_max_canvas,
-                )
-                for image in processed_data.images
-            ]
-
-            # Calculate aspect ratios and patches per image
-            aspect_ratios = [
-                (image_size[0] // tile_size, image_size[1] // tile_size)
-                for image_size in best_fit_sizes
-            ]
-
-            patches_per_image = [
-                1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
-            ]
-
-            # Add to image_inputs
-            processor_output["aspect_ratios"] = aspect_ratios
-            processor_output["patches_per_image"] = torch.tensor(patches_per_image)
-
-            # Process embed_is_patch
-            vocab = tokenizer.get_vocab()
-            patch_id = vocab.get(processor.img_patch_token, -1)
-            image_end_id = vocab.get(processor.end_of_img_token, -1)
-
-            if patch_id != -1 and image_end_id != -1:
-                input_ids = processor_output["input_ids"].view(-1)
-
-                # Remove BOS token if present
-                if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
-                    input_ids = input_ids[1:]
-
-                # Find image end indices and split input_ids
-                image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
-
-                if image_end_indices.size(0) > 0:
-                    # Split at image boundaries
-                    split_indices = (image_end_indices + 1)[:-1]
-                    split_input_ids = torch.tensor_split(input_ids, split_indices)
-                    split_input_ids = [x for x in split_input_ids if x.numel() > 0]
-
-                    # Create embed_is_patch for each image
-                    embed_is_patch = []
-                    for per_image_input_ids in split_input_ids:
-                        embed_is_patch.append(per_image_input_ids == patch_id)
-
-            processor_output["embed_is_patch"] = embed_is_patch
+                processor_output["embed_is_patch"] = embed_is_patch
 
         # Convert to the format expected by SGLang
         processor_output["input_ids"] = processor_output["input_ids"].tolist()[0]
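The dedented block above builds one boolean patch mask per image by splitting the flat input_ids at each end-of-image token and comparing against the patch token id. A minimal standalone sketch of that splitting step, using made-up token ids (7 for an image patch, 9 for end-of-image) rather than the real Llama 4 vocabulary:

import torch

PATCH_ID, IMAGE_END_ID = 7, 9
input_ids = torch.tensor([1, 7, 7, 9, 2, 7, 9, 3])  # two images with text around them

image_end_indices = (input_ids == IMAGE_END_ID).nonzero().view(-1)
split_indices = (image_end_indices + 1)[:-1]  # cut right after each image except the last
split_input_ids = torch.tensor_split(input_ids, split_indices)
split_input_ids = [x for x in split_input_ids if x.numel() > 0]

# One mask per image: True where the embedding slot holds an image patch token.
embed_is_patch = [chunk == PATCH_ID for chunk in split_input_ids]
print(embed_is_patch)
# [tensor([False,  True,  True, False]), tensor([False,  True, False, False])]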
sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py

@@ -1,12 +1,12 @@
 import logging
 from typing import List, Union
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.phi4mm import Phi4MMForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.phi4mm import Phi4MMForCausalLM
 
 logger = logging.getLogger(__name__)
 
@@ -26,22 +26,12 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
+        audio_data,
         input_text,
         request_obj,
         max_req_input_len,
         **kwargs,
     ):
-        audio_data = request_obj.audio_data
-
-        if not image_data and not audio_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        if not isinstance(audio_data, list):
-            audio_data = [audio_data]
-
         if audio_data:
             logger.warning(
                 "Currently SGLang does not support audio data for Phi4MM. We are working on it. You can file an issue to help us prioritize."
sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py

@@ -6,12 +6,12 @@ from transformers.models.pixtral.image_processing_pixtral import (
     _num_image_tokens as _get_pixtral_hf_num_image_tokens,
 )
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.pixtral import PixtralVisionModel
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.pixtral import PixtralVisionModel
 
 
 class PixtralProcessor(BaseMultimodalProcessor):
@@ -78,12 +78,6 @@ class PixtralProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         mm_data = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.multimodal_tokens,
sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py

@@ -3,19 +3,15 @@ import math
 import re
 from typing import Dict, List, Union
 
-import torch
 from PIL import Image
 
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor as SGLangBaseProcessor,
-)
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 
 # Compatible with Qwen2VL and Qwen2_5VL
@@ -51,9 +47,6 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -132,12 +125,13 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         video_grid_thw = None  # TODO
 
-        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
-        if combined_mm_item is None:
+        if not mm_items:
             # Note(Xinyuan): This is the case where image loading fails.
             return None
 
+        combined_mm_item = mm_items[0]  # only image is supported for now
         video_grid_thw = None  # TODO
         second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)
 
@@ -159,7 +153,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         return {
             "input_ids": input_ids.tolist(),
-            "mm_items": [combined_mm_item],
+            "mm_items": mm_items,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
             "im_token_id": self.IM_TOKEN_ID,
sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py

@@ -10,12 +10,12 @@ from sglang.srt.managers.io_struct import (
     GenerateReqInput,
     ImageDataItem,
 )
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.vila import VILAForConditionalGeneration
 from sglang.srt.server_args import ServerArgs
 
 
@@ -37,6 +37,8 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         _processor: VILAProcessor,
     ) -> None:
        super().__init__(hf_config, server_args, _processor)
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.VIDEO_TOKEN_ID = hf_config.video_token_id
 
     async def process_mm_data_async(
         self,
@@ -46,13 +48,7 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         max_req_input_len: int,
         **kwargs,
     ) -> Optional[Dict[str, Any]]:
-        if not image_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        mm_data = self.load_mm_data(
+        base_output = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=MultimodalSpecialTokens(
                 image_token=self._processor.tokenizer.image_token
@@ -61,25 +57,11 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
         )
 
-        inputs = self.process_mm_data(
-            input_text=mm_data.input_text,
-            images=mm_data.images,
-        )
-
-        image_offsets = self.get_mm_items_offset(
-            input_ids=inputs.input_ids[0],
-            mm_token_id=cast(int, self._processor.tokenizer.image_token_id),
-        )
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
-        mm_items: List[MultimodalDataItem] = [
-            MultimodalDataItem(
-                modality=Modality.IMAGE,
-                image_offsets=image_offsets,
-                pixel_values=inputs.pixel_values,
-            )
-        ]
-
-        return dict(
-            input_ids=inputs.input_ids[0].tolist(),
-            mm_items=mm_items,
-        )
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.IM_TOKEN_ID,
+            "video_token_id": self.VIDEO_TOKEN_ID,
+        }
sglang/srt/operations_strategy.py

@@ -71,7 +71,9 @@ def _compute_moe_deepseek_layer_operations_strategy_tbo(
     assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
     if forward_mode == ForwardMode.EXTEND:
         return _compute_moe_deepseek_blog_prefill(layer)
-    elif forward_mode == ForwardMode.DECODE:
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
         return _compute_moe_deepseek_blog_decode(layer)
     else:
         raise NotImplementedError(f"Unsupported {forward_mode=}")
@@ -146,7 +148,9 @@ def _compute_moe_qwen3_layer_operations_strategy_tbo(
     assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
     if forward_mode == ForwardMode.EXTEND:
         return _compute_moe_qwen3_prefill(layer)
-    elif forward_mode == ForwardMode.DECODE:
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
         return _compute_moe_qwen3_decode(layer)
     else:
         raise NotImplementedError(f"Unsupported {forward_mode=}")
sglang/srt/reasoning_parser.py

@@ -66,6 +66,13 @@ class BaseReasoningFormatDetector:
         self._buffer += new_text
         current_text = self._buffer
 
+        # If the current text is a prefix of the think token, keep buffering
+        if any(
+            token.startswith(current_text) and token != current_text
+            for token in [self.think_start_token, self.think_end_token]
+        ):
+            return StreamingParseResult()
+
         # Strip `<think>` token if present
         if not self.stripped_think_start and self.think_start_token in current_text:
             current_text = current_text.replace(self.think_start_token, "")
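The guard added above keeps a partially streamed tag from leaking out as normal text: as long as the buffered text could still grow into the start or end think token, the detector returns an empty StreamingParseResult and waits for more input. A small standalone illustration of the same prefix test, using the DeepSeek-R1 style tags purely as example values:

think_start_token, think_end_token = "<think>", "</think>"

for current_text in ["<thi", "</thi", "<think>", "hello world"]:
    keep_buffering = any(
        token.startswith(current_text) and token != current_text
        for token in [think_start_token, think_end_token]
    )
    print(repr(current_text), "->", "keep buffering" if keep_buffering else "parse now")
# '<thi' and '</thi' are held back; a complete tag or ordinary text is parsed immediately.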
@@ -150,6 +157,24 @@ class Qwen3Detector(BaseReasoningFormatDetector):
         )
 
 
+class KimiDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Kimi Thinking model.
+    Assumes reasoning format:
+      ◁think▷*(.*)◁/think▷
+    Returns all the text before the ◁/think▷ tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        super().__init__(
+            "◁think▷",
+            "◁/think▷",
+            force_reasoning=False,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -164,6 +189,7 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
+        "kimi": KimiDetector,
     }
 
     def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
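With KimiDetector registered in DetectorMap, the "kimi" model type becomes selectable alongside "deepseek-r1" and "qwen3". A hedged usage sketch (assuming ReasoningParser's non-streaming helper keeps its existing behavior of returning the reasoning text and the remaining answer text):

from sglang.srt.reasoning_parser import ReasoningParser

parser = ReasoningParser(model_type="kimi")
reasoning_text, normal_text = parser.parse_non_stream(
    "◁think▷Check the units before answering.◁/think▷The answer is 42 kg."
)
# reasoning_text is expected to hold the text inside the ◁think▷ ... ◁/think▷ tags,
# and normal_text the remainder ("The answer is 42 kg.").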
sglang/srt/sampling/sampling_batch_info.py

@@ -10,7 +10,6 @@ import torch
 import sglang.srt.sampling.penaltylib as penaltylib
 from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
 from sglang.srt.sampling.sampling_params import TOP_K_ALL
-from sglang.srt.utils import merge_bias_tensor
 
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
@@ -345,3 +344,42 @@ class SamplingBatchInfo:
         self.logit_bias = merge_bias_tensor(
             self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
         )
+
+
+def merge_bias_tensor(
+    lhs: Optional[torch.Tensor],
+    rhs: Optional[torch.Tensor],
+    bs1: int,
+    bs2: int,
+    device: str,
+    default: float,
+):
+    """Merge two bias tensors for batch merging.
+
+    Args:
+        lhs: Left-hand side tensor
+        rhs: Right-hand side tensor
+        bs1: Batch size of left-hand side tensor
+        bs2: Batch size of right-hand side tensor
+        device: Device to place the merged tensor on
+        default: Default value for missing tensor elements
+
+    Returns:
+        Merged tensor or None if both inputs are None
+    """
+    if lhs is None and rhs is None:
+        return None
+
+    if lhs is not None and rhs is not None:
+        return torch.cat([lhs, rhs])
+    else:
+        if lhs is not None:
+            shape, dtype = lhs.shape[1:], lhs.dtype
+        else:
+            shape, dtype = rhs.shape[1:], rhs.dtype
+
+        if lhs is None:
+            lhs = torch.empty((bs1, *shape), device=device, dtype=dtype).fill_(default)
+        if rhs is None:
+            rhs = torch.empty((bs2, *shape), device=device, dtype=dtype).fill_(default)
+        return torch.cat([lhs, rhs])
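merge_bias_tensor, moved here from sglang.srt.utils, pads whichever side has no bias with the default value so the two batches can be concatenated along the batch dimension. A small illustrative call with made-up shapes and values:

import torch

from sglang.srt.sampling.sampling_batch_info import merge_bias_tensor

# Batch 1 (2 requests) carries a logit bias over a 4-token vocab; batch 2 (3 requests) has none.
lhs = torch.tensor([[0.0, -1.0, 0.0, 2.0],
                    [0.5, 0.0, 0.0, 0.0]])
merged = merge_bias_tensor(lhs, None, bs1=2, bs2=3, device="cpu", default=0.0)
print(merged.shape)  # torch.Size([5, 4]); the three missing rows are filled with 0.0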