sglang 0.4.9__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/bench_serving.py +2 -2
  2. sglang/srt/configs/model_config.py +36 -2
  3. sglang/srt/conversation.py +56 -3
  4. sglang/srt/disaggregation/ascend/__init__.py +6 -0
  5. sglang/srt/disaggregation/ascend/conn.py +44 -0
  6. sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
  7. sglang/srt/disaggregation/mooncake/conn.py +50 -18
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
  9. sglang/srt/disaggregation/utils.py +25 -3
  10. sglang/srt/entrypoints/engine.py +1 -1
  11. sglang/srt/entrypoints/http_server.py +1 -0
  12. sglang/srt/entrypoints/http_server_engine.py +1 -1
  13. sglang/srt/entrypoints/openai/protocol.py +11 -0
  14. sglang/srt/entrypoints/openai/serving_chat.py +7 -0
  15. sglang/srt/function_call/function_call_parser.py +2 -0
  16. sglang/srt/function_call/kimik2_detector.py +220 -0
  17. sglang/srt/hf_transformers_utils.py +18 -0
  18. sglang/srt/jinja_template_utils.py +8 -0
  19. sglang/srt/layers/communicator.py +20 -5
  20. sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
  21. sglang/srt/layers/layernorm.py +2 -2
  22. sglang/srt/layers/linear.py +12 -2
  23. sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
  24. sglang/srt/layers/moe/ep_moe/kernels.py +60 -1
  25. sglang/srt/layers/moe/ep_moe/layer.py +141 -2
  26. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
  27. sglang/srt/layers/moe/fused_moe_triton/layer.py +141 -59
  28. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
  29. sglang/srt/layers/moe/topk.py +8 -2
  30. sglang/srt/layers/parameter.py +19 -3
  31. sglang/srt/layers/quantization/__init__.py +2 -0
  32. sglang/srt/layers/quantization/fp8.py +28 -7
  33. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  34. sglang/srt/layers/quantization/modelopt_quant.py +244 -1
  35. sglang/srt/layers/quantization/moe_wna16.py +1 -2
  36. sglang/srt/layers/quantization/w4afp8.py +264 -0
  37. sglang/srt/layers/quantization/w8a8_int8.py +738 -14
  38. sglang/srt/layers/vocab_parallel_embedding.py +9 -3
  39. sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
  40. sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
  41. sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
  42. sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
  43. sglang/srt/managers/cache_controller.py +41 -195
  44. sglang/srt/managers/io_struct.py +35 -3
  45. sglang/srt/managers/mm_utils.py +59 -96
  46. sglang/srt/managers/schedule_batch.py +17 -6
  47. sglang/srt/managers/scheduler.py +38 -6
  48. sglang/srt/managers/tokenizer_manager.py +16 -0
  49. sglang/srt/mem_cache/hiradix_cache.py +2 -0
  50. sglang/srt/mem_cache/memory_pool.py +176 -101
  51. sglang/srt/mem_cache/memory_pool_host.py +6 -109
  52. sglang/srt/mem_cache/radix_cache.py +8 -4
  53. sglang/srt/model_executor/forward_batch_info.py +13 -1
  54. sglang/srt/model_loader/loader.py +23 -12
  55. sglang/srt/models/deepseek_janus_pro.py +1 -1
  56. sglang/srt/models/deepseek_v2.py +78 -19
  57. sglang/srt/models/deepseek_vl2.py +1 -1
  58. sglang/srt/models/gemma3_mm.py +1 -1
  59. sglang/srt/models/gemma3n_mm.py +6 -3
  60. sglang/srt/models/internvl.py +8 -2
  61. sglang/srt/models/kimi_vl.py +8 -2
  62. sglang/srt/models/llama.py +2 -0
  63. sglang/srt/models/llava.py +3 -1
  64. sglang/srt/models/llavavid.py +1 -1
  65. sglang/srt/models/minicpmo.py +1 -2
  66. sglang/srt/models/minicpmv.py +1 -1
  67. sglang/srt/models/mixtral_quant.py +4 -0
  68. sglang/srt/models/mllama4.py +372 -82
  69. sglang/srt/models/phi4mm.py +8 -2
  70. sglang/srt/models/phimoe.py +553 -0
  71. sglang/srt/models/qwen2.py +2 -0
  72. sglang/srt/models/qwen2_5_vl.py +10 -7
  73. sglang/srt/models/qwen2_vl.py +12 -1
  74. sglang/srt/models/vila.py +8 -2
  75. sglang/srt/multimodal/mm_utils.py +2 -2
  76. sglang/srt/multimodal/processors/base_processor.py +197 -137
  77. sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
  78. sglang/srt/multimodal/processors/gemma3.py +4 -2
  79. sglang/srt/multimodal/processors/gemma3n.py +1 -1
  80. sglang/srt/multimodal/processors/internvl.py +1 -1
  81. sglang/srt/multimodal/processors/janus_pro.py +1 -1
  82. sglang/srt/multimodal/processors/kimi_vl.py +1 -1
  83. sglang/srt/multimodal/processors/minicpm.py +4 -3
  84. sglang/srt/multimodal/processors/mllama4.py +63 -61
  85. sglang/srt/multimodal/processors/phi4mm.py +1 -1
  86. sglang/srt/multimodal/processors/pixtral.py +1 -1
  87. sglang/srt/multimodal/processors/qwen_vl.py +203 -80
  88. sglang/srt/multimodal/processors/vila.py +1 -1
  89. sglang/srt/server_args.py +26 -4
  90. sglang/srt/two_batch_overlap.py +3 -0
  91. sglang/srt/utils.py +191 -48
  92. sglang/test/test_cutlass_w4a8_moe.py +281 -0
  93. sglang/utils.py +5 -5
  94. sglang/version.py +1 -1
  95. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +6 -4
  96. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +99 -90
  97. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/mllama4.py CHANGED
@@ -60,70 +60,72 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         )
 
         # Handle image resolutions and aspect ratios
-        if "pixel_values" in processor_output:
-            image_processor = processor.image_processor
-            tokenizer = self._processor.tokenizer
+        if "pixel_values" not in processor_output:  # no image processed
+            return None
 
-            # Calculate tile size and find supported resolutions
-            tile_size = self.vision_config.image_size
-            max_num_tiles = getattr(self.vision_config, "max_patches", 1)
+        image_processor = processor.image_processor
+        tokenizer = self._processor.tokenizer
 
-            possible_resolutions = find_supported_resolutions(
-                max_num_chunks=max_num_tiles,
-                patch_size=SizeDict(height=tile_size, width=tile_size),
+        # Calculate tile size and find supported resolutions
+        tile_size = self.vision_config.image_size
+        max_num_tiles = getattr(self.vision_config, "max_patches", 1)
+
+        possible_resolutions = find_supported_resolutions(
+            max_num_chunks=max_num_tiles,
+            patch_size=SizeDict(height=tile_size, width=tile_size),
+        )
+
+        # Find best fit for each image
+        best_fit_sizes = [
+            get_best_fit(
+                (image.size[1], image.size[0]),  # (height, width)
+                torch.tensor(possible_resolutions),
+                resize_to_max_canvas=image_processor.resize_to_max_canvas,
             )
+            for image in processed_data.images
+        ]
+
+        # Calculate aspect ratios and patches per image
+        aspect_ratios = [
+            (image_size[0] // tile_size, image_size[1] // tile_size)
+            for image_size in best_fit_sizes
+        ]
+
+        patches_per_image = [
+            1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
+        ]
+
+        # Add to image_inputs
+        processor_output["aspect_ratios"] = aspect_ratios
+        processor_output["patches_per_image"] = torch.tensor(patches_per_image)
+
+        # Process embed_is_patch
+        vocab = tokenizer.get_vocab()
+        patch_id = vocab.get(processor.img_patch_token, -1)
+        image_end_id = vocab.get(processor.end_of_img_token, -1)
+
+        if patch_id != -1 and image_end_id != -1:
+            input_ids = processor_output["input_ids"].view(-1)
+
+            # Remove BOS token if present
+            if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
+                input_ids = input_ids[1:]
+
+            # Find image end indices and split input_ids
+            image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
+
+            if image_end_indices.size(0) > 0:
+                # Split at image boundaries
+                split_indices = (image_end_indices + 1)[:-1]
+                split_input_ids = torch.tensor_split(input_ids, split_indices)
+                split_input_ids = [x for x in split_input_ids if x.numel() > 0]
+
+                # Create embed_is_patch for each image
+                embed_is_patch = []
+                for per_image_input_ids in split_input_ids:
+                    embed_is_patch.append(per_image_input_ids == patch_id)
 
-            # Find best fit for each image
-            best_fit_sizes = [
-                get_best_fit(
-                    (image.size[1], image.size[0]),  # (height, width)
-                    torch.tensor(possible_resolutions),
-                    resize_to_max_canvas=image_processor.resize_to_max_canvas,
-                )
-                for image in processed_data.images
-            ]
-
-            # Calculate aspect ratios and patches per image
-            aspect_ratios = [
-                (image_size[0] // tile_size, image_size[1] // tile_size)
-                for image_size in best_fit_sizes
-            ]
-
-            patches_per_image = [
-                1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
-            ]
-
-            # Add to image_inputs
-            processor_output["aspect_ratios"] = aspect_ratios
-            processor_output["patches_per_image"] = torch.tensor(patches_per_image)
-
-            # Process embed_is_patch
-            vocab = tokenizer.get_vocab()
-            patch_id = vocab.get(processor.img_patch_token, -1)
-            image_end_id = vocab.get(processor.end_of_img_token, -1)
-
-            if patch_id != -1 and image_end_id != -1:
-                input_ids = processor_output["input_ids"].view(-1)
-
-                # Remove BOS token if present
-                if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
-                    input_ids = input_ids[1:]
-
-                # Find image end indices and split input_ids
-                image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
-
-                if image_end_indices.size(0) > 0:
-                    # Split at image boundaries
-                    split_indices = (image_end_indices + 1)[:-1]
-                    split_input_ids = torch.tensor_split(input_ids, split_indices)
-                    split_input_ids = [x for x in split_input_ids if x.numel() > 0]
-
-                    # Create embed_is_patch for each image
-                    embed_is_patch = []
-                    for per_image_input_ids in split_input_ids:
-                        embed_is_patch.append(per_image_input_ids == patch_id)
-
-                    processor_output["embed_is_patch"] = embed_is_patch
+                processor_output["embed_is_patch"] = embed_is_patch
 
         # Convert to the format expected by SGLang
         processor_output["input_ids"] = processor_output["input_ids"].tolist()[0]
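To make the tile arithmetic concrete, here is a standalone sketch of the logic above. Everything in it is illustrative, not read from the package: the 336px tile size is an assumed typical value for a Llama-4-style vision tower, and the token ids are made up.

    import torch

    # Tile bookkeeping, mirroring the new code path above.
    tile_size = 336                      # assumed vision_config.image_size
    best_fit = (672, 1008)               # (height, width) chosen by get_best_fit
    r_h, r_w = best_fit[0] // tile_size, best_fit[1] // tile_size  # -> (2, 3)
    patches = 1 if r_h * r_w == 1 else 1 + r_h * r_w  # -> 7 (6 tiles + 1 global)

    # embed_is_patch: split a flat id sequence at image-end boundaries.
    PATCH, IMG_END, TEXT = 7, 9, 5       # made-up token ids
    input_ids = torch.tensor([PATCH, PATCH, IMG_END, TEXT, PATCH, IMG_END])
    ends = (input_ids == IMG_END).nonzero().view(-1)
    chunks = torch.tensor_split(input_ids, (ends + 1)[:-1])
    chunks = [c for c in chunks if c.numel() > 0]
    embed_is_patch = [c == PATCH for c in chunks]  # one boolean mask per image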
@@ -142,7 +144,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
             MultimodalDataItem(
                 pixel_values=processor_output["pixel_values"],
                 modality=Modality.IMAGE,
-                image_offsets=image_offsets,
+                offsets=image_offsets,
             )
         ]
 
sglang/srt/multimodal/processors/phi4mm.py CHANGED
@@ -65,7 +65,7 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
                 pixel_values=res["input_image_embeds"],
                 image_sizes=res["image_sizes"],
                 image_emb_mask=res["image_attention_mask"],
-                image_offsets=image_offsets,
+                offsets=image_offsets,
                 modality=Modality.IMAGE,
             )
         ]
sglang/srt/multimodal/processors/pixtral.py CHANGED
@@ -106,7 +106,7 @@ class PixtralProcessor(BaseMultimodalProcessor):
                 pixel_values=processor_output["pixel_values"],
                 image_sizes=processor_output["image_sizes"],
                 modality=Modality.IMAGE,
-                image_offsets=image_offsets,
+                offsets=image_offsets,
             )
         ]
 
sglang/srt/multimodal/processors/qwen_vl.py CHANGED
@@ -1,9 +1,13 @@
 import asyncio
 import math
+import os
 import re
-from typing import Dict, List, Union
+from typing import List, Union
 
+import torch
+import torchvision
 from PIL import Image
+from torchvision.transforms import InterpolationMode
 
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
@@ -12,6 +16,185 @@ from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
 from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+from sglang.utils import logger
+
+IMAGE_FACTOR = 28
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+VIDEO_TOTAL_PIXELS = int(
+    float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
+)
+
+VIDEO_MIN_PIXELS = 128 * 28 * 28
+VIDEO_MAX_PIXELS = 768 * 28 * 28
+FRAME_FACTOR = 2
+FPS = 2.0
+FPS_MIN_FRAMES = 4
+FPS_MAX_FRAMES = 768
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
+) -> tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return h_bar, w_bar
+
+
+def resize_image(image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
+    width, height = image.size
+    min_pixels = MIN_PIXELS
+    max_pixels = MAX_PIXELS
+    resized_height, resized_width = smart_resize(
+        height,
+        width,
+        factor=size_factor,
+        min_pixels=min_pixels,
+        max_pixels=max_pixels,
+    )
+    image = image.resize((resized_width, resized_height))
+    return image
+
+
+def round_by_factor(number: int, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+
+async def resize_image_async(image):
+    return resize_image(image)
+
+
+def smart_nframes(
+    ele: dict,
+    total_frames: int,
+    video_fps: int | float,
+) -> int:
+    """calculate the number of frames for video used for model inputs.
+
+    Args:
+        ele (dict): a dict contains the configuration of video.
+            support either `fps` or `nframes`:
+                - nframes: the number of frames to extract for model inputs.
+                - fps: the fps to extract frames for model inputs.
+                    - min_frames: the minimum number of frames of the video, only used when fps is provided.
+                    - max_frames: the maximum number of frames of the video, only used when fps is provided.
+        total_frames (int): the original total number of frames of the video.
+        video_fps (int | float): the original fps of the video.
+
+    Raises:
+        ValueError: nframes should in interval [FRAME_FACTOR, total_frames].
+
+    Returns:
+        int: the number of frames for video used for model inputs.
+    """
+    assert not (
+        "fps" in ele and "nframes" in ele
+    ), "Only accept either `fps` or `nframes`"
+    if "nframes" in ele:
+        nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
+    else:
+        fps = ele.get("fps", FPS)
+        min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
+        max_frames = floor_by_factor(
+            ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR
+        )
+        nframes = total_frames / video_fps * fps
+        if nframes > total_frames:
+            logger.warning(
+                f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]"
+            )
+        nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
+        nframes = floor_by_factor(nframes, FRAME_FACTOR)
+    if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
+        raise ValueError(
+            f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}."
+        )
+    return nframes
+
+
+# process video, qwen-specific
+async def preprocess_video(
+    vr,
+    image_factor: int = IMAGE_FACTOR,
+    # vr: VideoReader, image_factor: int = IMAGE_FACTOR
+) -> torch.Tensor:
+    ele = {}
+    total_frames, video_fps = len(vr), vr.get_avg_fps()
+    nframes = smart_nframes({}, total_frames=total_frames, video_fps=video_fps)
+    idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
+    video = vr.get_batch(idx).asnumpy()
+    video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
+    nframes, _, height, width = video.shape
+    min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
+    total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
+    max_pixels = max(
+        min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR),
+        int(min_pixels * 1.05),
+    )
+    max_pixels_supposed = ele.get("max_pixels", max_pixels)
+    if max_pixels_supposed > max_pixels:
+        logger.warning(
+            f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}]."
+        )
+    max_pixels = min(max_pixels_supposed, max_pixels)
+    if "resized_height" in ele and "resized_width" in ele:
+        resized_height, resized_width = smart_resize(
+            ele["resized_height"],
+            ele["resized_width"],
+            factor=image_factor,
+        )
+    else:
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=image_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+    video = torchvision.transforms.functional.resize(
+        video,
+        [resized_height, resized_width],
+        interpolation=InterpolationMode.BICUBIC,
+        antialias=True,
+    ).float()
+    return video
 
 
 # Compatible with Qwen2VL and Qwen2_5VL
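As a sanity check on the arithmetic above, the following standalone sketch traces smart_resize and smart_nframes by hand for illustrative inputs (a 1080x1920 frame and a 10-second, 30 fps clip; the numbers are computed here, not read from the package):

    # smart_resize(1080, 1920) with factor=28:
    #   h_bar = round(1080 / 28) * 28 = 39 * 28 = 1092
    #   w_bar = round(1920 / 28) * 28 = 69 * 28 = 1932
    #   1092 * 1932 = 2,109,744 pixels, inside [4*28*28, 16384*28*28],
    #   so the frame is resized to (1092, 1932).
    assert round(1080 / 28) * 28 == 1092 and round(1920 / 28) * 28 == 1932

    # smart_nframes({}, total_frames=300, video_fps=30) with the default fps=2.0:
    #   nframes = 300 / 30 * 2 = 20, already within [FPS_MIN_FRAMES=4,
    #   FPS_MAX_FRAMES=768] and a multiple of FRAME_FACTOR=2, so preprocess_video
    #   samples 20 evenly spaced frames via torch.linspace(0, 299, 20).
    assert min(min(max(300 / 30 * 2.0, 4), 768), 300) == 20.0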
@@ -37,104 +220,44 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.MIN_PIXELS = 4 * 28 * 28
         self.MAX_PIXELS = 16384 * 28 * 28
         self.MAX_RATIO = 200
+        # TODO(mick): move all MultimodalSpecialTokens initializations into processor init
+        self.mm_special_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_regex=self.IMAGE_TOKEN_REGEX,
+            video_token=self.VIDEO_TOKEN_ID,
+        )
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes, Dict]],
+        image_data: List[Union[str, bytes]],
         input_text,
         request_obj,
         max_req_input_len,
         *args,
         **kwargs,
     ):
+
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token=self.IMAGE_TOKEN,
-                image_token_regex=self.IMAGE_TOKEN_REGEX,
-            ),
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_special_tokens,
             max_req_input_len=max_req_input_len,
         )
 
-        def smart_resize(
-            height: int,
-            width: int,
-            factor: int = self.IMAGE_FACTOR,
-            min_pixels: int = self.MIN_PIXELS,
-            max_pixels: int = self.MAX_PIXELS,
-        ) -> tuple[int, int]:
-            """
-            Rescales the image so that the following conditions are met:
-
-            1. Both dimensions (height and width) are divisible by 'factor'.
-
-            2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
-            3. The aspect ratio of the image is maintained as closely as possible.
-            """
-            if max(height, width) / min(height, width) > self.MAX_RATIO:
-                raise ValueError(
-                    f"absolute aspect ratio must be smaller than {self.MAX_RATIO}, got {max(height, width) / min(height, width)}"
-                )
-            h_bar = max(factor, round_by_factor(height, factor))
-            w_bar = max(factor, round_by_factor(width, factor))
-            if h_bar * w_bar > max_pixels:
-                beta = math.sqrt((height * width) / max_pixels)
-                h_bar = floor_by_factor(height / beta, factor)
-                w_bar = floor_by_factor(width / beta, factor)
-            elif h_bar * w_bar < min_pixels:
-                beta = math.sqrt(min_pixels / (height * width))
-                h_bar = ceil_by_factor(height * beta, factor)
-                w_bar = ceil_by_factor(width * beta, factor)
-            return h_bar, w_bar
-
-        def resize_image(image, size_factor: int = self.IMAGE_FACTOR) -> Image.Image:
-            width, height = image.size
-            min_pixels = self.MIN_PIXELS
-            max_pixels = self.MAX_PIXELS
-            resized_height, resized_width = smart_resize(
-                height,
-                width,
-                factor=size_factor,
-                min_pixels=min_pixels,
-                max_pixels=max_pixels,
-            )
-            image = image.resize((resized_width, resized_height))
-            return image
-
-        def round_by_factor(number: int, factor: int) -> int:
-            """Returns the closest integer to 'number' that is divisible by 'factor'."""
-            return round(number / factor) * factor
-
-        def ceil_by_factor(number: int, factor: int) -> int:
-            """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-            return math.ceil(number / factor) * factor
-
-        def floor_by_factor(number: int, factor: int) -> int:
-            """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-            return math.floor(number / factor) * factor
-
-        async def resize_image_async(image):
-            return resize_image(image)
-
         # Qwen-specific: resize images if they are raw Image objects
         if base_output.images and isinstance(base_output.images[0], Image.Image):
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
-        video_grid_thw = None  # TODO
-
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
-
-        if not mm_items:
-            # Note(Xinyuan): This is the case where image loading fails.
-            return None
+        if base_output.videos:
+            base_output.videos = [
+                await preprocess_video(video) for video in base_output.videos
+            ]
 
-        combined_mm_item = mm_items[0]  # only image is supported for now
-        video_grid_thw = None  # TODO
-        second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(base_output)
 
+        input_ids = input_ids.flatten()
         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
             spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
             image_token_id=self.IM_TOKEN_ID,
@@ -145,9 +268,9 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
             input_ids=input_ids.unsqueeze(0),
-            image_grid_thw=combined_mm_item.image_grid_thw,
-            video_grid_thw=video_grid_thw,
-            second_per_grid_ts=second_per_grid_ts,
+            image_grid_thw=getattr(ret, "image_grid_thw", None),
+            video_grid_thw=getattr(ret, "video_grid_thw", None),
+            second_per_grid_ts=getattr(ret, "second_per_grid_ts", None),
         )
         mrope_positions = mrope_positions.squeeze(1)
 
sglang/srt/multimodal/processors/vila.py CHANGED
@@ -57,7 +57,7 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),
sglang/srt/server_args.py CHANGED
@@ -217,11 +217,13 @@ class ServerArgs:
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
+    hicache_io_backend: str = ""
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
+    enable_triton_kernel_moe: bool = False
     warmups: Optional[str] = None
 
     # Debug tensor dumps
@@ -414,7 +416,7 @@ class ServerArgs:
         if self.enable_dp_lm_head:
             assert (
                 self.enable_dp_attention
-            ), "Please enable dp attention when setting enable_dp_attention. "
+            ), "Please enable dp attention when setting enable_dp_lm_head. "
 
         # DeepEP MoE
         if self.enable_deepep_moe:
@@ -706,6 +708,7 @@ class ServerArgs:
                 "w8a8_fp8",
                 "moe_wna16",
                 "qoq",
+                "w4afp8",
             ],
             help="The quantization method.",
         )
@@ -1045,9 +1048,16 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
+            choices=[
+                "qwen25",
+                "mistral",
+                "llama3",
+                "deepseekv3",
+                "pythonic",
+                "kimi_k2",
+            ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and 'kimi_k2'.",
         )
 
         # Data parallelism
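The new "kimi_k2" choice pairs with the kimik2_detector.py added in this release (file 16 in the list above). As a sketch only, a launch using the flag might look like the following; the model path is a placeholder and subprocess merely wraps the usual `python -m sglang.launch_server` CLI entrypoint:

    # Sketch: enable the new Kimi-K2 tool-call parser at launch time.
    import subprocess

    subprocess.run([
        "python", "-m", "sglang.launch_server",
        "--model-path", "moonshotai/Kimi-K2-Instruct",  # placeholder model path
        "--tool-call-parser", "kimi_k2",                # new in 0.4.9.post2
    ])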
@@ -1529,6 +1539,13 @@ class ServerArgs:
             default=ServerArgs.hicache_write_policy,
             help="The write policy of hierarchical cache.",
         )
+        parser.add_argument(
+            "--hicache-io-backend",
+            type=str,
+            choices=["direct", "kernel"],
+            default=ServerArgs.hicache_io_backend,
+            help="The IO backend for KV cache transfer between CPU and GPU",
+        )
         parser.add_argument(
             "--flashinfer-mla-disable-ragged",
             action="store_true",
@@ -1554,6 +1571,11 @@ class ServerArgs:
             action="store_true",
             help="Enable returning hidden states with responses.",
         )
+        parser.add_argument(
+            "--enable-triton-kernel-moe",
+            action="store_true",
+            help="Use triton moe grouped gemm kernel.",
+        )
         parser.add_argument(
             "--warmups",
             type=str,
@@ -1599,7 +1621,7 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
-            choices=["mooncake", "nixl"],
+            choices=["mooncake", "nixl", "ascend"],
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
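Taken together, the server_args hunks above expose several new knobs. The sketch below shows them in one hypothetical command; the flag names and value choices come from the argparse definitions in this diff, but the model path is a placeholder and not every combination is necessarily meaningful on all hardware (the "ascend" backend, for instance, only matters in a prefill/decode disaggregation setup):

    # Sketch: other flags introduced in 0.4.9.post2, shown together.
    import subprocess

    subprocess.run([
        "python", "-m", "sglang.launch_server",
        "--model-path", "deepseek-ai/DeepSeek-V3",  # placeholder model path
        "--hicache-io-backend", "kernel",           # new: "direct" or "kernel"
        "--enable-triton-kernel-moe",               # new: triton grouped-gemm MoE
        "--quantization", "w4afp8",                 # new quantization choice
    ])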
sglang/srt/two_batch_overlap.py CHANGED
@@ -490,6 +490,7 @@ class TboForwardBatchPreparer:
         output_dict["spec_info"] = output_spec_info
         for key in [
             "forward_mode",
+            "is_extend_in_batch",
             "return_logprob",
             "req_to_token_pool",
             "token_to_kv_pool",
@@ -550,6 +551,8 @@ class TboForwardBatchPreparer:
                 top_p_normalized_logprobs=False,
                 top_p=None,
                 mm_inputs=None,
+                top_logprobs_nums=None,
+                token_ids_logprobs=None,
             )
         )
 