sglang 0.4.9.post6__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/model_config.py +3 -0
- sglang/srt/configs/step3_vl.py +172 -0
- sglang/srt/conversation.py +23 -0
- sglang/srt/disaggregation/decode.py +2 -8
- sglang/srt/disaggregation/prefill.py +2 -6
- sglang/srt/distributed/parallel_state.py +86 -1
- sglang/srt/entrypoints/engine.py +14 -18
- sglang/srt/entrypoints/http_server.py +10 -2
- sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- sglang/srt/eplb/expert_distribution.py +5 -0
- sglang/srt/eplb/expert_location.py +17 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -0
- sglang/srt/eplb/expert_location_updater.py +2 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/step3_detector.py +436 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/jinja_template_utils.py +4 -1
- sglang/srt/layers/moe/cutlass_moe.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +20 -640
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
- sglang/srt/layers/quantization/fp8.py +0 -18
- sglang/srt/layers/quantization/unquant.py +0 -8
- sglang/srt/layers/quantization/w4afp8.py +1 -0
- sglang/srt/managers/cache_controller.py +143 -45
- sglang/srt/managers/data_parallel_controller.py +2 -0
- sglang/srt/managers/io_struct.py +0 -2
- sglang/srt/managers/scheduler.py +89 -671
- sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- sglang/srt/managers/template_manager.py +62 -19
- sglang/srt/managers/tokenizer_manager.py +123 -74
- sglang/srt/managers/tp_worker.py +4 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- sglang/srt/mem_cache/hicache_storage.py +45 -11
- sglang/srt/mem_cache/hiradix_cache.py +15 -4
- sglang/srt/mem_cache/memory_pool_host.py +73 -1
- sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- sglang/srt/model_executor/model_runner.py +5 -0
- sglang/srt/models/arcee.py +532 -0
- sglang/srt/models/deepseek_v2.py +2 -0
- sglang/srt/models/glm4_moe.py +3 -1
- sglang/srt/models/granitemoe.py +3 -0
- sglang/srt/models/grok.py +3 -0
- sglang/srt/models/hunyuan.py +1 -0
- sglang/srt/models/llama4.py +3 -0
- sglang/srt/models/mixtral.py +3 -0
- sglang/srt/models/olmoe.py +3 -0
- sglang/srt/models/phimoe.py +1 -0
- sglang/srt/models/step3_vl.py +994 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang/srt/multimodal/processors/step3_vl.py +515 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +10 -13
- sglang/srt/speculative/eagle_worker.py +2 -0
- sglang/utils.py +0 -11
- sglang/version.py +1 -1
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/METADATA +3 -4
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/RECORD +69 -56
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/base_processor.py
CHANGED
@@ -176,6 +176,8 @@ class BaseMultimodalProcessor(ABC):
             "image_grid_hws": Modality.IMAGE,
             "aspect_ratio_ids": Modality.IMAGE,
             "aspect_ratio_mask": Modality.IMAGE,
+            "num_patches": Modality.IMAGE,
+            "patch_pixel_values": Modality.IMAGE,
             # Audio-related attributes
             "audio_features": Modality.AUDIO,
             "audio_feature_lens": Modality.AUDIO,
@@ -192,7 +194,12 @@ class BaseMultimodalProcessor(ABC):
 
         # name of the feature filed
         # TODO: pass from processors
-        self.FEATURE_NAMES = [
+        self.FEATURE_NAMES = [
+            "pixel_values",
+            "pixel_values_videos",
+            "audio_features",
+            "input_features",
+        ]
 
     def process_mm_data(
         self, input_text, images=None, videos=None, audios=None, **kwargs
@@ -221,6 +228,13 @@ class BaseMultimodalProcessor(ABC):
             return_tensors="pt",
             **kwargs,
         )
+        # move feature tensors to cpu
+        for feature_name in self.FEATURE_NAMES:
+            if feature_name in result and isinstance(
+                result[feature_name], torch.Tensor
+            ):
+                result[feature_name] = result[feature_name].to("cpu")
+
         return result
 
     @abstractmethod
@@ -623,19 +637,4 @@ class BaseMultimodalProcessor(ABC):
             mm_token_id=mm_token_id,
         )
 
-        # post-process
-        for item in all_collected_items:
-            # replace the feature tensor with a proxy
-            if isinstance(item.feature, torch.Tensor) and item.feature.is_cuda:
-                item.feature = TransportProxyTensor(
-                    transport_mode=self.transport_mode, data=item.feature
-                )
-            elif (
-                isinstance(item.precomputed_embeddings, torch.Tensor)
-                and item.precomputed_embeddings.is_cuda
-            ):
-                item.precomputed_embeddings = TransportProxyTensor(
-                    transport_mode=self.transport_mode, data=item.precomputed_embeddings
-                )
-
         return all_collected_items, input_ids, ret
sglang/srt/multimodal/processors/step3_vl.py
ADDED
@@ -0,0 +1,515 @@
+import math
+import re
+from itertools import product
+from typing import List, Literal, Optional, TypedDict, Union
+
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms import InterpolationMode
+from transformers import BatchFeature, TensorType
+
+from sglang.srt.models.step3_vl import Step3VLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+
+ImageWithPatches = tuple[Image.Image, list[Image.Image], list[int] | None]
+
+
+class GPUToTensor(torch.nn.Module):
+
+    def forward(self, raw_image: Union[np.ndarray, Image.Image]) -> torch.Tensor:
+        if isinstance(raw_image, Image.Image):
+            return transforms.ToTensor()(raw_image)
+        if raw_image.ndim == 2:
+            raw_image = raw_image[:, :, None].repeat(3, -1)
+        if torch.cuda.is_available():
+            device = torch.device("cuda")
+        else:
+            device = torch.device("cpu")
+        image_tensor = torch.from_numpy(raw_image).to(device)
+        image_tensor = torch.permute(image_tensor, (2, 0, 1)).contiguous()
+        if image_tensor.dtype == torch.uint8:
+            image_tensor = image_tensor.to(torch.float32).div(255)
+        return image_tensor
+
+
+class Step3VisionProcessor:
+    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
+        mean = [0.48145466, 0.4578275, 0.40821073]
+        std = [0.26862954, 0.26130258, 0.27577711]
+        patch_size = patch_size if patch_size is not None else size
+
+        self.transform = transforms.Compose(
+            [
+                GPUToTensor(),
+                transforms.Normalize(mean, std),
+                transforms.Resize(
+                    (size, size),
+                    interpolation=(
+                        InterpolationMode.BICUBIC
+                        if interpolation_mode == "bicubic"
+                        else InterpolationMode.BILINEAR
+                    ),
+                    antialias=True,
+                ),
+            ]
+        )
+
+        self.patch_transform = (
+            transforms.Compose(
+                [
+                    GPUToTensor(),
+                    transforms.Normalize(mean, std),
+                    transforms.Resize(
+                        (patch_size, patch_size),
+                        interpolation=(
+                            InterpolationMode.BICUBIC
+                            if interpolation_mode == "bicubic"
+                            else InterpolationMode.BILINEAR
+                        ),
+                        antialias=True,
+                    ),
+                ]
+            )
+            if patch_size is not None
+            else None
+        )
+
+    def __call__(self, image, is_patch=False):
+        if is_patch:
+            return {"pixel_values": self.patch_transform(image).unsqueeze(0)}
+        else:
+            return {"pixel_values": self.transform(image).unsqueeze(0)}
+
+
+class ImagePatcher:
+
+    def determine_window_size(self, long: int, short: int) -> int:
+        if long <= 728:
+            return short if long / short > 1.5 else 0
+        return min(short, 504) if long / short > 4 else 504
+
+    def slide_window(
+        self,
+        width: int,
+        height: int,
+        sizes: list[tuple[int, int]],
+        steps: list[tuple[int, int]],
+        img_rate_thr: float = 0.6,
+    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
+        assert 1 >= img_rate_thr >= 0, "The `in_rate_thr` should lie in 0~1"
+        windows = []
+        # Sliding windows.
+        for size, step in zip(sizes, steps):
+            size_w, size_h = size
+            step_w, step_h = step
+
+            x_num = 1 if width <= size_w else math.ceil((width - size_w) / step_w + 1)
+            x_start = [step_w * i for i in range(x_num)]
+            if len(x_start) > 1 and x_start[-1] + size_w > width:
+                x_start[-1] = width - size_w
+
+            y_num = 1 if height <= size_h else math.ceil((height - size_h) / step_h + 1)
+            y_start = [step_h * i for i in range(y_num)]
+            if len(y_start) > 1 and y_start[-1] + size_h > height:
+                y_start[-1] = height - size_h
+
+            start = np.array(list(product(y_start, x_start)), dtype=int)
+            start[:, [0, 1]] = start[:, [1, 0]]
+            windows.append(np.concatenate([start, start + size], axis=1))
+        windows = np.concatenate(windows, axis=0)
+
+        return [
+            (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
+            for box in windows
+        ], (x_num, y_num)
+
+    def square_pad(self, img: Image.Image) -> Image.Image:
+        w, h = img.size
+        if w == h:
+            return img
+        size = max(w, h)
+        padded = Image.new(img.mode, (size, size), 0)
+        padded.paste(img, (0, 0))
+        return padded
+
+    def get_image_size_for_padding(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
+        ratio = img_width / img_height
+        if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
+            new_size = max(img_height, img_width)
+            return new_size, new_size
+        return img_width, img_height
+
+    def get_image_size_for_preprocess(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
+
+        if max(img_height, img_width) > 3024:
+            scale_factor = 3024 / max(img_height, img_width)
+            img_width = int(img_width * scale_factor)
+            img_height = int(img_height * scale_factor)
+            return img_width, img_height
+        else:
+            return img_width, img_height
+
+    def get_image_size_for_crop(
+        self, img_width: int, img_height: int, window_size: int
+    ):
+        w_ratio = img_width / window_size
+        h_ratio = img_height / window_size
+
+        if w_ratio < 1:
+            width_new = img_width
+        else:
+            decimal_w = w_ratio - img_width // window_size
+            w_ratio = int(w_ratio) + 1 if decimal_w > 0.2 else int(w_ratio)
+            width_new = window_size * w_ratio
+        if h_ratio < 1:
+            height_new = img_height
+        else:
+            decimal_h = h_ratio - img_height // window_size
+            h_ratio = int(h_ratio) + 1 if decimal_h > 0.2 else int(h_ratio)
+            height_new = window_size * h_ratio
+        return int(width_new), int(height_new)
+
+    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
+        target = img.crop((j, i, j + tw, i + th))
+        return target
+
+    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
+        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
+        img_width, img_height = self.get_image_size_for_preprocess(
+            img_width, img_height
+        )
+        window_size = self.determine_window_size(
+            max(img_height, img_width), min(img_height, img_width)
+        )
+        if window_size == 0:
+            return 0, 0
+        else:
+            img_width, img_height = self.get_image_size_for_crop(
+                img_width, img_height, window_size
+            )
+            center_list, (x_num, y_num) = self.slide_window(
+                img_width,
+                img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
+            full_rows = (len(center_list) - 1) // x_num + 1
+            if len(center_list) > 0 and len(center_list) % x_num == 0:
+                full_rows -= 1
+            return len(center_list), full_rows
+
+    def __call__(
+        self, img: Image.Image
+    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
+        img_width, img_height = img.size
+        new_img_width, new_img_height = self.get_image_size_for_padding(
+            img_width, img_height
+        )
+        if new_img_width != img_width or new_img_height != img_height:
+            img = self.square_pad(img)
+            img_width, img_height = img.size
+
+        new_img_width, new_img_height = self.get_image_size_for_preprocess(
+            img_width, img_height
+        )
+        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
+        window_size = self.determine_window_size(
+            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
+        )
+        if window_size == 0:
+            return img, [], None
+        else:
+            new_img_width, new_img_height = self.get_image_size_for_crop(
+                new_img_width, new_img_height, window_size
+            )
+            if (new_img_width, new_img_height) != (img_width, img_height):
+                img_for_crop = img.resize(
+                    (new_img_width, new_img_height), Image.Resampling.BILINEAR
+                )
+            else:
+                img_for_crop = img
+
+            patches = []
+            newlines = []
+            center_list, (x_num, y_num) = self.slide_window(
+                new_img_width,
+                new_img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
+            for patch_id, center_lf_point in enumerate(center_list):
+                x, y, patch_w, patch_h = center_lf_point
+                big_patch = self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
+                patches.append(big_patch)
+                if (patch_id + 1) % x_num == 0:
+                    newlines.append(patch_id)
+
+            if newlines and newlines[-1] == len(patches) - 1:
+                newlines.pop()
+
+            return (
+                img,
+                patches,
+                (
+                    [i in newlines for i in range(len(patches))]
+                    if len(patches) > 0
+                    else None
+                ),
+            )
+
+
+class Step3VLProcessor:
+    def __init__(
+        self,
+        config,
+        tokenizer,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.tokenizer = tokenizer
+
+        self.image_size = 728
+        self.patch_size = 504
+        self.image_preprocessor = Step3VisionProcessor(
+            self.image_size, "bilinear", self.patch_size
+        )
+
+        self.num_image_feature_size = 169
+        self.num_patch_feature_size = 81
+        self.image_token = "<im_patch>"
+        self.image_feature_placeholder = self.image_token * self.num_image_feature_size
+        self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size
+
+        self.patcher = ImagePatcher()
+
+    @property
+    def image_token_id(self) -> int:
+        return self.tokenizer.get_vocab()[self.image_token]
+
+    def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
+        num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)
+
+        return (
+            num_patches * (self.num_patch_feature_size + 2)
+            + self.num_image_feature_size
+            + 2
+            + num_newlines
+        )
+
+    def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
+        result = []
+        for img in images:
+            result.append(self.patcher(img))
+        return result
+
+    def _convert_images_to_pixel_values(
+        self,
+        images: list[Image.Image],
+        is_patch: bool = False,
+    ) -> list[torch.Tensor]:
+        return [
+            self.image_preprocessor(img, is_patch=is_patch)["pixel_values"]
+            for img in images
+        ]
+
+    def _get_patch_repl(
+        self,
+        num_patches: int,
+        patch_newline_mask: list[bool] | None,
+    ) -> tuple[str, list[int]]:
+        text = ""
+        token_ids = []
+        for i in range(num_patches):
+            assert len(patch_newline_mask) == num_patches
+            text += f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
+            token_ids.extend(
+                [self.tokenizer.convert_tokens_to_ids("<patch_start>")]
+                + [self.image_token_id] * self.num_patch_feature_size
+                + [self.tokenizer.convert_tokens_to_ids("<patch_end>")]
+            )
+            if patch_newline_mask and patch_newline_mask[i]:
+                text += "<patch_newline>"
+                token_ids.append(
+                    self.tokenizer.convert_tokens_to_ids("<patch_newline>")
+                )
+        return text, token_ids
+
+    def _get_image_repl(
+        self,
+        num_images: int,
+    ) -> tuple[str, list[int]]:
+        text = f"<im_start>{self.image_feature_placeholder}<im_end>"
+        token_ids = (
+            [self.tokenizer.convert_tokens_to_ids("<im_start>")]
+            + [self.image_token_id] * self.num_image_feature_size
+            + [self.tokenizer.convert_tokens_to_ids("<im_end>")]
+        )
+        return text * num_images, token_ids * num_images
+
+    def _get_image_repl_features(
+        self,
+        num_images: int,
+        num_patches: int,
+        patch_new_line_idx: Optional[list[bool]],
+    ) -> tuple[str, list[int]]:
+        if num_patches > 0:
+            patch_repl, patch_repl_ids = self._get_patch_repl(
+                num_patches, patch_new_line_idx
+            )
+        else:
+            patch_repl = ""
+            patch_repl_ids = []
+        image_repl, image_repl_ids = self._get_image_repl(num_images)
+        return patch_repl + image_repl, patch_repl_ids + image_repl_ids
+
+    def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
+        parts = text.split(placeholder)
+
+        if len(parts) - 1 != len(repls):
+            raise ValueError(
+                "The number of placeholders does not match the number of replacements."  # noqa: E501
+            )
+
+        result = [parts[0]]
+        for i, repl in enumerate(repls):
+            result.append(repl)
+            result.append(parts[i + 1])
+
+        return "".join(result)
+
+    def __call__(
+        self,
+        text: Optional[Union[str, list[str]]] = None,
+        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        *args,
+        **kwargs,
+    ) -> BatchFeature:
+        if text is None:
+            text = []
+        if not isinstance(text, list):
+            text = [text]
+        if images is None:
+            images = []
+        if not isinstance(images, list):
+            images = [images]
+
+        if len(images) == 0:
+            image_inputs = {}
+            text_inputs = self.tokenizer(text)
+        else:
+            splitted_images_data = self._split_images(images)
+            pixel_values_lst = []
+            patch_pixel_values_lst = []
+            patch_newline_mask_lst = []
+            image_repl_str_lst = []
+            image_repl_ids_lst = []
+            num_patches = []
+            for (
+                raw_img,
+                img_patches,
+                patch_newline_mask,
+            ) in splitted_images_data:  # noqa: E501
+                pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
+
+                if len(img_patches) > 0:
+                    patch_pixel_values_lst.extend(
+                        self._convert_images_to_pixel_values(img_patches, is_patch=True)
+                    )
+                num_patches.append(len(img_patches))
+
+                image_repl_str, image_repl_ids = self._get_image_repl_features(
+                    1, len(img_patches), patch_newline_mask
+                )
+                image_repl_str_lst.append(image_repl_str)
+                image_repl_ids_lst.extend(image_repl_ids)
+
+                if patch_newline_mask is not None:
+                    patch_newline_mask_lst.extend(patch_newline_mask)
+
+            image_inputs = {
+                "pixel_values": torch.cat(pixel_values_lst),
+                "num_patches": num_patches,
+            }
+            if patch_pixel_values_lst:
+                image_inputs["patch_pixel_values"] = torch.cat(patch_pixel_values_lst)
+            if patch_newline_mask_lst:
+                image_inputs["patch_newline_mask"] = torch.tensor(
+                    patch_newline_mask_lst, dtype=torch.bool
+                )
+
+            text = [
+                self.replace_placeholder(t, self.image_token, image_repl_str_lst)
+                for t in text
+            ]
+            text_inputs = self.tokenizer(text)
+
+        return BatchFeature(
+            {
+                **text_inputs,
+                **image_inputs,
+            },
+            tensor_type=return_tensors,
+        )
+
+
+################################################
+
+
+class Step3VLImageProcessor(SGLangBaseProcessor):
+    models = [Step3VLForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        # TODO, check _processor is tokenizer or processor.
+        processor = Step3VLProcessor(hf_config, _processor)
+        super().__init__(hf_config, server_args, processor, *args, **kwargs)
+        self.IM_TOKEN_ID = 128001
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<im_patch>",
+            image_token_id=128001,
+            image_token_regex=re.compile(r"(?:<im_patch>)"),
+        ).build(_processor)
+
+        mean = [0.48145466, 0.4578275, 0.40821073]
+        std = [0.26862954, 0.26130258, 0.27577711]
+
+    def preprocess(self, image):
+        return {"pixel_values": self.transform(image).unsqueeze(0)}
+
+    def __call__(self, image):
+        return self.preprocess(image)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text: str | List[int],
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
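For reference, here is a hand-walked sketch of the per-image token budget implied by `ImagePatcher` and `Step3VLProcessor.get_num_image_tokens` above. It does not import sglang; the 2000×1500 input is a made-up example, and the constants (504px crop window, 81 feature tokens per patch crop, 169 for the global image) are the values hard-coded in the new file.

```python
import math

width, height = 2000, 1500
window = 504  # determine_window_size: long side > 728, aspect ratio <= 4


def snap_to_window(side: int) -> int:
    # get_image_size_for_crop: round a side (already larger than the window)
    # to a multiple of the window, rounding up when the fraction exceeds 0.2.
    whole = side // window
    return window * (whole + 1 if side / window - whole > 0.2 else whole)


crop_w, crop_h = snap_to_window(width), snap_to_window(height)  # 2016, 1512

# slide_window with stride == window size yields a plain grid of crops.
x_num = math.ceil((crop_w - window) / window + 1)  # 4 columns
y_num = math.ceil((crop_h - window) / window + 1)  # 3 rows
num_patches = x_num * y_num  # 12
num_newlines = y_num - 1  # 2; the final row contributes no <patch_newline>

# Each patch crop costs 81 tokens plus <patch_start>/<patch_end>; the global
# image costs 169 tokens plus <im_start>/<im_end>; one token per newline.
num_tokens = num_patches * (81 + 2) + 169 + 2 + num_newlines
print(num_patches, num_newlines, num_tokens)  # 12 2 1169
```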
sglang/srt/reasoning_parser.py
CHANGED
@@ -105,7 +105,7 @@ class BaseReasoningFormatDetector:
         # If we're not in a reasoning block return as normal text
         if not self._in_reasoning:
             self._buffer = ""
-            return StreamingParseResult(normal_text=
+            return StreamingParseResult(normal_text=current_text)
 
         return StreamingParseResult()
 
@@ -233,6 +233,7 @@ class ReasoningParser:
         "qwen3-thinking": Qwen3ThinkingDetector,
         "glm45": Qwen3Detector,
         "kimi": KimiDetector,
+        "step3": DeepSeekR1Detector,
    }
 
    def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
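The new `"step3"` entry maps Step3 models onto the existing `DeepSeekR1Detector`, so generated text is split at `</think>` exactly as for DeepSeek-R1. A minimal usage sketch, assuming a 0.4.10 install and that `parse_non_stream` keeps returning a `(reasoning_text, normal_text)` pair:

```python
from sglang.srt.reasoning_parser import ReasoningParser

# "step3" now resolves to DeepSeekR1Detector: everything before </think> is
# treated as reasoning, the remainder as the visible answer.
parser = ReasoningParser(model_type="step3", stream_reasoning=False)

reasoning_text, normal_text = parser.parse_non_stream(
    "Check the units first.</think>The answer is 42."
)
print(reasoning_text)  # Check the units first.
print(normal_text)     # The answer is 42.
```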
sglang/srt/server_args.py
CHANGED
@@ -270,14 +270,6 @@ class ServerArgs:
     sm_group_num: int = 3
 
     def __post_init__(self):
-        # Expert parallelism
-        # We put it here first due to some internal ckpt conversation issues.
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.warning(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )
-
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
@@ -1117,9 +1109,10 @@ class ServerArgs:
                 "kimi_k2",
                 "qwen3_coder",
                 "glm45",
+                "step3",
             ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and '
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
         )
 
         # Data parallelism
@@ -1334,6 +1327,7 @@ class ServerArgs:
         parser.add_argument(
             "--expert-parallel-size",
             "--ep-size",
+            "--ep",
             type=int,
             default=ServerArgs.ep_size,
             help="The expert parallelism size.",
@@ -1476,7 +1470,7 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
-            choices=["file"
+            choices=["file", "mooncake", "hf3fs"],
             default=ServerArgs.hicache_storage_backend,
             help="The storage backend for hierarchical KV cache.",
         )
@@ -2071,6 +2065,9 @@ class PortArgs:
 
         dist_init_host, dist_init_port = dist_init_addr
         port_base = int(dist_init_port) + 1
+        detokenizer_port = port_base + 1
+        rpc_port = port_base + 2
+        metrics_ipc_name = port_base + 3
        if dp_rank is None:
            # TokenizerManager to DataParallelController
            scheduler_input_port = port_base + 4
@@ -2080,10 +2077,10 @@ class PortArgs:
        return PortArgs(
            tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
            scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
-           detokenizer_ipc_name=f"tcp://{dist_init_host}:{
+           detokenizer_ipc_name=f"tcp://{dist_init_host}:{detokenizer_port}",
            nccl_port=nccl_port,
-           rpc_ipc_name=f"tcp://{dist_init_host}:{
-           metrics_ipc_name=f"tcp://{dist_init_host}:{
+           rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
+           metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
        )
 
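With the truncated f-strings restored, the derived IPC endpoints now stack directly above `port_base`. A small sketch of the resulting layout for a hypothetical `dist_init_addr` of `10.0.0.1:5000` (with `dp_rank` unset):

```python
# Hypothetical dist_init_addr = "10.0.0.1:5000"
host, dist_init_port = "10.0.0.1", 5000
port_base = dist_init_port + 1  # 5001

endpoints = {
    "tokenizer_ipc_name": f"tcp://{host}:{port_base}",            # :5001
    "detokenizer_ipc_name": f"tcp://{host}:{port_base + 1}",      # :5002
    "rpc_ipc_name": f"tcp://{host}:{port_base + 2}",              # :5003
    "metrics_ipc_name": f"tcp://{host}:{port_base + 3}",          # :5004
    "scheduler_input_ipc_name": f"tcp://{host}:{port_base + 4}",  # :5005
}
for name, addr in endpoints.items():
    print(name, addr)
```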
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -73,6 +73,7 @@ class EAGLEWorker(TpModelWorker):
         gpu_id: int,
         tp_rank: int,
         dp_rank: Optional[int],
+        moe_ep_rank: int,
         nccl_port: int,
         target_worker: TpModelWorker,
     ):
@@ -127,6 +128,7 @@ class EAGLEWorker(TpModelWorker):
             tp_rank=tp_rank,
             pp_rank=0,  # FIXME
             dp_rank=dp_rank,
+            moe_ep_rank=moe_ep_rank,
             nccl_port=nccl_port,
             is_draft_worker=True,
             req_to_token_pool=self.req_to_token_pool,
sglang/utils.py
CHANGED
@@ -291,17 +291,6 @@ def find_printable_text(text: str):
     return text[: text.rfind(" ") + 1]
 
 
-def graceful_registry(sub_module_name: str):
-    def graceful_shutdown(signum, frame):
-        logger.info(
-            f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
-        )
-        if signum == signal.SIGTERM:
-            logger.info(f"{sub_module_name} receive sigterm")
-
-    signal.signal(signal.SIGTERM, graceful_shutdown)
-
-
 class LazyImport:
     """Lazy import to make `import sglang` run faster."""
 
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.
+__version__ = "0.4.10"