sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/configs/step3_vl.py +172 -0
- sglang/srt/conversation.py +23 -0
- sglang/srt/disaggregation/decode.py +2 -8
- sglang/srt/disaggregation/prefill.py +2 -6
- sglang/srt/distributed/parallel_state.py +86 -1
- sglang/srt/entrypoints/engine.py +14 -18
- sglang/srt/entrypoints/http_server.py +23 -3
- sglang/srt/entrypoints/openai/protocol.py +3 -1
- sglang/srt/entrypoints/openai/serving_base.py +5 -2
- sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- sglang/srt/eplb/expert_distribution.py +5 -0
- sglang/srt/eplb/expert_location.py +17 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -0
- sglang/srt/eplb/expert_location_updater.py +2 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/step3_detector.py +436 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/jinja_template_utils.py +4 -1
- sglang/srt/layers/moe/cutlass_moe.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +98 -603
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
- sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
- sglang/srt/layers/moe/topk.py +6 -2
- sglang/srt/layers/quantization/fp8.py +0 -18
- sglang/srt/layers/quantization/modelopt_quant.py +2 -0
- sglang/srt/layers/quantization/unquant.py +0 -8
- sglang/srt/layers/quantization/w4afp8.py +1 -0
- sglang/srt/managers/cache_controller.py +143 -45
- sglang/srt/managers/data_parallel_controller.py +6 -0
- sglang/srt/managers/io_struct.py +12 -2
- sglang/srt/managers/scheduler.py +116 -669
- sglang/srt/managers/scheduler_input_blocker.py +106 -0
- sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- sglang/srt/managers/template_manager.py +62 -19
- sglang/srt/managers/tokenizer_manager.py +166 -83
- sglang/srt/managers/tp_worker.py +9 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- sglang/srt/mem_cache/hicache_storage.py +45 -11
- sglang/srt/mem_cache/hiradix_cache.py +15 -4
- sglang/srt/mem_cache/memory_pool_host.py +73 -1
- sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- sglang/srt/model_executor/model_runner.py +20 -13
- sglang/srt/models/arcee.py +532 -0
- sglang/srt/models/deepseek_v2.py +15 -56
- sglang/srt/models/glm4_moe.py +3 -1
- sglang/srt/models/granitemoe.py +3 -0
- sglang/srt/models/grok.py +3 -0
- sglang/srt/models/hunyuan.py +1 -0
- sglang/srt/models/llama4.py +3 -0
- sglang/srt/models/mixtral.py +3 -0
- sglang/srt/models/olmoe.py +3 -0
- sglang/srt/models/phimoe.py +1 -0
- sglang/srt/models/qwen3_moe.py +12 -69
- sglang/srt/models/step3_vl.py +994 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang/srt/multimodal/processors/step3_vl.py +515 -0
- sglang/srt/poll_based_barrier.py +31 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +18 -13
- sglang/srt/speculative/eagle_worker.py +2 -0
- sglang/srt/two_batch_overlap.py +8 -3
- sglang/test/test_utils.py +53 -0
- sglang/utils.py +0 -11
- sglang/version.py +1 -1
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/base_processor.py
CHANGED
@@ -176,6 +176,8 @@ class BaseMultimodalProcessor(ABC):
             "image_grid_hws": Modality.IMAGE,
             "aspect_ratio_ids": Modality.IMAGE,
             "aspect_ratio_mask": Modality.IMAGE,
+            "num_patches": Modality.IMAGE,
+            "patch_pixel_values": Modality.IMAGE,
             # Audio-related attributes
             "audio_features": Modality.AUDIO,
             "audio_feature_lens": Modality.AUDIO,
@@ -192,7 +194,12 @@ class BaseMultimodalProcessor(ABC):
 
         # name of the feature filed
         # TODO: pass from processors
-        self.FEATURE_NAMES = [
+        self.FEATURE_NAMES = [
+            "pixel_values",
+            "pixel_values_videos",
+            "audio_features",
+            "input_features",
+        ]
 
     def process_mm_data(
         self, input_text, images=None, videos=None, audios=None, **kwargs
@@ -221,6 +228,13 @@ class BaseMultimodalProcessor(ABC):
             return_tensors="pt",
             **kwargs,
         )
+        # move feature tensors to cpu
+        for feature_name in self.FEATURE_NAMES:
+            if feature_name in result and isinstance(
+                result[feature_name], torch.Tensor
+            ):
+                result[feature_name] = result[feature_name].to("cpu")
+
         return result
 
     @abstractmethod
@@ -623,19 +637,4 @@ class BaseMultimodalProcessor(ABC):
             mm_token_id=mm_token_id,
         )
 
-        # post-process
-        for item in all_collected_items:
-            # replace the feature tensor with a proxy
-            if isinstance(item.feature, torch.Tensor) and item.feature.is_cuda:
-                item.feature = TransportProxyTensor(
-                    transport_mode=self.transport_mode, data=item.feature
-                )
-            elif (
-                isinstance(item.precomputed_embeddings, torch.Tensor)
-                and item.precomputed_embeddings.is_cuda
-            ):
-                item.precomputed_embeddings = TransportProxyTensor(
-                    transport_mode=self.transport_mode, data=item.precomputed_embeddings
-                )
-
         return all_collected_items, input_ids, ret
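Read together, the base_processor.py hunks above shift device handling earlier: the new FEATURE_NAMES list enumerates the feature keys, process_mm_data copies those tensors back to CPU right after the Hugging Face processor runs, and the later step that wrapped CUDA tensors in TransportProxyTensor is removed. A minimal sketch of the new behavior follows (illustrative only; the standalone FEATURE_NAMES list and the fake result dict are assumptions of this sketch, not code from the diff):

import torch

FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features", "input_features"]

# hypothetical processor output; it may live on GPU if the processor ran there
result = {"pixel_values": torch.randn(1, 3, 728, 728)}
if torch.cuda.is_available():
    result["pixel_values"] = result["pixel_values"].cuda()

# same loop as the added hunk: force feature tensors back to CPU before returning
for feature_name in FEATURE_NAMES:
    if feature_name in result and isinstance(result[feature_name], torch.Tensor):
        result[feature_name] = result[feature_name].to("cpu")

assert result["pixel_values"].device.type == "cpu"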
sglang/srt/multimodal/processors/step3_vl.py
ADDED
@@ -0,0 +1,515 @@
+import math
+import re
+from itertools import product
+from typing import List, Literal, Optional, TypedDict, Union
+
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms import InterpolationMode
+from transformers import BatchFeature, TensorType
+
+from sglang.srt.models.step3_vl import Step3VLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+
+ImageWithPatches = tuple[Image.Image, list[Image.Image], list[int] | None]
+
+
+class GPUToTensor(torch.nn.Module):
+
+    def forward(self, raw_image: Union[np.ndarray, Image.Image]) -> torch.Tensor:
+        if isinstance(raw_image, Image.Image):
+            return transforms.ToTensor()(raw_image)
+        if raw_image.ndim == 2:
+            raw_image = raw_image[:, :, None].repeat(3, -1)
+        if torch.cuda.is_available():
+            device = torch.device("cuda")
+        else:
+            device = torch.device("cpu")
+        image_tensor = torch.from_numpy(raw_image).to(device)
+        image_tensor = torch.permute(image_tensor, (2, 0, 1)).contiguous()
+        if image_tensor.dtype == torch.uint8:
+            image_tensor = image_tensor.to(torch.float32).div(255)
+        return image_tensor
+
+
+class Step3VisionProcessor:
+    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
+        mean = [0.48145466, 0.4578275, 0.40821073]
+        std = [0.26862954, 0.26130258, 0.27577711]
+        patch_size = patch_size if patch_size is not None else size
+
+        self.transform = transforms.Compose(
+            [
+                GPUToTensor(),
+                transforms.Normalize(mean, std),
+                transforms.Resize(
+                    (size, size),
+                    interpolation=(
+                        InterpolationMode.BICUBIC
+                        if interpolation_mode == "bicubic"
+                        else InterpolationMode.BILINEAR
+                    ),
+                    antialias=True,
+                ),
+            ]
+        )
+
+        self.patch_transform = (
+            transforms.Compose(
+                [
+                    GPUToTensor(),
+                    transforms.Normalize(mean, std),
+                    transforms.Resize(
+                        (patch_size, patch_size),
+                        interpolation=(
+                            InterpolationMode.BICUBIC
+                            if interpolation_mode == "bicubic"
+                            else InterpolationMode.BILINEAR
+                        ),
+                        antialias=True,
+                    ),
+                ]
+            )
+            if patch_size is not None
+            else None
+        )
+
+    def __call__(self, image, is_patch=False):
+        if is_patch:
+            return {"pixel_values": self.patch_transform(image).unsqueeze(0)}
+        else:
+            return {"pixel_values": self.transform(image).unsqueeze(0)}
+
+
+class ImagePatcher:
+
+    def determine_window_size(self, long: int, short: int) -> int:
+        if long <= 728:
+            return short if long / short > 1.5 else 0
+        return min(short, 504) if long / short > 4 else 504
+
+    def slide_window(
+        self,
+        width: int,
+        height: int,
+        sizes: list[tuple[int, int]],
+        steps: list[tuple[int, int]],
+        img_rate_thr: float = 0.6,
+    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
+        assert 1 >= img_rate_thr >= 0, "The `in_rate_thr` should lie in 0~1"
+        windows = []
+        # Sliding windows.
+        for size, step in zip(sizes, steps):
+            size_w, size_h = size
+            step_w, step_h = step
+
+            x_num = 1 if width <= size_w else math.ceil((width - size_w) / step_w + 1)
+            x_start = [step_w * i for i in range(x_num)]
+            if len(x_start) > 1 and x_start[-1] + size_w > width:
+                x_start[-1] = width - size_w
+
+            y_num = 1 if height <= size_h else math.ceil((height - size_h) / step_h + 1)
+            y_start = [step_h * i for i in range(y_num)]
+            if len(y_start) > 1 and y_start[-1] + size_h > height:
+                y_start[-1] = height - size_h
+
+            start = np.array(list(product(y_start, x_start)), dtype=int)
+            start[:, [0, 1]] = start[:, [1, 0]]
+            windows.append(np.concatenate([start, start + size], axis=1))
+        windows = np.concatenate(windows, axis=0)
+
+        return [
+            (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
+            for box in windows
+        ], (x_num, y_num)
+
+    def square_pad(self, img: Image.Image) -> Image.Image:
+        w, h = img.size
+        if w == h:
+            return img
+        size = max(w, h)
+        padded = Image.new(img.mode, (size, size), 0)
+        padded.paste(img, (0, 0))
+        return padded
+
+    def get_image_size_for_padding(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
+        ratio = img_width / img_height
+        if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
+            new_size = max(img_height, img_width)
+            return new_size, new_size
+        return img_width, img_height
+
+    def get_image_size_for_preprocess(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
+
+        if max(img_height, img_width) > 3024:
+            scale_factor = 3024 / max(img_height, img_width)
+            img_width = int(img_width * scale_factor)
+            img_height = int(img_height * scale_factor)
+            return img_width, img_height
+        else:
+            return img_width, img_height
+
+    def get_image_size_for_crop(
+        self, img_width: int, img_height: int, window_size: int
+    ):
+        w_ratio = img_width / window_size
+        h_ratio = img_height / window_size
+
+        if w_ratio < 1:
+            width_new = img_width
+        else:
+            decimal_w = w_ratio - img_width // window_size
+            w_ratio = int(w_ratio) + 1 if decimal_w > 0.2 else int(w_ratio)
+            width_new = window_size * w_ratio
+        if h_ratio < 1:
+            height_new = img_height
+        else:
+            decimal_h = h_ratio - img_height // window_size
+            h_ratio = int(h_ratio) + 1 if decimal_h > 0.2 else int(h_ratio)
+            height_new = window_size * h_ratio
+        return int(width_new), int(height_new)
+
+    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
+        target = img.crop((j, i, j + tw, i + th))
+        return target
+
+    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
+        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
+        img_width, img_height = self.get_image_size_for_preprocess(
+            img_width, img_height
+        )
+        window_size = self.determine_window_size(
+            max(img_height, img_width), min(img_height, img_width)
+        )
+        if window_size == 0:
+            return 0, 0
+        else:
+            img_width, img_height = self.get_image_size_for_crop(
+                img_width, img_height, window_size
+            )
+            center_list, (x_num, y_num) = self.slide_window(
+                img_width,
+                img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
+            full_rows = (len(center_list) - 1) // x_num + 1
+            if len(center_list) > 0 and len(center_list) % x_num == 0:
+                full_rows -= 1
+            return len(center_list), full_rows
+
+    def __call__(
+        self, img: Image.Image
+    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
+        img_width, img_height = img.size
+        new_img_width, new_img_height = self.get_image_size_for_padding(
+            img_width, img_height
+        )
+        if new_img_width != img_width or new_img_height != img_height:
+            img = self.square_pad(img)
+            img_width, img_height = img.size
+
+        new_img_width, new_img_height = self.get_image_size_for_preprocess(
+            img_width, img_height
+        )
+        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
+        window_size = self.determine_window_size(
+            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
+        )
+        if window_size == 0:
+            return img, [], None
+        else:
+            new_img_width, new_img_height = self.get_image_size_for_crop(
+                new_img_width, new_img_height, window_size
+            )
+            if (new_img_width, new_img_height) != (img_width, img_height):
+                img_for_crop = img.resize(
+                    (new_img_width, new_img_height), Image.Resampling.BILINEAR
+                )
+            else:
+                img_for_crop = img
+
+            patches = []
+            newlines = []
+            center_list, (x_num, y_num) = self.slide_window(
+                new_img_width,
+                new_img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
+            for patch_id, center_lf_point in enumerate(center_list):
+                x, y, patch_w, patch_h = center_lf_point
+                big_patch = self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
+                patches.append(big_patch)
+                if (patch_id + 1) % x_num == 0:
+                    newlines.append(patch_id)
+
+            if newlines and newlines[-1] == len(patches) - 1:
+                newlines.pop()
+
+            return (
+                img,
+                patches,
+                (
+                    [i in newlines for i in range(len(patches))]
+                    if len(patches) > 0
+                    else None
+                ),
+            )
+
+
+class Step3VLProcessor:
+    def __init__(
+        self,
+        config,
+        tokenizer,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.tokenizer = tokenizer
+
+        self.image_size = 728
+        self.patch_size = 504
+        self.image_preprocessor = Step3VisionProcessor(
+            self.image_size, "bilinear", self.patch_size
+        )
+
+        self.num_image_feature_size = 169
+        self.num_patch_feature_size = 81
+        self.image_token = "<im_patch>"
+        self.image_feature_placeholder = self.image_token * self.num_image_feature_size
+        self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size
+
+        self.patcher = ImagePatcher()
+
+    @property
+    def image_token_id(self) -> int:
+        return self.tokenizer.get_vocab()[self.image_token]
+
+    def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
+        num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)
+
+        return (
+            num_patches * (self.num_patch_feature_size + 2)
+            + self.num_image_feature_size
+            + 2
+            + num_newlines
+        )
+
+    def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
+        result = []
+        for img in images:
+            result.append(self.patcher(img))
+        return result
+
+    def _convert_images_to_pixel_values(
+        self,
+        images: list[Image.Image],
+        is_patch: bool = False,
+    ) -> list[torch.Tensor]:
+        return [
+            self.image_preprocessor(img, is_patch=is_patch)["pixel_values"]
+            for img in images
+        ]
+
+    def _get_patch_repl(
+        self,
+        num_patches: int,
+        patch_newline_mask: list[bool] | None,
+    ) -> tuple[str, list[int]]:
+        text = ""
+        token_ids = []
+        for i in range(num_patches):
+            assert len(patch_newline_mask) == num_patches
+            text += f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
+            token_ids.extend(
+                [self.tokenizer.convert_tokens_to_ids("<patch_start>")]
+                + [self.image_token_id] * self.num_patch_feature_size
+                + [self.tokenizer.convert_tokens_to_ids("<patch_end>")]
+            )
+            if patch_newline_mask and patch_newline_mask[i]:
+                text += "<patch_newline>"
+                token_ids.append(
+                    self.tokenizer.convert_tokens_to_ids("<patch_newline>")
+                )
+        return text, token_ids
+
+    def _get_image_repl(
+        self,
+        num_images: int,
+    ) -> tuple[str, list[int]]:
+        text = f"<im_start>{self.image_feature_placeholder}<im_end>"
+        token_ids = (
+            [self.tokenizer.convert_tokens_to_ids("<im_start>")]
+            + [self.image_token_id] * self.num_image_feature_size
+            + [self.tokenizer.convert_tokens_to_ids("<im_end>")]
+        )
+        return text * num_images, token_ids * num_images
+
+    def _get_image_repl_features(
+        self,
+        num_images: int,
+        num_patches: int,
+        patch_new_line_idx: Optional[list[bool]],
+    ) -> tuple[str, list[int]]:
+        if num_patches > 0:
+            patch_repl, patch_repl_ids = self._get_patch_repl(
+                num_patches, patch_new_line_idx
+            )
+        else:
+            patch_repl = ""
+            patch_repl_ids = []
+        image_repl, image_repl_ids = self._get_image_repl(num_images)
+        return patch_repl + image_repl, patch_repl_ids + image_repl_ids
+
+    def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
+        parts = text.split(placeholder)
+
+        if len(parts) - 1 != len(repls):
+            raise ValueError(
+                "The number of placeholders does not match the number of replacements."  # noqa: E501
+            )
+
+        result = [parts[0]]
+        for i, repl in enumerate(repls):
+            result.append(repl)
+            result.append(parts[i + 1])
+
+        return "".join(result)
+
+    def __call__(
+        self,
+        text: Optional[Union[str, list[str]]] = None,
+        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        *args,
+        **kwargs,
+    ) -> BatchFeature:
+        if text is None:
+            text = []
+        if not isinstance(text, list):
+            text = [text]
+        if images is None:
+            images = []
+        if not isinstance(images, list):
+            images = [images]
+
+        if len(images) == 0:
+            image_inputs = {}
+            text_inputs = self.tokenizer(text)
+        else:
+            splitted_images_data = self._split_images(images)
+            pixel_values_lst = []
+            patch_pixel_values_lst = []
+            patch_newline_mask_lst = []
+            image_repl_str_lst = []
+            image_repl_ids_lst = []
+            num_patches = []
+            for (
+                raw_img,
+                img_patches,
+                patch_newline_mask,
+            ) in splitted_images_data:  # noqa: E501
+                pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
+
+                if len(img_patches) > 0:
+                    patch_pixel_values_lst.extend(
+                        self._convert_images_to_pixel_values(img_patches, is_patch=True)
+                    )
+                num_patches.append(len(img_patches))
+
+                image_repl_str, image_repl_ids = self._get_image_repl_features(
+                    1, len(img_patches), patch_newline_mask
+                )
+                image_repl_str_lst.append(image_repl_str)
+                image_repl_ids_lst.extend(image_repl_ids)
+
+                if patch_newline_mask is not None:
+                    patch_newline_mask_lst.extend(patch_newline_mask)
+
+            image_inputs = {
+                "pixel_values": torch.cat(pixel_values_lst),
+                "num_patches": num_patches,
+            }
+            if patch_pixel_values_lst:
+                image_inputs["patch_pixel_values"] = torch.cat(patch_pixel_values_lst)
+            if patch_newline_mask_lst:
+                image_inputs["patch_newline_mask"] = torch.tensor(
+                    patch_newline_mask_lst, dtype=torch.bool
+                )
+
+            text = [
+                self.replace_placeholder(t, self.image_token, image_repl_str_lst)
+                for t in text
+            ]
+            text_inputs = self.tokenizer(text)
+
+        return BatchFeature(
+            {
+                **text_inputs,
+                **image_inputs,
+            },
+            tensor_type=return_tensors,
+        )
+
+
+################################################
+
+
+class Step3VLImageProcessor(SGLangBaseProcessor):
+    models = [Step3VLForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        # TODO, check _processor is tokenizer or processor.
+        processor = Step3VLProcessor(hf_config, _processor)
+        super().__init__(hf_config, server_args, processor, *args, **kwargs)
+        self.IM_TOKEN_ID = 128001
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<im_patch>",
+            image_token_id=128001,
+            image_token_regex=re.compile(r"(?:<im_patch>)"),
+        ).build(_processor)
+
+    mean = [0.48145466, 0.4578275, 0.40821073]
+    std = [0.26862954, 0.26130258, 0.27577711]
+
+    def preprocess(self, image):
+        return {"pixel_values": self.transform(image).unsqueeze(0)}
+
+    def __call__(self, image):
+        return self.preprocess(image)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text: str | List[int],
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
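A worked example of the token accounting in Step3VLProcessor above (a sketch by the editor, not part of the diff; the 1008x1008 size is arbitrary): for such an image, determine_window_size returns 504, get_image_size_for_crop keeps 1008x1008, and slide_window produces a 2x2 grid of crops, so get_num_patches returns (4, 1) and get_num_image_tokens evaluates to 4 * (81 + 2) + 169 + 2 + 1 = 504 image tokens.

# Illustrative only; reuses the classes defined in the new file above.
patcher = ImagePatcher()
num_patches, num_newlines = patcher.get_num_patches(1008, 1008)
assert (num_patches, num_newlines) == (4, 1)
num_image_tokens = num_patches * (81 + 2) + 169 + 2 + num_newlines  # 504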
sglang/srt/poll_based_barrier.py
ADDED
@@ -0,0 +1,31 @@
+import torch
+
+from sglang.srt.distributed import get_world_group
+
+
+class PollBasedBarrier:
+    def __init__(self, noop: bool = False):
+        self._noop = noop
+        self._local_arrived = False
+
+    def local_arrive(self):
+        assert not self._local_arrived
+        self._local_arrived = True
+
+    def poll_global_arrived(self) -> bool:
+        global_arrived = self._compute_global_arrived()
+        output = self._local_arrived and global_arrived
+        if output:
+            self._local_arrived = False
+        return output
+
+    def _compute_global_arrived(self) -> bool:
+        local_arrived = self._noop or self._local_arrived
+        global_arrived = torch.tensor(local_arrived)
+        # Can optimize if bottleneck
+        torch.distributed.all_reduce(
+            global_arrived,
+            torch.distributed.ReduceOp.MIN,
+            group=get_world_group().cpu_group,
+        )
+        return global_arrived.item()
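A hedged usage sketch for the new PollBasedBarrier (the calling pattern is an assumption, not shown in the diff): each rank marks its own arrival and then polls instead of blocking; _compute_global_arrived all-reduces with MIN over the world group's CPU group, so poll_global_arrived only returns True once every rank has arrived, after which the local flag resets for the next round.

# Illustrative only; assumes torch.distributed is already initialized by sglang.
barrier = PollBasedBarrier()
barrier.local_arrive()  # this rank is ready
while not barrier.poll_global_arrived():
    pass  # hypothetical: the caller keeps doing other work here instead of blocking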
sglang/srt/reasoning_parser.py
CHANGED
@@ -105,7 +105,7 @@ class BaseReasoningFormatDetector:
         # If we're not in a reasoning block return as normal text
         if not self._in_reasoning:
            self._buffer = ""
-            return StreamingParseResult(normal_text=
+            return StreamingParseResult(normal_text=current_text)
 
         return StreamingParseResult()
 
@@ -233,6 +233,7 @@ class ReasoningParser:
         "qwen3-thinking": Qwen3ThinkingDetector,
         "glm45": Qwen3Detector,
         "kimi": KimiDetector,
+        "step3": DeepSeekR1Detector,
     }
 
     def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
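The added mapping reuses the existing DeepSeekR1Detector for Step3 models, so their reasoning output is split by the same detector used for DeepSeek-R1. A minimal usage sketch, assuming only the constructor shown in the context lines above:

# Illustrative only.
from sglang.srt.reasoning_parser import ReasoningParser

parser = ReasoningParser(model_type="step3", stream_reasoning=True)  # resolves to DeepSeekR1Detector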
|