sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +8 -0
  3. sglang/srt/configs/model_config.py +6 -0
  4. sglang/srt/configs/step3_vl.py +172 -0
  5. sglang/srt/conversation.py +23 -0
  6. sglang/srt/disaggregation/decode.py +2 -8
  7. sglang/srt/disaggregation/prefill.py +2 -6
  8. sglang/srt/distributed/parallel_state.py +86 -1
  9. sglang/srt/entrypoints/engine.py +14 -18
  10. sglang/srt/entrypoints/http_server.py +23 -3
  11. sglang/srt/entrypoints/openai/protocol.py +3 -1
  12. sglang/srt/entrypoints/openai/serving_base.py +5 -2
  13. sglang/srt/entrypoints/openai/serving_chat.py +2 -21
  14. sglang/srt/eplb/expert_distribution.py +5 -0
  15. sglang/srt/eplb/expert_location.py +17 -6
  16. sglang/srt/eplb/expert_location_dispatch.py +1 -0
  17. sglang/srt/eplb/expert_location_updater.py +2 -0
  18. sglang/srt/function_call/function_call_parser.py +2 -0
  19. sglang/srt/function_call/step3_detector.py +436 -0
  20. sglang/srt/hf_transformers_utils.py +2 -0
  21. sglang/srt/jinja_template_utils.py +4 -1
  22. sglang/srt/layers/moe/cutlass_moe.py +2 -1
  23. sglang/srt/layers/moe/ep_moe/layer.py +98 -603
  24. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
  28. sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
  29. sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
  30. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
  31. sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
  32. sglang/srt/layers/moe/topk.py +6 -2
  33. sglang/srt/layers/quantization/fp8.py +0 -18
  34. sglang/srt/layers/quantization/modelopt_quant.py +2 -0
  35. sglang/srt/layers/quantization/unquant.py +0 -8
  36. sglang/srt/layers/quantization/w4afp8.py +1 -0
  37. sglang/srt/managers/cache_controller.py +143 -45
  38. sglang/srt/managers/data_parallel_controller.py +6 -0
  39. sglang/srt/managers/io_struct.py +12 -2
  40. sglang/srt/managers/scheduler.py +116 -669
  41. sglang/srt/managers/scheduler_input_blocker.py +106 -0
  42. sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
  43. sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
  44. sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
  45. sglang/srt/managers/template_manager.py +62 -19
  46. sglang/srt/managers/tokenizer_manager.py +166 -83
  47. sglang/srt/managers/tp_worker.py +9 -0
  48. sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
  49. sglang/srt/mem_cache/hicache_storage.py +45 -11
  50. sglang/srt/mem_cache/hiradix_cache.py +15 -4
  51. sglang/srt/mem_cache/memory_pool_host.py +73 -1
  52. sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
  53. sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
  54. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
  55. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
  56. sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
  57. sglang/srt/model_executor/model_runner.py +20 -13
  58. sglang/srt/models/arcee.py +532 -0
  59. sglang/srt/models/deepseek_v2.py +15 -56
  60. sglang/srt/models/glm4_moe.py +3 -1
  61. sglang/srt/models/granitemoe.py +3 -0
  62. sglang/srt/models/grok.py +3 -0
  63. sglang/srt/models/hunyuan.py +1 -0
  64. sglang/srt/models/llama4.py +3 -0
  65. sglang/srt/models/mixtral.py +3 -0
  66. sglang/srt/models/olmoe.py +3 -0
  67. sglang/srt/models/phimoe.py +1 -0
  68. sglang/srt/models/qwen3_moe.py +12 -69
  69. sglang/srt/models/step3_vl.py +994 -0
  70. sglang/srt/multimodal/processors/base_processor.py +15 -16
  71. sglang/srt/multimodal/processors/step3_vl.py +515 -0
  72. sglang/srt/poll_based_barrier.py +31 -0
  73. sglang/srt/reasoning_parser.py +2 -1
  74. sglang/srt/server_args.py +18 -13
  75. sglang/srt/speculative/eagle_worker.py +2 -0
  76. sglang/srt/two_batch_overlap.py +8 -3
  77. sglang/test/test_utils.py +53 -0
  78. sglang/utils.py +0 -11
  79. sglang/version.py +1 -1
  80. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
  81. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
  82. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
  83. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0

sglang/srt/multimodal/processors/base_processor.py

@@ -176,6 +176,8 @@ class BaseMultimodalProcessor(ABC):
         "image_grid_hws": Modality.IMAGE,
         "aspect_ratio_ids": Modality.IMAGE,
         "aspect_ratio_mask": Modality.IMAGE,
+        "num_patches": Modality.IMAGE,
+        "patch_pixel_values": Modality.IMAGE,
         # Audio-related attributes
         "audio_features": Modality.AUDIO,
         "audio_feature_lens": Modality.AUDIO,
@@ -192,7 +194,12 @@ class BaseMultimodalProcessor(ABC):

         # name of the feature filed
         # TODO: pass from processors
-        self.FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features"]
+        self.FEATURE_NAMES = [
+            "pixel_values",
+            "pixel_values_videos",
+            "audio_features",
+            "input_features",
+        ]

     def process_mm_data(
         self, input_text, images=None, videos=None, audios=None, **kwargs
@@ -221,6 +228,13 @@ class BaseMultimodalProcessor(ABC):
             return_tensors="pt",
             **kwargs,
         )
+        # move feature tensors to cpu
+        for feature_name in self.FEATURE_NAMES:
+            if feature_name in result and isinstance(
+                result[feature_name], torch.Tensor
+            ):
+                result[feature_name] = result[feature_name].to("cpu")
+
         return result

     @abstractmethod
@@ -623,19 +637,4 @@ class BaseMultimodalProcessor(ABC):
             mm_token_id=mm_token_id,
         )

-        # post-process
-        for item in all_collected_items:
-            # replace the feature tensor with a proxy
-            if isinstance(item.feature, torch.Tensor) and item.feature.is_cuda:
-                item.feature = TransportProxyTensor(
-                    transport_mode=self.transport_mode, data=item.feature
-                )
-            elif (
-                isinstance(item.precomputed_embeddings, torch.Tensor)
-                and item.precomputed_embeddings.is_cuda
-            ):
-                item.precomputed_embeddings = TransportProxyTensor(
-                    transport_mode=self.transport_mode, data=item.precomputed_embeddings
-                )
-
         return all_collected_items, input_ids, ret
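
Taken together, the base_processor.py hunks change where multimodal features live: processor outputs listed in FEATURE_NAMES are moved to CPU right after the Hugging Face processor runs, and the old post-processing that wrapped CUDA tensors in TransportProxyTensor is dropped. A minimal sketch of the new behavior (the standalone helper and the example tensor are illustrative, not part of the package):

import torch

FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features", "input_features"]

def move_features_to_cpu(result: dict) -> dict:
    # Mirrors the loop added in process_mm_data: any feature tensor the HF
    # processor returned is moved off the GPU before downstream use.
    for name in FEATURE_NAMES:
        if name in result and isinstance(result[name], torch.Tensor):
            result[name] = result[name].to("cpu")
    return result

# Example: a CUDA pixel_values tensor ends up on CPU.
if torch.cuda.is_available():
    batch = {"pixel_values": torch.zeros(1, 3, 224, 224, device="cuda")}
    assert move_features_to_cpu(batch)["pixel_values"].device.type == "cpu"
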

sglang/srt/multimodal/processors/step3_vl.py

@@ -0,0 +1,515 @@
+import math
+import re
+from itertools import product
+from typing import List, Literal, Optional, TypedDict, Union
+
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms import InterpolationMode
+from transformers import BatchFeature, TensorType
+
+from sglang.srt.models.step3_vl import Step3VLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+
+ImageWithPatches = tuple[Image.Image, list[Image.Image], list[int] | None]
+
+
+class GPUToTensor(torch.nn.Module):
+
+    def forward(self, raw_image: Union[np.ndarray, Image.Image]) -> torch.Tensor:
+        if isinstance(raw_image, Image.Image):
+            return transforms.ToTensor()(raw_image)
+        if raw_image.ndim == 2:
+            raw_image = raw_image[:, :, None].repeat(3, -1)
+        if torch.cuda.is_available():
+            device = torch.device("cuda")
+        else:
+            device = torch.device("cpu")
+        image_tensor = torch.from_numpy(raw_image).to(device)
+        image_tensor = torch.permute(image_tensor, (2, 0, 1)).contiguous()
+        if image_tensor.dtype == torch.uint8:
+            image_tensor = image_tensor.to(torch.float32).div(255)
+        return image_tensor
+
+
+class Step3VisionProcessor:
+    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
+        mean = [0.48145466, 0.4578275, 0.40821073]
+        std = [0.26862954, 0.26130258, 0.27577711]
+        patch_size = patch_size if patch_size is not None else size
+
+        self.transform = transforms.Compose(
+            [
+                GPUToTensor(),
+                transforms.Normalize(mean, std),
+                transforms.Resize(
+                    (size, size),
+                    interpolation=(
+                        InterpolationMode.BICUBIC
+                        if interpolation_mode == "bicubic"
+                        else InterpolationMode.BILINEAR
+                    ),
+                    antialias=True,
+                ),
+            ]
+        )
+
+        self.patch_transform = (
+            transforms.Compose(
+                [
+                    GPUToTensor(),
+                    transforms.Normalize(mean, std),
+                    transforms.Resize(
+                        (patch_size, patch_size),
+                        interpolation=(
+                            InterpolationMode.BICUBIC
+                            if interpolation_mode == "bicubic"
+                            else InterpolationMode.BILINEAR
+                        ),
+                        antialias=True,
+                    ),
+                ]
+            )
+            if patch_size is not None
+            else None
+        )
+
+    def __call__(self, image, is_patch=False):
+        if is_patch:
+            return {"pixel_values": self.patch_transform(image).unsqueeze(0)}
+        else:
+            return {"pixel_values": self.transform(image).unsqueeze(0)}
+
+
+class ImagePatcher:
+
+    def determine_window_size(self, long: int, short: int) -> int:
+        if long <= 728:
+            return short if long / short > 1.5 else 0
+        return min(short, 504) if long / short > 4 else 504
+
+    def slide_window(
+        self,
+        width: int,
+        height: int,
+        sizes: list[tuple[int, int]],
+        steps: list[tuple[int, int]],
+        img_rate_thr: float = 0.6,
+    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
+        assert 1 >= img_rate_thr >= 0, "The `in_rate_thr` should lie in 0~1"
+        windows = []
+        # Sliding windows.
+        for size, step in zip(sizes, steps):
+            size_w, size_h = size
+            step_w, step_h = step
+
+            x_num = 1 if width <= size_w else math.ceil((width - size_w) / step_w + 1)
+            x_start = [step_w * i for i in range(x_num)]
+            if len(x_start) > 1 and x_start[-1] + size_w > width:
+                x_start[-1] = width - size_w
+
+            y_num = 1 if height <= size_h else math.ceil((height - size_h) / step_h + 1)
+            y_start = [step_h * i for i in range(y_num)]
+            if len(y_start) > 1 and y_start[-1] + size_h > height:
+                y_start[-1] = height - size_h
+
+            start = np.array(list(product(y_start, x_start)), dtype=int)
+            start[:, [0, 1]] = start[:, [1, 0]]
+            windows.append(np.concatenate([start, start + size], axis=1))
+        windows = np.concatenate(windows, axis=0)
+
+        return [
+            (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
+            for box in windows
+        ], (x_num, y_num)
+
+    def square_pad(self, img: Image.Image) -> Image.Image:
+        w, h = img.size
+        if w == h:
+            return img
+        size = max(w, h)
+        padded = Image.new(img.mode, (size, size), 0)
+        padded.paste(img, (0, 0))
+        return padded
+
+    def get_image_size_for_padding(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
+        ratio = img_width / img_height
+        if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
+            new_size = max(img_height, img_width)
+            return new_size, new_size
+        return img_width, img_height
+
+    def get_image_size_for_preprocess(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
+
+        if max(img_height, img_width) > 3024:
+            scale_factor = 3024 / max(img_height, img_width)
+            img_width = int(img_width * scale_factor)
+            img_height = int(img_height * scale_factor)
+            return img_width, img_height
+        else:
+            return img_width, img_height
+
+    def get_image_size_for_crop(
+        self, img_width: int, img_height: int, window_size: int
+    ):
+        w_ratio = img_width / window_size
+        h_ratio = img_height / window_size
+
+        if w_ratio < 1:
+            width_new = img_width
+        else:
+            decimal_w = w_ratio - img_width // window_size
+            w_ratio = int(w_ratio) + 1 if decimal_w > 0.2 else int(w_ratio)
+            width_new = window_size * w_ratio
+        if h_ratio < 1:
+            height_new = img_height
+        else:
+            decimal_h = h_ratio - img_height // window_size
+            h_ratio = int(h_ratio) + 1 if decimal_h > 0.2 else int(h_ratio)
+            height_new = window_size * h_ratio
+        return int(width_new), int(height_new)
+
+    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
+        target = img.crop((j, i, j + tw, i + th))
+        return target
+
+    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
+        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
+        img_width, img_height = self.get_image_size_for_preprocess(
+            img_width, img_height
+        )
+        window_size = self.determine_window_size(
+            max(img_height, img_width), min(img_height, img_width)
+        )
+        if window_size == 0:
+            return 0, 0
+        else:
+            img_width, img_height = self.get_image_size_for_crop(
+                img_width, img_height, window_size
+            )
+            center_list, (x_num, y_num) = self.slide_window(
+                img_width,
+                img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
+            full_rows = (len(center_list) - 1) // x_num + 1
+            if len(center_list) > 0 and len(center_list) % x_num == 0:
+                full_rows -= 1
+            return len(center_list), full_rows
+
+    def __call__(
+        self, img: Image.Image
+    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
+        img_width, img_height = img.size
+        new_img_width, new_img_height = self.get_image_size_for_padding(
+            img_width, img_height
+        )
+        if new_img_width != img_width or new_img_height != img_height:
+            img = self.square_pad(img)
+            img_width, img_height = img.size
+
+        new_img_width, new_img_height = self.get_image_size_for_preprocess(
+            img_width, img_height
+        )
+        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
+        window_size = self.determine_window_size(
+            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
+        )
+        if window_size == 0:
+            return img, [], None
+        else:
+            new_img_width, new_img_height = self.get_image_size_for_crop(
+                new_img_width, new_img_height, window_size
+            )
+            if (new_img_width, new_img_height) != (img_width, img_height):
+                img_for_crop = img.resize(
+                    (new_img_width, new_img_height), Image.Resampling.BILINEAR
+                )
+            else:
+                img_for_crop = img
+
+            patches = []
+            newlines = []
+            center_list, (x_num, y_num) = self.slide_window(
+                new_img_width,
+                new_img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
+            for patch_id, center_lf_point in enumerate(center_list):
+                x, y, patch_w, patch_h = center_lf_point
+                big_patch = self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
+                patches.append(big_patch)
+                if (patch_id + 1) % x_num == 0:
+                    newlines.append(patch_id)
+
+            if newlines and newlines[-1] == len(patches) - 1:
+                newlines.pop()
+
+            return (
+                img,
+                patches,
+                (
+                    [i in newlines for i in range(len(patches))]
+                    if len(patches) > 0
+                    else None
+                ),
+            )
+
+
+class Step3VLProcessor:
+    def __init__(
+        self,
+        config,
+        tokenizer,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.tokenizer = tokenizer
+
+        self.image_size = 728
+        self.patch_size = 504
+        self.image_preprocessor = Step3VisionProcessor(
+            self.image_size, "bilinear", self.patch_size
+        )
+
+        self.num_image_feature_size = 169
+        self.num_patch_feature_size = 81
+        self.image_token = "<im_patch>"
+        self.image_feature_placeholder = self.image_token * self.num_image_feature_size
+        self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size
+
+        self.patcher = ImagePatcher()
+
+    @property
+    def image_token_id(self) -> int:
+        return self.tokenizer.get_vocab()[self.image_token]
+
+    def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
+        num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)
+
+        return (
+            num_patches * (self.num_patch_feature_size + 2)
+            + self.num_image_feature_size
+            + 2
+            + num_newlines
+        )
+
+    def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
+        result = []
+        for img in images:
+            result.append(self.patcher(img))
+        return result
+
+    def _convert_images_to_pixel_values(
+        self,
+        images: list[Image.Image],
+        is_patch: bool = False,
+    ) -> list[torch.Tensor]:
+        return [
+            self.image_preprocessor(img, is_patch=is_patch)["pixel_values"]
+            for img in images
+        ]
+
+    def _get_patch_repl(
+        self,
+        num_patches: int,
+        patch_newline_mask: list[bool] | None,
+    ) -> tuple[str, list[int]]:
+        text = ""
+        token_ids = []
+        for i in range(num_patches):
+            assert len(patch_newline_mask) == num_patches
+            text += f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
+            token_ids.extend(
+                [self.tokenizer.convert_tokens_to_ids("<patch_start>")]
+                + [self.image_token_id] * self.num_patch_feature_size
+                + [self.tokenizer.convert_tokens_to_ids("<patch_end>")]
+            )
+            if patch_newline_mask and patch_newline_mask[i]:
+                text += "<patch_newline>"
+                token_ids.append(
+                    self.tokenizer.convert_tokens_to_ids("<patch_newline>")
+                )
+        return text, token_ids
+
+    def _get_image_repl(
+        self,
+        num_images: int,
+    ) -> tuple[str, list[int]]:
+        text = f"<im_start>{self.image_feature_placeholder}<im_end>"
+        token_ids = (
+            [self.tokenizer.convert_tokens_to_ids("<im_start>")]
+            + [self.image_token_id] * self.num_image_feature_size
+            + [self.tokenizer.convert_tokens_to_ids("<im_end>")]
+        )
+        return text * num_images, token_ids * num_images
+
+    def _get_image_repl_features(
+        self,
+        num_images: int,
+        num_patches: int,
+        patch_new_line_idx: Optional[list[bool]],
+    ) -> tuple[str, list[int]]:
+        if num_patches > 0:
+            patch_repl, patch_repl_ids = self._get_patch_repl(
+                num_patches, patch_new_line_idx
+            )
+        else:
+            patch_repl = ""
+            patch_repl_ids = []
+        image_repl, image_repl_ids = self._get_image_repl(num_images)
+        return patch_repl + image_repl, patch_repl_ids + image_repl_ids
+
+    def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
+        parts = text.split(placeholder)
+
+        if len(parts) - 1 != len(repls):
+            raise ValueError(
+                "The number of placeholders does not match the number of replacements."  # noqa: E501
+            )
+
+        result = [parts[0]]
+        for i, repl in enumerate(repls):
+            result.append(repl)
+            result.append(parts[i + 1])
+
+        return "".join(result)
+
+    def __call__(
+        self,
+        text: Optional[Union[str, list[str]]] = None,
+        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        *args,
+        **kwargs,
+    ) -> BatchFeature:
+        if text is None:
+            text = []
+        if not isinstance(text, list):
+            text = [text]
+        if images is None:
+            images = []
+        if not isinstance(images, list):
+            images = [images]
+
+        if len(images) == 0:
+            image_inputs = {}
+            text_inputs = self.tokenizer(text)
+        else:
+            splitted_images_data = self._split_images(images)
+            pixel_values_lst = []
+            patch_pixel_values_lst = []
+            patch_newline_mask_lst = []
+            image_repl_str_lst = []
+            image_repl_ids_lst = []
+            num_patches = []
+            for (
+                raw_img,
+                img_patches,
+                patch_newline_mask,
+            ) in splitted_images_data:  # noqa: E501
+                pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
+
+                if len(img_patches) > 0:
+                    patch_pixel_values_lst.extend(
+                        self._convert_images_to_pixel_values(img_patches, is_patch=True)
+                    )
+                num_patches.append(len(img_patches))
+
+                image_repl_str, image_repl_ids = self._get_image_repl_features(
+                    1, len(img_patches), patch_newline_mask
+                )
+                image_repl_str_lst.append(image_repl_str)
+                image_repl_ids_lst.extend(image_repl_ids)
+
+                if patch_newline_mask is not None:
+                    patch_newline_mask_lst.extend(patch_newline_mask)
+
+            image_inputs = {
+                "pixel_values": torch.cat(pixel_values_lst),
+                "num_patches": num_patches,
+            }
+            if patch_pixel_values_lst:
+                image_inputs["patch_pixel_values"] = torch.cat(patch_pixel_values_lst)
+            if patch_newline_mask_lst:
+                image_inputs["patch_newline_mask"] = torch.tensor(
+                    patch_newline_mask_lst, dtype=torch.bool
+                )
+
+            text = [
+                self.replace_placeholder(t, self.image_token, image_repl_str_lst)
+                for t in text
+            ]
+            text_inputs = self.tokenizer(text)
+
+        return BatchFeature(
+            {
+                **text_inputs,
+                **image_inputs,
+            },
+            tensor_type=return_tensors,
+        )
+
+
+################################################
+
+
+class Step3VLImageProcessor(SGLangBaseProcessor):
+    models = [Step3VLForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        # TODO, check _processor is tokenizer or processor.
+        processor = Step3VLProcessor(hf_config, _processor)
+        super().__init__(hf_config, server_args, processor, *args, **kwargs)
+        self.IM_TOKEN_ID = 128001
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<im_patch>",
+            image_token_id=128001,
+            image_token_regex=re.compile(r"(?:<im_patch>)"),
+        ).build(_processor)
+
+    mean = [0.48145466, 0.4578275, 0.40821073]
+    std = [0.26862954, 0.26130258, 0.27577711]
+
+    def preprocess(self, image):
+        return {"pixel_values": self.transform(image).unsqueeze(0)}
+
+    def __call__(self, image):
+        return self.preprocess(image)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text: str | List[int],
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
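
For orientation, Step3VLProcessor.get_num_image_tokens above computes how many placeholder tokens one image expands to: each crop patch contributes its 81 feature tokens plus <patch_start>/<patch_end>, the global view contributes 169 feature tokens plus <im_start>/<im_end>, and one <patch_newline> is emitted per completed patch row. A small arithmetic sketch that just redoes this formula (the helper below is illustrative, not part of the diff):

NUM_IMAGE_FEATURE_SIZE = 169  # global view, between <im_start> and <im_end>
NUM_PATCH_FEATURE_SIZE = 81   # per patch, between <patch_start> and <patch_end>

def num_image_tokens(num_patches: int, num_newlines: int) -> int:
    return (
        num_patches * (NUM_PATCH_FEATURE_SIZE + 2)  # patches plus their start/end tokens
        + NUM_IMAGE_FEATURE_SIZE + 2                 # global view plus <im_start>/<im_end>
        + num_newlines                               # one <patch_newline> per finished row
    )

print(num_image_tokens(0, 0))  # 171: small image, no crop patches
print(num_image_tokens(4, 1))  # 504: a 2x2 patch grid with one interior newline
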

sglang/srt/poll_based_barrier.py

@@ -0,0 +1,31 @@
+import torch
+
+from sglang.srt.distributed import get_world_group
+
+
+class PollBasedBarrier:
+    def __init__(self, noop: bool = False):
+        self._noop = noop
+        self._local_arrived = False
+
+    def local_arrive(self):
+        assert not self._local_arrived
+        self._local_arrived = True
+
+    def poll_global_arrived(self) -> bool:
+        global_arrived = self._compute_global_arrived()
+        output = self._local_arrived and global_arrived
+        if output:
+            self._local_arrived = False
+        return output
+
+    def _compute_global_arrived(self) -> bool:
+        local_arrived = self._noop or self._local_arrived
+        global_arrived = torch.tensor(local_arrived)
+        # Can optimize if bottleneck
+        torch.distributed.all_reduce(
+            global_arrived,
+            torch.distributed.ReduceOp.MIN,
+            group=get_world_group().cpu_group,
+        )
+        return global_arrived.item()
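
PollBasedBarrier is a reusable, non-blocking barrier: each rank calls local_arrive() once per round, and poll_global_arrived() performs a MIN all-reduce over the CPU group and returns True exactly once, on the poll where every rank has arrived, after which the local flag resets for the next round. A minimal call-pattern sketch, assuming sglang's distributed world group has already been initialized by the runtime (as it is inside the scheduler processes that use this class):

from sglang.srt.poll_based_barrier import PollBasedBarrier

barrier = PollBasedBarrier()

def on_local_work_done():
    # Call exactly once per round on each rank.
    barrier.local_arrive()

def scheduler_tick() -> bool:
    # Must be called collectively by every rank (it issues an all-reduce);
    # returns True only once all ranks have arrived, then the barrier resets.
    return barrier.poll_global_arrived()
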

sglang/srt/reasoning_parser.py

@@ -105,7 +105,7 @@ class BaseReasoningFormatDetector:
         # If we're not in a reasoning block return as normal text
         if not self._in_reasoning:
             self._buffer = ""
-            return StreamingParseResult(normal_text=new_text)
+            return StreamingParseResult(normal_text=current_text)

         return StreamingParseResult()

@@ -233,6 +233,7 @@ class ReasoningParser:
         "qwen3-thinking": Qwen3ThinkingDetector,
         "glm45": Qwen3Detector,
         "kimi": KimiDetector,
+        "step3": DeepSeekR1Detector,
     }

     def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
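
The two reasoning_parser.py hunks do different things: the first fixes streaming output outside a reasoning block to emit the accumulated current_text (which includes any previously buffered chunk) instead of only the latest new_text, and the second registers "step3" so Step3 models reuse DeepSeekR1Detector and its </think>-terminated reasoning convention. A small usage sketch, assuming the existing ReasoningParser.parse_non_stream API (the sample string is made up):

from sglang.srt.reasoning_parser import ReasoningParser

parser = ReasoningParser(model_type="step3")
reasoning, answer = parser.parse_non_stream("First add 2 and 2.</think>The answer is 4.")
print(reasoning)  # "First add 2 and 2."
print(answer)     # "The answer is 4."
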