sglang 0.4.9.post6__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +8 -0
  3. sglang/srt/configs/model_config.py +3 -0
  4. sglang/srt/configs/step3_vl.py +172 -0
  5. sglang/srt/conversation.py +23 -0
  6. sglang/srt/disaggregation/decode.py +2 -8
  7. sglang/srt/disaggregation/prefill.py +2 -6
  8. sglang/srt/distributed/parallel_state.py +86 -1
  9. sglang/srt/entrypoints/engine.py +14 -18
  10. sglang/srt/entrypoints/http_server.py +10 -2
  11. sglang/srt/entrypoints/openai/serving_chat.py +2 -21
  12. sglang/srt/eplb/expert_distribution.py +5 -0
  13. sglang/srt/eplb/expert_location.py +17 -6
  14. sglang/srt/eplb/expert_location_dispatch.py +1 -0
  15. sglang/srt/eplb/expert_location_updater.py +2 -0
  16. sglang/srt/function_call/function_call_parser.py +2 -0
  17. sglang/srt/function_call/step3_detector.py +436 -0
  18. sglang/srt/hf_transformers_utils.py +2 -0
  19. sglang/srt/jinja_template_utils.py +4 -1
  20. sglang/srt/layers/moe/cutlass_moe.py +2 -1
  21. sglang/srt/layers/moe/ep_moe/layer.py +20 -640
  22. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
  23. sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
  24. sglang/srt/layers/quantization/fp8.py +0 -18
  25. sglang/srt/layers/quantization/unquant.py +0 -8
  26. sglang/srt/layers/quantization/w4afp8.py +1 -0
  27. sglang/srt/managers/cache_controller.py +143 -45
  28. sglang/srt/managers/data_parallel_controller.py +2 -0
  29. sglang/srt/managers/io_struct.py +0 -2
  30. sglang/srt/managers/scheduler.py +89 -671
  31. sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
  32. sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
  33. sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
  34. sglang/srt/managers/template_manager.py +62 -19
  35. sglang/srt/managers/tokenizer_manager.py +123 -74
  36. sglang/srt/managers/tp_worker.py +4 -0
  37. sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
  38. sglang/srt/mem_cache/hicache_storage.py +45 -11
  39. sglang/srt/mem_cache/hiradix_cache.py +15 -4
  40. sglang/srt/mem_cache/memory_pool_host.py +73 -1
  41. sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
  42. sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
  43. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
  44. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
  45. sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
  46. sglang/srt/model_executor/model_runner.py +5 -0
  47. sglang/srt/models/arcee.py +532 -0
  48. sglang/srt/models/deepseek_v2.py +2 -0
  49. sglang/srt/models/glm4_moe.py +3 -1
  50. sglang/srt/models/granitemoe.py +3 -0
  51. sglang/srt/models/grok.py +3 -0
  52. sglang/srt/models/hunyuan.py +1 -0
  53. sglang/srt/models/llama4.py +3 -0
  54. sglang/srt/models/mixtral.py +3 -0
  55. sglang/srt/models/olmoe.py +3 -0
  56. sglang/srt/models/phimoe.py +1 -0
  57. sglang/srt/models/step3_vl.py +994 -0
  58. sglang/srt/multimodal/processors/base_processor.py +15 -16
  59. sglang/srt/multimodal/processors/step3_vl.py +515 -0
  60. sglang/srt/reasoning_parser.py +2 -1
  61. sglang/srt/server_args.py +10 -13
  62. sglang/srt/speculative/eagle_worker.py +2 -0
  63. sglang/utils.py +0 -11
  64. sglang/version.py +1 -1
  65. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/METADATA +3 -4
  66. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/RECORD +69 -56
  67. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
  68. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
  69. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/base_processor.py CHANGED
@@ -176,6 +176,8 @@ class BaseMultimodalProcessor(ABC):
          "image_grid_hws": Modality.IMAGE,
          "aspect_ratio_ids": Modality.IMAGE,
          "aspect_ratio_mask": Modality.IMAGE,
+         "num_patches": Modality.IMAGE,
+         "patch_pixel_values": Modality.IMAGE,
          # Audio-related attributes
          "audio_features": Modality.AUDIO,
          "audio_feature_lens": Modality.AUDIO,
@@ -192,7 +194,12 @@ class BaseMultimodalProcessor(ABC):
 
          # name of the feature filed
          # TODO: pass from processors
-         self.FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features"]
+         self.FEATURE_NAMES = [
+             "pixel_values",
+             "pixel_values_videos",
+             "audio_features",
+             "input_features",
+         ]
 
      def process_mm_data(
          self, input_text, images=None, videos=None, audios=None, **kwargs
@@ -221,6 +228,13 @@ class BaseMultimodalProcessor(ABC):
              return_tensors="pt",
              **kwargs,
          )
+         # move feature tensors to cpu
+         for feature_name in self.FEATURE_NAMES:
+             if feature_name in result and isinstance(
+                 result[feature_name], torch.Tensor
+             ):
+                 result[feature_name] = result[feature_name].to("cpu")
+
          return result
 
      @abstractmethod
@@ -623,19 +637,4 @@ class BaseMultimodalProcessor(ABC):
              mm_token_id=mm_token_id,
          )
 
-         # post-process
-         for item in all_collected_items:
-             # replace the feature tensor with a proxy
-             if isinstance(item.feature, torch.Tensor) and item.feature.is_cuda:
-                 item.feature = TransportProxyTensor(
-                     transport_mode=self.transport_mode, data=item.feature
-                 )
-             elif (
-                 isinstance(item.precomputed_embeddings, torch.Tensor)
-                 and item.precomputed_embeddings.is_cuda
-             ):
-                 item.precomputed_embeddings = TransportProxyTensor(
-                     transport_mode=self.transport_mode, data=item.precomputed_embeddings
-                 )
-
          return all_collected_items, input_ids, ret
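Taken together, the base_processor.py hunks move device handling earlier: instead of wrapping CUDA-resident features in TransportProxyTensor as a post-processing step, any feature tensor named in FEATURE_NAMES is pinned to CPU right after the HF processor runs. A minimal sketch of the new behavior (the `result` dict here is hypothetical, not code from the diff):

    import torch

    FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features", "input_features"]
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Assumed processor output; some features may land on GPU.
    result = {"pixel_values": torch.rand(1, 3, 728, 728, device=device)}

    for name in FEATURE_NAMES:
        if name in result and isinstance(result[name], torch.Tensor):
            result[name] = result[name].to("cpu")  # IPC-bound tensors stay on host

    assert result["pixel_values"].device.type == "cpu"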
sglang/srt/multimodal/processors/step3_vl.py ADDED
@@ -0,0 +1,515 @@
+ import math
+ import re
+ from itertools import product
+ from typing import List, Literal, Optional, TypedDict, Union
+
+ import numpy as np
+ import torch
+ from PIL import Image
+ from torchvision import transforms
+ from torchvision.transforms import InterpolationMode
+ from transformers import BatchFeature, TensorType
+
+ from sglang.srt.models.step3_vl import Step3VLForConditionalGeneration
+ from sglang.srt.multimodal.processors.base_processor import (
+     BaseMultimodalProcessor as SGLangBaseProcessor,
+ )
+ from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+
+ ImageWithPatches = tuple[Image.Image, list[Image.Image], list[int] | None]
+
+
+ class GPUToTensor(torch.nn.Module):
+
+     def forward(self, raw_image: Union[np.ndarray, Image.Image]) -> torch.Tensor:
+         if isinstance(raw_image, Image.Image):
+             return transforms.ToTensor()(raw_image)
+         if raw_image.ndim == 2:
+             raw_image = raw_image[:, :, None].repeat(3, -1)
+         if torch.cuda.is_available():
+             device = torch.device("cuda")
+         else:
+             device = torch.device("cpu")
+         image_tensor = torch.from_numpy(raw_image).to(device)
+         image_tensor = torch.permute(image_tensor, (2, 0, 1)).contiguous()
+         if image_tensor.dtype == torch.uint8:
+             image_tensor = image_tensor.to(torch.float32).div(255)
+         return image_tensor
+
+
+ class Step3VisionProcessor:
+     def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
+         mean = [0.48145466, 0.4578275, 0.40821073]
+         std = [0.26862954, 0.26130258, 0.27577711]
+         patch_size = patch_size if patch_size is not None else size
+
+         self.transform = transforms.Compose(
+             [
+                 GPUToTensor(),
+                 transforms.Normalize(mean, std),
+                 transforms.Resize(
+                     (size, size),
+                     interpolation=(
+                         InterpolationMode.BICUBIC
+                         if interpolation_mode == "bicubic"
+                         else InterpolationMode.BILINEAR
+                     ),
+                     antialias=True,
+                 ),
+             ]
+         )
+
+         self.patch_transform = (
+             transforms.Compose(
+                 [
+                     GPUToTensor(),
+                     transforms.Normalize(mean, std),
+                     transforms.Resize(
+                         (patch_size, patch_size),
+                         interpolation=(
+                             InterpolationMode.BICUBIC
+                             if interpolation_mode == "bicubic"
+                             else InterpolationMode.BILINEAR
+                         ),
+                         antialias=True,
+                     ),
+                 ]
+             )
+             if patch_size is not None
+             else None
+         )
+
+     def __call__(self, image, is_patch=False):
+         if is_patch:
+             return {"pixel_values": self.patch_transform(image).unsqueeze(0)}
+         else:
+             return {"pixel_values": self.transform(image).unsqueeze(0)}
+
+
+ class ImagePatcher:
+
+     def determine_window_size(self, long: int, short: int) -> int:
+         if long <= 728:
+             return short if long / short > 1.5 else 0
+         return min(short, 504) if long / short > 4 else 504
+
+     def slide_window(
+         self,
+         width: int,
+         height: int,
+         sizes: list[tuple[int, int]],
+         steps: list[tuple[int, int]],
+         img_rate_thr: float = 0.6,
+     ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
+         assert 1 >= img_rate_thr >= 0, "The `in_rate_thr` should lie in 0~1"
+         windows = []
+         # Sliding windows.
+         for size, step in zip(sizes, steps):
+             size_w, size_h = size
+             step_w, step_h = step
+
+             x_num = 1 if width <= size_w else math.ceil((width - size_w) / step_w + 1)
+             x_start = [step_w * i for i in range(x_num)]
+             if len(x_start) > 1 and x_start[-1] + size_w > width:
+                 x_start[-1] = width - size_w
+
+             y_num = 1 if height <= size_h else math.ceil((height - size_h) / step_h + 1)
+             y_start = [step_h * i for i in range(y_num)]
+             if len(y_start) > 1 and y_start[-1] + size_h > height:
+                 y_start[-1] = height - size_h
+
+             start = np.array(list(product(y_start, x_start)), dtype=int)
+             start[:, [0, 1]] = start[:, [1, 0]]
+             windows.append(np.concatenate([start, start + size], axis=1))
+         windows = np.concatenate(windows, axis=0)
+
+         return [
+             (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
+             for box in windows
+         ], (x_num, y_num)
+
+     def square_pad(self, img: Image.Image) -> Image.Image:
+         w, h = img.size
+         if w == h:
+             return img
+         size = max(w, h)
+         padded = Image.new(img.mode, (size, size), 0)
+         padded.paste(img, (0, 0))
+         return padded
+
+     def get_image_size_for_padding(
+         self, img_width: int, img_height: int
+     ) -> tuple[int, int]:
+         ratio = img_width / img_height
+         if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
+             new_size = max(img_height, img_width)
+             return new_size, new_size
+         return img_width, img_height
+
+     def get_image_size_for_preprocess(
+         self, img_width: int, img_height: int
+     ) -> tuple[int, int]:
+
+         if max(img_height, img_width) > 3024:
+             scale_factor = 3024 / max(img_height, img_width)
+             img_width = int(img_width * scale_factor)
+             img_height = int(img_height * scale_factor)
+             return img_width, img_height
+         else:
+             return img_width, img_height
+
+     def get_image_size_for_crop(
+         self, img_width: int, img_height: int, window_size: int
+     ):
+         w_ratio = img_width / window_size
+         h_ratio = img_height / window_size
+
+         if w_ratio < 1:
+             width_new = img_width
+         else:
+             decimal_w = w_ratio - img_width // window_size
+             w_ratio = int(w_ratio) + 1 if decimal_w > 0.2 else int(w_ratio)
+             width_new = window_size * w_ratio
+         if h_ratio < 1:
+             height_new = img_height
+         else:
+             decimal_h = h_ratio - img_height // window_size
+             h_ratio = int(h_ratio) + 1 if decimal_h > 0.2 else int(h_ratio)
+             height_new = window_size * h_ratio
+         return int(width_new), int(height_new)
+
+     def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
+         target = img.crop((j, i, j + tw, i + th))
+         return target
+
+     def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
+         img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
+         img_width, img_height = self.get_image_size_for_preprocess(
+             img_width, img_height
+         )
+         window_size = self.determine_window_size(
+             max(img_height, img_width), min(img_height, img_width)
+         )
+         if window_size == 0:
+             return 0, 0
+         else:
+             img_width, img_height = self.get_image_size_for_crop(
+                 img_width, img_height, window_size
+             )
+             center_list, (x_num, y_num) = self.slide_window(
+                 img_width,
+                 img_height,
+                 [(window_size, window_size)],
+                 [(window_size, window_size)],
+             )
+             full_rows = (len(center_list) - 1) // x_num + 1
+             if len(center_list) > 0 and len(center_list) % x_num == 0:
+                 full_rows -= 1
+             return len(center_list), full_rows
+
+     def __call__(
+         self, img: Image.Image
+     ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
+         img_width, img_height = img.size
+         new_img_width, new_img_height = self.get_image_size_for_padding(
+             img_width, img_height
+         )
+         if new_img_width != img_width or new_img_height != img_height:
+             img = self.square_pad(img)
+             img_width, img_height = img.size
+
+         new_img_width, new_img_height = self.get_image_size_for_preprocess(
+             img_width, img_height
+         )
+         img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
+         window_size = self.determine_window_size(
+             max(new_img_height, new_img_width), min(new_img_height, new_img_width)
+         )
+         if window_size == 0:
+             return img, [], None
+         else:
+             new_img_width, new_img_height = self.get_image_size_for_crop(
+                 new_img_width, new_img_height, window_size
+             )
+             if (new_img_width, new_img_height) != (img_width, img_height):
+                 img_for_crop = img.resize(
+                     (new_img_width, new_img_height), Image.Resampling.BILINEAR
+                 )
+             else:
+                 img_for_crop = img
+
+             patches = []
+             newlines = []
+             center_list, (x_num, y_num) = self.slide_window(
+                 new_img_width,
+                 new_img_height,
+                 [(window_size, window_size)],
+                 [(window_size, window_size)],
+             )
+             for patch_id, center_lf_point in enumerate(center_list):
+                 x, y, patch_w, patch_h = center_lf_point
+                 big_patch = self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
+                 patches.append(big_patch)
+                 if (patch_id + 1) % x_num == 0:
+                     newlines.append(patch_id)
+
+             if newlines and newlines[-1] == len(patches) - 1:
+                 newlines.pop()
+
+             return (
+                 img,
+                 patches,
+                 (
+                     [i in newlines for i in range(len(patches))]
+                     if len(patches) > 0
+                     else None
+                 ),
+             )
+
+
+ class Step3VLProcessor:
+     def __init__(
+         self,
+         config,
+         tokenizer,
+     ) -> None:
+         super().__init__()
+
+         self.config = config
+         self.tokenizer = tokenizer
+
+         self.image_size = 728
+         self.patch_size = 504
+         self.image_preprocessor = Step3VisionProcessor(
+             self.image_size, "bilinear", self.patch_size
+         )
+
+         self.num_image_feature_size = 169
+         self.num_patch_feature_size = 81
+         self.image_token = "<im_patch>"
+         self.image_feature_placeholder = self.image_token * self.num_image_feature_size
+         self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size
+
+         self.patcher = ImagePatcher()
+
+     @property
+     def image_token_id(self) -> int:
+         return self.tokenizer.get_vocab()[self.image_token]
+
+     def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
+         num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)
+
+         return (
+             num_patches * (self.num_patch_feature_size + 2)
+             + self.num_image_feature_size
+             + 2
+             + num_newlines
+         )
+
+     def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
+         result = []
+         for img in images:
+             result.append(self.patcher(img))
+         return result
+
+     def _convert_images_to_pixel_values(
+         self,
+         images: list[Image.Image],
+         is_patch: bool = False,
+     ) -> list[torch.Tensor]:
+         return [
+             self.image_preprocessor(img, is_patch=is_patch)["pixel_values"]
+             for img in images
+         ]
+
+     def _get_patch_repl(
+         self,
+         num_patches: int,
+         patch_newline_mask: list[bool] | None,
+     ) -> tuple[str, list[int]]:
+         text = ""
+         token_ids = []
+         for i in range(num_patches):
+             assert len(patch_newline_mask) == num_patches
+             text += f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
+             token_ids.extend(
+                 [self.tokenizer.convert_tokens_to_ids("<patch_start>")]
+                 + [self.image_token_id] * self.num_patch_feature_size
+                 + [self.tokenizer.convert_tokens_to_ids("<patch_end>")]
+             )
+             if patch_newline_mask and patch_newline_mask[i]:
+                 text += "<patch_newline>"
+                 token_ids.append(
+                     self.tokenizer.convert_tokens_to_ids("<patch_newline>")
+                 )
+         return text, token_ids
+
+     def _get_image_repl(
+         self,
+         num_images: int,
+     ) -> tuple[str, list[int]]:
+         text = f"<im_start>{self.image_feature_placeholder}<im_end>"
+         token_ids = (
+             [self.tokenizer.convert_tokens_to_ids("<im_start>")]
+             + [self.image_token_id] * self.num_image_feature_size
+             + [self.tokenizer.convert_tokens_to_ids("<im_end>")]
+         )
+         return text * num_images, token_ids * num_images
+
+     def _get_image_repl_features(
+         self,
+         num_images: int,
+         num_patches: int,
+         patch_new_line_idx: Optional[list[bool]],
+     ) -> tuple[str, list[int]]:
+         if num_patches > 0:
+             patch_repl, patch_repl_ids = self._get_patch_repl(
+                 num_patches, patch_new_line_idx
+             )
+         else:
+             patch_repl = ""
+             patch_repl_ids = []
+         image_repl, image_repl_ids = self._get_image_repl(num_images)
+         return patch_repl + image_repl, patch_repl_ids + image_repl_ids
+
+     def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
+         parts = text.split(placeholder)
+
+         if len(parts) - 1 != len(repls):
+             raise ValueError(
+                 "The number of placeholders does not match the number of replacements."  # noqa: E501
+             )
+
+         result = [parts[0]]
+         for i, repl in enumerate(repls):
+             result.append(repl)
+             result.append(parts[i + 1])
+
+         return "".join(result)
+
+     def __call__(
+         self,
+         text: Optional[Union[str, list[str]]] = None,
+         images: Optional[Union[Image.Image, list[Image.Image]]] = None,
+         return_tensors: Optional[Union[str, TensorType]] = None,
+         *args,
+         **kwargs,
+     ) -> BatchFeature:
+         if text is None:
+             text = []
+         if not isinstance(text, list):
+             text = [text]
+         if images is None:
+             images = []
+         if not isinstance(images, list):
+             images = [images]
+
+         if len(images) == 0:
+             image_inputs = {}
+             text_inputs = self.tokenizer(text)
+         else:
+             splitted_images_data = self._split_images(images)
+             pixel_values_lst = []
+             patch_pixel_values_lst = []
+             patch_newline_mask_lst = []
+             image_repl_str_lst = []
+             image_repl_ids_lst = []
+             num_patches = []
+             for (
+                 raw_img,
+                 img_patches,
+                 patch_newline_mask,
+             ) in splitted_images_data:  # noqa: E501
+                 pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
+
+                 if len(img_patches) > 0:
+                     patch_pixel_values_lst.extend(
+                         self._convert_images_to_pixel_values(img_patches, is_patch=True)
+                     )
+                 num_patches.append(len(img_patches))
+
+                 image_repl_str, image_repl_ids = self._get_image_repl_features(
+                     1, len(img_patches), patch_newline_mask
+                 )
+                 image_repl_str_lst.append(image_repl_str)
+                 image_repl_ids_lst.extend(image_repl_ids)
+
+                 if patch_newline_mask is not None:
+                     patch_newline_mask_lst.extend(patch_newline_mask)
+
+             image_inputs = {
+                 "pixel_values": torch.cat(pixel_values_lst),
+                 "num_patches": num_patches,
+             }
+             if patch_pixel_values_lst:
+                 image_inputs["patch_pixel_values"] = torch.cat(patch_pixel_values_lst)
+             if patch_newline_mask_lst:
+                 image_inputs["patch_newline_mask"] = torch.tensor(
+                     patch_newline_mask_lst, dtype=torch.bool
+                 )
+
+             text = [
+                 self.replace_placeholder(t, self.image_token, image_repl_str_lst)
+                 for t in text
+             ]
+             text_inputs = self.tokenizer(text)
+
+         return BatchFeature(
+             {
+                 **text_inputs,
+                 **image_inputs,
+             },
+             tensor_type=return_tensors,
+         )
+
+
+ ################################################
+
+
+ class Step3VLImageProcessor(SGLangBaseProcessor):
+     models = [Step3VLForConditionalGeneration]
+
+     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+         # TODO, check _processor is tokenizer or processor.
+         processor = Step3VLProcessor(hf_config, _processor)
+         super().__init__(hf_config, server_args, processor, *args, **kwargs)
+         self.IM_TOKEN_ID = 128001
+         self.mm_tokens = MultimodalSpecialTokens(
+             image_token="<im_patch>",
+             image_token_id=128001,
+             image_token_regex=re.compile(r"(?:<im_patch>)"),
+         ).build(_processor)
+
+     mean = [0.48145466, 0.4578275, 0.40821073]
+     std = [0.26862954, 0.26130258, 0.27577711]
+
+     def preprocess(self, image):
+         return {"pixel_values": self.transform(image).unsqueeze(0)}
+
+     def __call__(self, image):
+         return self.preprocess(image)
+
+     async def process_mm_data_async(
+         self,
+         image_data: List[Union[str, bytes]],
+         input_text: str | List[int],
+         request_obj,
+         *args,
+         **kwargs,
+     ):
+         base_output = self.load_mm_data(
+             prompt=input_text,
+             image_data=image_data,
+             video_data=request_obj.video_data,
+             multimodal_tokens=self.mm_tokens,
+         )
+
+         mm_items, input_ids, ret = self.process_and_combine_mm_data(
+             base_output, self.mm_tokens
+         )
+
+         return {
+             "input_ids": input_ids.tolist(),
+             "mm_items": mm_items,
+             "im_token_id": self.mm_tokens.image_token_id,
+         }
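To make the token accounting in get_num_image_tokens concrete, here is a hand-derived example (the image size is illustrative; the values follow the arithmetic in the code above):

    from sglang.srt.multimodal.processors.step3_vl import ImagePatcher

    patcher = ImagePatcher()
    # A 1000x800 image: long side 1000 > 728 with aspect ratio 1.25, so
    # window_size = 504; cropping rounds up to 1008x1008, a 2x2 patch grid.
    num_patches, num_newlines = patcher.get_num_patches(1000, 800)
    assert (num_patches, num_newlines) == (4, 1)  # the trailing row's newline is dropped
    # 4 patches x (81 features + <patch_start>/<patch_end>)
    # + 169 global features + <im_start>/<im_end> + 1 <patch_newline>
    total = num_patches * (81 + 2) + 169 + 2 + num_newlines  # = 504 image tokens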
sglang/srt/reasoning_parser.py CHANGED
@@ -105,7 +105,7 @@ class BaseReasoningFormatDetector:
          # If we're not in a reasoning block return as normal text
          if not self._in_reasoning:
              self._buffer = ""
-             return StreamingParseResult(normal_text=new_text)
+             return StreamingParseResult(normal_text=current_text)
 
          return StreamingParseResult()
 
@@ -233,6 +233,7 @@ class ReasoningParser:
          "qwen3-thinking": Qwen3ThinkingDetector,
          "glm45": Qwen3Detector,
          "kimi": KimiDetector,
+         "step3": DeepSeekR1Detector,
      }
 
      def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
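Because "step3" maps onto the existing DeepSeekR1Detector, Step3 reasoning output is parsed the same way as DeepSeek-R1's <think> format. A hedged usage sketch (the sample text is illustrative, and it assumes the parser's existing non-streaming entry point):

    from sglang.srt.reasoning_parser import ReasoningParser

    parser = ReasoningParser(model_type="step3", stream_reasoning=False)
    # DeepSeekR1Detector treats generation as starting inside a reasoning block,
    # so everything before </think> is reasoning text.
    reasoning, normal = parser.parse_non_stream("I should greet the user.</think>Hello!")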
sglang/srt/server_args.py CHANGED
@@ -270,14 +270,6 @@ class ServerArgs:
      sm_group_num: int = 3
 
      def __post_init__(self):
-         # Expert parallelism
-         # We put it here first due to some internal ckpt conversation issues.
-         if self.enable_ep_moe:
-             self.ep_size = self.tp_size
-             logger.warning(
-                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-             )
-
          # Set missing default values
          if self.tokenizer_path is None:
              self.tokenizer_path = self.model_path
@@ -1117,9 +1109,10 @@ class ServerArgs:
              "kimi_k2",
              "qwen3_coder",
              "glm45",
+             "step3",
          ],
          default=ServerArgs.tool_call_parser,
-         help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and 'qwen3_coder'.",
+         help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
      )
 
      # Data parallelism
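In practice the new parser is selected at launch, e.g. `python -m sglang.launch_server --model-path <model> --tool-call-parser step3` (illustrative invocation), typically alongside the `--reasoning-parser step3` registration from the reasoning_parser.py hunk above.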
@@ -1334,6 +1327,7 @@ class ServerArgs:
          parser.add_argument(
              "--expert-parallel-size",
              "--ep-size",
+             "--ep",
              type=int,
              default=ServerArgs.ep_size,
              help="The expert parallelism size.",
@@ -1476,7 +1470,7 @@ class ServerArgs:
          parser.add_argument(
              "--hicache-storage-backend",
              type=str,
-             choices=["file"], # todo, mooncake
+             choices=["file", "mooncake", "hf3fs"],
              default=ServerArgs.hicache_storage_backend,
              help="The storage backend for hierarchical KV cache.",
          )
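Together with the new mem_cache modules listed above (mooncake_store and storage/hf3fs), this makes `--hicache-storage-backend mooncake` and `--hicache-storage-backend hf3fs` valid launch choices alongside the existing `file` backend.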
@@ -2071,6 +2065,9 @@ class PortArgs:
 
          dist_init_host, dist_init_port = dist_init_addr
          port_base = int(dist_init_port) + 1
+         detokenizer_port = port_base + 1
+         rpc_port = port_base + 2
+         metrics_ipc_name = port_base + 3
          if dp_rank is None:
              # TokenizerManager to DataParallelController
              scheduler_input_port = port_base + 4
@@ -2080,10 +2077,10 @@ class PortArgs:
          return PortArgs(
              tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
              scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
-             detokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base + 1}",
+             detokenizer_ipc_name=f"tcp://{dist_init_host}:{detokenizer_port}",
              nccl_port=nccl_port,
-             rpc_ipc_name=f"tcp://{dist_init_host}:{port_base + 2}",
-             metrics_ipc_name=f"tcp://{dist_init_host}:{port_base + 3}",
+             rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
+             metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
          )
 
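The PortArgs refactor only names the derived ports (note the local variable `metrics_ipc_name` holds a port number despite its name); the port layout itself is unchanged. A hand-derived sketch for a hypothetical `dist_init_addr = "10.0.0.1:5000"`:

    dist_init_port = 5000
    port_base = dist_init_port + 1        # 5001 -> tokenizer_ipc_name
    detokenizer_port = port_base + 1      # 5002 -> detokenizer_ipc_name
    rpc_port = port_base + 2              # 5003 -> rpc_ipc_name
    metrics_port = port_base + 3          # 5004 -> metrics_ipc_name
    scheduler_input_port = port_base + 4  # 5005 when dp_rank is None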
 
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -73,6 +73,7 @@ class EAGLEWorker(TpModelWorker):
          gpu_id: int,
          tp_rank: int,
          dp_rank: Optional[int],
+         moe_ep_rank: int,
          nccl_port: int,
          target_worker: TpModelWorker,
      ):
@@ -127,6 +128,7 @@ class EAGLEWorker(TpModelWorker):
              tp_rank=tp_rank,
              pp_rank=0,  # FIXME
              dp_rank=dp_rank,
+             moe_ep_rank=moe_ep_rank,
              nccl_port=nccl_port,
              is_draft_worker=True,
              req_to_token_pool=self.req_to_token_pool,
sglang/utils.py CHANGED
@@ -291,17 +291,6 @@ def find_printable_text(text: str):
      return text[: text.rfind(" ") + 1]
 
 
- def graceful_registry(sub_module_name: str):
-     def graceful_shutdown(signum, frame):
-         logger.info(
-             f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
-         )
-         if signum == signal.SIGTERM:
-             logger.info(f"{sub_module_name} receive sigterm")
-
-     signal.signal(signal.SIGTERM, graceful_shutdown)
-
-
  class LazyImport:
      """Lazy import to make `import sglang` run faster."""
 
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.9.post6"
+ __version__ = "0.4.10"
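With the bump, this release is installable as `pip install sglang==0.4.10`.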