sglang 0.4.9.post1__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/srt/configs/model_config.py +24 -1
- sglang/srt/conversation.py +21 -2
- sglang/srt/disaggregation/ascend/__init__.py +6 -0
- sglang/srt/disaggregation/ascend/conn.py +44 -0
- sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
- sglang/srt/disaggregation/mooncake/conn.py +15 -14
- sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
- sglang/srt/disaggregation/utils.py +25 -3
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/http_server.py +1 -0
- sglang/srt/entrypoints/openai/protocol.py +11 -0
- sglang/srt/entrypoints/openai/serving_chat.py +7 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/kimik2_detector.py +220 -0
- sglang/srt/hf_transformers_utils.py +18 -0
- sglang/srt/jinja_template_utils.py +8 -0
- sglang/srt/layers/communicator.py +17 -4
- sglang/srt/layers/linear.py +12 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -2
- sglang/srt/layers/moe/topk.py +8 -2
- sglang/srt/layers/parameter.py +19 -3
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/moe_wna16.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +738 -14
- sglang/srt/managers/io_struct.py +27 -2
- sglang/srt/managers/mm_utils.py +55 -94
- sglang/srt/managers/schedule_batch.py +16 -5
- sglang/srt/managers/scheduler.py +21 -1
- sglang/srt/managers/tokenizer_manager.py +16 -0
- sglang/srt/mem_cache/memory_pool.py +65 -40
- sglang/srt/model_executor/forward_batch_info.py +13 -1
- sglang/srt/model_loader/loader.py +23 -12
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +62 -17
- sglang/srt/models/deepseek_vl2.py +1 -1
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +6 -3
- sglang/srt/models/internvl.py +8 -2
- sglang/srt/models/kimi_vl.py +8 -2
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llava.py +3 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpmo.py +1 -2
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral_quant.py +4 -0
- sglang/srt/models/mllama4.py +13 -4
- sglang/srt/models/phi4mm.py +8 -2
- sglang/srt/models/phimoe.py +553 -0
- sglang/srt/models/qwen2.py +2 -0
- sglang/srt/models/qwen2_5_vl.py +10 -7
- sglang/srt/models/qwen2_vl.py +12 -1
- sglang/srt/models/vila.py +8 -2
- sglang/srt/multimodal/processors/base_processor.py +197 -137
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
- sglang/srt/multimodal/processors/gemma3.py +4 -2
- sglang/srt/multimodal/processors/gemma3n.py +1 -1
- sglang/srt/multimodal/processors/internvl.py +1 -1
- sglang/srt/multimodal/processors/janus_pro.py +1 -1
- sglang/srt/multimodal/processors/kimi_vl.py +1 -1
- sglang/srt/multimodal/processors/minicpm.py +4 -3
- sglang/srt/multimodal/processors/mllama4.py +1 -1
- sglang/srt/multimodal/processors/phi4mm.py +1 -1
- sglang/srt/multimodal/processors/pixtral.py +1 -1
- sglang/srt/multimodal/processors/qwen_vl.py +203 -80
- sglang/srt/multimodal/processors/vila.py +1 -1
- sglang/srt/server_args.py +11 -4
- sglang/srt/utils.py +154 -31
- sglang/version.py +1 -1
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +4 -3
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +75 -70
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/pixtral.py
CHANGED
@@ -106,7 +106,7 @@ class PixtralProcessor(BaseMultimodalProcessor):
                     pixel_values=processor_output["pixel_values"],
                     image_sizes=processor_output["image_sizes"],
                     modality=Modality.IMAGE,
-
+                    offsets=image_offsets,
                 )
             ]
 
sglang/srt/multimodal/processors/qwen_vl.py
CHANGED
@@ -1,9 +1,13 @@
 import asyncio
 import math
+import os
 import re
-from typing import
+from typing import List, Union
 
+import torch
+import torchvision
 from PIL import Image
+from torchvision.transforms import InterpolationMode
 
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
@@ -12,6 +16,185 @@ from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
 from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+from sglang.utils import logger
+
+IMAGE_FACTOR = 28
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+VIDEO_TOTAL_PIXELS = int(
+    float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
+)
+
+VIDEO_MIN_PIXELS = 128 * 28 * 28
+VIDEO_MAX_PIXELS = 768 * 28 * 28
+FRAME_FACTOR = 2
+FPS = 2.0
+FPS_MIN_FRAMES = 4
+FPS_MAX_FRAMES = 768
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
+) -> tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return h_bar, w_bar
+
+
+def resize_image(image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
+    width, height = image.size
+    min_pixels = MIN_PIXELS
+    max_pixels = MAX_PIXELS
+    resized_height, resized_width = smart_resize(
+        height,
+        width,
+        factor=size_factor,
+        min_pixels=min_pixels,
+        max_pixels=max_pixels,
+    )
+    image = image.resize((resized_width, resized_height))
+    return image
+
+
+def round_by_factor(number: int, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+
+async def resize_image_async(image):
+    return resize_image(image)
+
+
+def smart_nframes(
+    ele: dict,
+    total_frames: int,
+    video_fps: int | float,
+) -> int:
+    """calculate the number of frames for video used for model inputs.
+
+    Args:
+        ele (dict): a dict contains the configuration of video.
+            support either `fps` or `nframes`:
+                - nframes: the number of frames to extract for model inputs.
+                - fps: the fps to extract frames for model inputs.
+                    - min_frames: the minimum number of frames of the video, only used when fps is provided.
+                    - max_frames: the maximum number of frames of the video, only used when fps is provided.
+        total_frames (int): the original total number of frames of the video.
+        video_fps (int | float): the original fps of the video.
+
+    Raises:
+        ValueError: nframes should in interval [FRAME_FACTOR, total_frames].
+
+    Returns:
+        int: the number of frames for video used for model inputs.
+    """
+    assert not (
+        "fps" in ele and "nframes" in ele
+    ), "Only accept either `fps` or `nframes`"
+    if "nframes" in ele:
+        nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
+    else:
+        fps = ele.get("fps", FPS)
+        min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
+        max_frames = floor_by_factor(
+            ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR
+        )
+        nframes = total_frames / video_fps * fps
+        if nframes > total_frames:
+            logger.warning(
+                f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]"
+            )
+        nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
+        nframes = floor_by_factor(nframes, FRAME_FACTOR)
+    if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
+        raise ValueError(
+            f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}."
+        )
+    return nframes
+
+
+# process video, qwen-specific
+async def preprocess_video(
+    vr,
+    image_factor: int = IMAGE_FACTOR,
+    # vr: VideoReader, image_factor: int = IMAGE_FACTOR
+) -> torch.Tensor:
+    ele = {}
+    total_frames, video_fps = len(vr), vr.get_avg_fps()
+    nframes = smart_nframes({}, total_frames=total_frames, video_fps=video_fps)
+    idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
+    video = vr.get_batch(idx).asnumpy()
+    video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
+    nframes, _, height, width = video.shape
+    min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
+    total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
+    max_pixels = max(
+        min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR),
+        int(min_pixels * 1.05),
+    )
+    max_pixels_supposed = ele.get("max_pixels", max_pixels)
+    if max_pixels_supposed > max_pixels:
+        logger.warning(
+            f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}]."
+        )
+    max_pixels = min(max_pixels_supposed, max_pixels)
+    if "resized_height" in ele and "resized_width" in ele:
+        resized_height, resized_width = smart_resize(
+            ele["resized_height"],
+            ele["resized_width"],
+            factor=image_factor,
+        )
+    else:
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=image_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+    video = torchvision.transforms.functional.resize(
+        video,
+        [resized_height, resized_width],
+        interpolation=InterpolationMode.BICUBIC,
+        antialias=True,
+    ).float()
+    return video
 
 
 # Compatible with Qwen2VL and Qwen2_5VL
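Note on the resizing math above: smart_resize snaps both sides to multiples of `factor` and, when the result exceeds the pixel budget, shrinks both sides by the square root of the overshoot so the aspect ratio is preserved. Here is a minimal standalone sketch of that arithmetic (helpers re-declared locally; the min-pixels branch and the aspect-ratio guard are omitted, and the input sizes are illustrative, not from the diff):

import math

def round_by_factor(number, factor):
    # Closest multiple of `factor`.
    return round(number / factor) * factor

def floor_by_factor(number, factor):
    # Largest multiple of `factor` <= number.
    return math.floor(number / factor) * factor

def smart_resize(height, width, factor=28, max_pixels=16384 * 28 * 28):
    # Snap both sides to the nearest multiple of `factor` (at least one factor).
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    # Over the pixel budget: divide both sides by sqrt of the overshoot,
    # flooring to multiples of `factor` so the result stays under budget.
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    return h_bar, w_bar

print(smart_resize(3024, 4032))  # (3024, 4032): already multiples of 28 and under budget
print(smart_resize(6000, 8000))  # (3080, 4116): scaled down by beta ~= 1.93, then floored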
@@ -37,104 +220,44 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.MIN_PIXELS = 4 * 28 * 28
         self.MAX_PIXELS = 16384 * 28 * 28
         self.MAX_RATIO = 200
+        # TODO(mick): move all MultimodalSpecialTokens initializations into processor init
+        self.mm_special_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_regex=self.IMAGE_TOKEN_REGEX,
+            video_token=self.VIDEO_TOKEN_ID,
+        )
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes
+        image_data: List[Union[str, bytes]],
         input_text,
         request_obj,
         max_req_input_len,
         *args,
         **kwargs,
     ):
+
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-
-
-                image_token_regex=self.IMAGE_TOKEN_REGEX,
-            ),
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_special_tokens,
             max_req_input_len=max_req_input_len,
         )
 
-        def smart_resize(
-            height: int,
-            width: int,
-            factor: int = self.IMAGE_FACTOR,
-            min_pixels: int = self.MIN_PIXELS,
-            max_pixels: int = self.MAX_PIXELS,
-        ) -> tuple[int, int]:
-            """
-            Rescales the image so that the following conditions are met:
-
-            1. Both dimensions (height and width) are divisible by 'factor'.
-
-            2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
-            3. The aspect ratio of the image is maintained as closely as possible.
-            """
-            if max(height, width) / min(height, width) > self.MAX_RATIO:
-                raise ValueError(
-                    f"absolute aspect ratio must be smaller than {self.MAX_RATIO}, got {max(height, width) / min(height, width)}"
-                )
-            h_bar = max(factor, round_by_factor(height, factor))
-            w_bar = max(factor, round_by_factor(width, factor))
-            if h_bar * w_bar > max_pixels:
-                beta = math.sqrt((height * width) / max_pixels)
-                h_bar = floor_by_factor(height / beta, factor)
-                w_bar = floor_by_factor(width / beta, factor)
-            elif h_bar * w_bar < min_pixels:
-                beta = math.sqrt(min_pixels / (height * width))
-                h_bar = ceil_by_factor(height * beta, factor)
-                w_bar = ceil_by_factor(width * beta, factor)
-            return h_bar, w_bar
-
-        def resize_image(image, size_factor: int = self.IMAGE_FACTOR) -> Image.Image:
-            width, height = image.size
-            min_pixels = self.MIN_PIXELS
-            max_pixels = self.MAX_PIXELS
-            resized_height, resized_width = smart_resize(
-                height,
-                width,
-                factor=size_factor,
-                min_pixels=min_pixels,
-                max_pixels=max_pixels,
-            )
-            image = image.resize((resized_width, resized_height))
-            return image
-
-        def round_by_factor(number: int, factor: int) -> int:
-            """Returns the closest integer to 'number' that is divisible by 'factor'."""
-            return round(number / factor) * factor
-
-        def ceil_by_factor(number: int, factor: int) -> int:
-            """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-            return math.ceil(number / factor) * factor
-
-        def floor_by_factor(number: int, factor: int) -> int:
-            """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-            return math.floor(number / factor) * factor
-
-        async def resize_image_async(image):
-            return resize_image(image)
-
         # Qwen-specific: resize images if they are raw Image objects
         if base_output.images and isinstance(base_output.images[0], Image.Image):
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
-
-
-
-
-        if not mm_items:
-            # Note(Xinyuan): This is the case where image loading fails.
-            return None
+        if base_output.videos:
+            base_output.videos = [
+                await preprocess_video(video) for video in base_output.videos
+            ]
 
-
-        video_grid_thw = None  # TODO
-        second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(base_output)
 
+        input_ids = input_ids.flatten()
         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
             spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
             image_token_id=self.IM_TOKEN_ID,
@@ -145,9 +268,9 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
             input_ids=input_ids.unsqueeze(0),
-            image_grid_thw=
-            video_grid_thw=video_grid_thw,
-            second_per_grid_ts=second_per_grid_ts,
+            image_grid_thw=getattr(ret, "image_grid_thw", None),
+            video_grid_thw=getattr(ret, "video_grid_thw", None),
+            second_per_grid_ts=getattr(ret, "second_per_grid_ts", None),
         )
         mrope_positions = mrope_positions.squeeze(1)
 
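Likewise, on its default path the frame-count logic in smart_nframes above reduces to: sample at roughly 2 frames per second, clamp to [4, 768] and to the clip length, then round down to a multiple of FRAME_FACTOR. A standalone sketch of just that path, with an illustrative 10-second, 30 fps clip (constants copied from the diff):

import math

FPS = 2.0             # target sampling rate, frames per second
FRAME_FACTOR = 2      # frame counts are rounded to multiples of this
FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 768

def floor_by_factor(number, factor):
    return math.floor(number / factor) * factor

def ceil_by_factor(number, factor):
    return math.ceil(number / factor) * factor

def default_nframes(total_frames, video_fps):
    # Default path only: no explicit `fps`/`nframes` in the request dict.
    min_frames = ceil_by_factor(FPS_MIN_FRAMES, FRAME_FACTOR)
    max_frames = floor_by_factor(min(FPS_MAX_FRAMES, total_frames), FRAME_FACTOR)
    nframes = total_frames / video_fps * FPS  # ~2 sampled frames per second
    nframes = min(max(nframes, min_frames), max_frames, total_frames)
    return floor_by_factor(nframes, FRAME_FACTOR)

# A 10 s clip at 30 fps has 300 frames; sampling at 2 fps keeps 20 of them.
print(default_nframes(total_frames=300, video_fps=30))  # 20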
sglang/srt/multimodal/processors/vila.py
CHANGED
@@ -57,7 +57,7 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),
sglang/srt/server_args.py
CHANGED
@@ -416,7 +416,7 @@ class ServerArgs:
         if self.enable_dp_lm_head:
             assert (
                 self.enable_dp_attention
-            ), "Please enable dp attention when setting
+            ), "Please enable dp attention when setting enable_dp_lm_head. "
 
         # DeepEP MoE
         if self.enable_deepep_moe:
@@ -1048,9 +1048,16 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=[
+            choices=[
+                "qwen25",
+                "mistral",
+                "llama3",
+                "deepseekv3",
+                "pythonic",
+                "kimi_k2",
+            ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and '
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and 'kimi_k2'.",
         )
 
         # Data parallelism
@@ -1614,7 +1621,7 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
-            choices=["mooncake", "nixl"],
+            choices=["mooncake", "nixl", "ascend"],
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
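The new values are ordinary CLI choices, so the added kimi_k2 parser (backed by the new sglang/srt/function_call/kimik2_detector.py) can be exercised with the standard launch command; the model path below is illustrative, not taken from this diff:

python -m sglang.launch_server \
    --model-path moonshotai/Kimi-K2-Instruct \
    --tool-call-parser kimi_k2

The new "ascend" choice for --disaggregation-transfer-backend works the same way, selecting the Ascend transfer engine added under sglang/srt/disaggregation/ascend/.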
sglang/srt/utils.py
CHANGED
@@ -197,7 +197,7 @@ def get_int_env_var(name: str, default: int = 0) -> int:
 
 
 def support_triton(backend: str) -> bool:
-    return backend not in ["torch_native", "intel_amx"]
+    return backend not in ["torch_native", "intel_amx", "ascend"]
 
 
 try:
@@ -728,33 +728,6 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarra
     return audio
 
 
-def encode_video(video_path, frame_count_limit=None):
-    # Lazy import because decord is not available on some arm platforms.
-    from decord import VideoReader, cpu
-
-    if not os.path.exists(video_path):
-        logger.error(f"Video {video_path} does not exist")
-        return []
-
-    if frame_count_limit == 0:
-        return []
-
-    def uniform_sample(l, n):
-        gap = len(l) / n
-        idxs = [int(i * gap + gap / 2) for i in range(n)]
-        return [l[i] for i in idxs]
-
-    vr = VideoReader(video_path, ctx=cpu(0))
-    sample_fps = round(vr.get_avg_fps() / 1)  # FPS
-    frame_indices = [i for i in range(0, len(vr), sample_fps)]
-    if frame_count_limit is not None and len(frame_indices) > frame_count_limit:
-        frame_indices = uniform_sample(frame_indices, frame_count_limit)
-
-    frames = vr.get_batch(frame_indices).asnumpy()
-    frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-    return frames
-
-
 def load_image(
     image_file: Union[Image.Image, str, bytes],
 ) -> tuple[Image.Image, tuple[int, int]]:
@@ -774,9 +747,6 @@ def load_image(
     elif image_file.startswith("data:"):
         image_file = image_file.split(",")[1]
         image = Image.open(BytesIO(pybase64.b64decode(image_file, validate=True)))
-    elif image_file.startswith("video:"):
-        image_file = image_file.replace("video:", "")
-        image, image_size = decode_video_base64(image_file)
     elif isinstance(image_file, str):
         image = Image.open(BytesIO(pybase64.b64decode(image_file, validate=True)))
     else:
@@ -785,6 +755,61 @@ def load_image(
     return image, image_size
 
 
+def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
+    # We import decord here to avoid a strange Segmentation fault (core dumped) issue.
+    from decord import VideoReader, cpu, gpu
+
+    try:
+        from decord.bridge import decord_bridge
+
+        ctx = gpu(0)
+        _ = decord_bridge.get_ctx_device(ctx)
+    except Exception:
+        ctx = cpu(0)
+
+    tmp_file = None
+    vr = None
+    try:
+        if isinstance(video_file, bytes):
+            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+            tmp_file.write(video_file)
+            tmp_file.close()
+            vr = VideoReader(tmp_file.name, ctx=ctx)
+        elif isinstance(video_file, str):
+            if video_file.startswith(("http://", "https://")):
+                timeout = int(os.getenv("REQUEST_TIMEOUT", "10"))
+                response = requests.get(video_file, stream=True, timeout=timeout)
+                response.raise_for_status()
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                for chunk in response.iter_content(chunk_size=8192):
+                    tmp_file.write(chunk)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+            elif video_file.startswith("data:"):
+                _, encoded = video_file.split(",", 1)
+                video_bytes = base64.b64decode(encoded)
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                tmp_file.write(video_bytes)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+            elif os.path.isfile(video_file):
+                vr = VideoReader(video_file, ctx=ctx)
+            else:
+                video_bytes = base64.b64decode(video_file)
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                tmp_file.write(video_bytes)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+        else:
+            raise ValueError(f"Unsupported video input type: {type(video_file)}")
+
+        return vr
+
+    finally:
+        if tmp_file and os.path.exists(tmp_file.name):
+            os.unlink(tmp_file.name)
+
+
 def suppress_other_loggers():
     warnings.filterwarnings(
         "ignore", category=UserWarning, message="The given NumPy array is not writable"
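A brief usage sketch for the new load_video helper above: it accepts a local path, an http(s) URL, a data: URI, or raw/base64-encoded bytes, and returns a decord VideoReader. The file path here is a placeholder, and decord must be installed:

from sglang.srt.utils import load_video

vr = load_video("/path/to/clip.mp4")        # placeholder path
print(len(vr), vr.get_avg_fps())            # total frame count and average fps
frames = vr.get_batch([0, 1, 2]).asnumpy()  # first three frames as a (3, H, W, C) uint8 array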
@@ -2757,3 +2782,101 @@ def lru_cache_frozenset(maxsize=128):
         return wrapper
 
     return decorator
+
+
+def apply_module_patch(target_module, target_function, wrappers):
+    original_module, original_function = parse_module_path(
+        target_module, target_function, False
+    )
+
+    original_function_id = id(original_function)
+
+    candidate = original_function
+    for wrapper in wrappers:
+        candidate = wrapper(candidate)
+    if target_function is not None:
+        setattr(original_module, target_function, candidate)
+
+    for key, value in sys.modules.copy().items():
+        if (
+            target_function is not None
+            and hasattr(value, target_function)
+            and id(getattr(value, target_function)) == original_function_id
+        ):
+            setattr(value, target_function, candidate)
+
+
+def parse_module_path(module_path, function_name, create_dummy):
+    from importlib.machinery import ModuleSpec
+
+    def create_dummy_module(full_path, parent=None):
+        """Create and register a placeholder module"""
+        dummy = types.ModuleType(full_path)
+        dummy.__file__ = "vllm_ascend.dummy_module.py"
+        dummy.__spec__ = ModuleSpec(full_path, None)
+        sys.modules[full_path] = dummy
+        if parent:
+            setattr(parent, full_path.split(".")[-1], dummy)
+        return dummy
+
+    def create_placeholder_function(func_name):
+        """Create dummy function that raises when called"""
+
+        def placeholder(*args, **kwargs):
+            raise NotImplementedError(f"Function {func_name} is a placeholder")
+
+        placeholder.__name__ = func_name
+        return placeholder
+
+    modules = module_path.split(".")
+    current_module = None
+    processed_path = []
+
+    for idx, part in enumerate(modules):
+        current_path = ".".join(modules[: idx + 1])
+        parent_path = ".".join(modules[:idx]) if idx > 0 else None
+
+        try:
+            current_module = importlib.import_module(current_path)
+        except ModuleNotFoundError:
+            # Handle missing module
+            parent = importlib.import_module(parent_path) if parent_path else None
+            if parent and hasattr(parent, part):
+                # Use existing attribute from parent
+                current_module = getattr(parent, part)
+                # Check for early function resolution
+                if function_name and hasattr(current_module, function_name):
+                    return current_module, getattr(current_module, function_name)
+                if function_name and create_dummy:
+                    ph_func = create_placeholder_function(function_name)
+                    setattr(current_module, function_name, ph_func)
+                    return current_module, ph_func
+                if function_name:
+                    raise AttributeError(
+                        f"Function {function_name} missing in {current_path}"
+                    )
+            else:
+                if not create_dummy:
+                    raise
+                # Create and register dummy module
+                current_module = create_dummy_module(
+                    current_path,
+                    parent=(
+                        importlib.import_module(parent_path) if parent_path else None
+                    ),
+                )
+
+        processed_path.append(part)
+
+    # Final function handling
+    final_module = sys.modules[module_path]
+    if function_name is not None:
+        if not hasattr(final_module, function_name):
+            if create_dummy:
+                ph_func = create_placeholder_function(function_name)
+                setattr(final_module, function_name, ph_func)
+            else:
+                setattr(final_module, function_name, None)
+        return final_module, getattr(final_module, function_name)
+
+    return final_module, None
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.9.post1"
+__version__ = "0.4.9.post2"
{sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.9.post1
+Version: 0.4.9.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -242,6 +242,7 @@ Requires-Dist: pynvml; extra == "runtime-common"
 Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
+Requires-Dist: sentencepiece; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
@@ -249,10 +250,10 @@ Requires-Dist: transformers==4.53.0; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.21; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.2.
+Requires-Dist: sgl-kernel==0.2.5; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"