sglang 0.4.9__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.
- sglang/bench_serving.py +2 -2
- sglang/srt/configs/model_config.py +36 -2
- sglang/srt/conversation.py +56 -3
- sglang/srt/disaggregation/ascend/__init__.py +6 -0
- sglang/srt/disaggregation/ascend/conn.py +44 -0
- sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
- sglang/srt/disaggregation/mooncake/conn.py +50 -18
- sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
- sglang/srt/disaggregation/utils.py +25 -3
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/http_server.py +1 -0
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +11 -0
- sglang/srt/entrypoints/openai/serving_chat.py +7 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/kimik2_detector.py +220 -0
- sglang/srt/hf_transformers_utils.py +18 -0
- sglang/srt/jinja_template_utils.py +8 -0
- sglang/srt/layers/communicator.py +20 -5
- sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
- sglang/srt/layers/layernorm.py +2 -2
- sglang/srt/layers/linear.py +12 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -1
- sglang/srt/layers/moe/ep_moe/layer.py +141 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +141 -59
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/moe/topk.py +8 -2
- sglang/srt/layers/parameter.py +19 -3
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/fp8.py +28 -7
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -2
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/quantization/w8a8_int8.py +738 -14
- sglang/srt/layers/vocab_parallel_embedding.py +9 -3
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/io_struct.py +35 -3
- sglang/srt/managers/mm_utils.py +59 -96
- sglang/srt/managers/schedule_batch.py +17 -6
- sglang/srt/managers/scheduler.py +38 -6
- sglang/srt/managers/tokenizer_manager.py +16 -0
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +176 -101
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/model_executor/forward_batch_info.py +13 -1
- sglang/srt/model_loader/loader.py +23 -12
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +78 -19
- sglang/srt/models/deepseek_vl2.py +1 -1
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +6 -3
- sglang/srt/models/internvl.py +8 -2
- sglang/srt/models/kimi_vl.py +8 -2
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llava.py +3 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpmo.py +1 -2
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral_quant.py +4 -0
- sglang/srt/models/mllama4.py +372 -82
- sglang/srt/models/phi4mm.py +8 -2
- sglang/srt/models/phimoe.py +553 -0
- sglang/srt/models/qwen2.py +2 -0
- sglang/srt/models/qwen2_5_vl.py +10 -7
- sglang/srt/models/qwen2_vl.py +12 -1
- sglang/srt/models/vila.py +8 -2
- sglang/srt/multimodal/mm_utils.py +2 -2
- sglang/srt/multimodal/processors/base_processor.py +197 -137
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
- sglang/srt/multimodal/processors/gemma3.py +4 -2
- sglang/srt/multimodal/processors/gemma3n.py +1 -1
- sglang/srt/multimodal/processors/internvl.py +1 -1
- sglang/srt/multimodal/processors/janus_pro.py +1 -1
- sglang/srt/multimodal/processors/kimi_vl.py +1 -1
- sglang/srt/multimodal/processors/minicpm.py +4 -3
- sglang/srt/multimodal/processors/mllama4.py +63 -61
- sglang/srt/multimodal/processors/phi4mm.py +1 -1
- sglang/srt/multimodal/processors/pixtral.py +1 -1
- sglang/srt/multimodal/processors/qwen_vl.py +203 -80
- sglang/srt/multimodal/processors/vila.py +1 -1
- sglang/srt/server_args.py +26 -4
- sglang/srt/two_batch_overlap.py +3 -0
- sglang/srt/utils.py +191 -48
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +6 -4
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +99 -90
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0
@@ -60,70 +60,72 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         )
 
         # Handle image resolutions and aspect ratios
-        if "pixel_values" in processor_output:
-
-            tokenizer = self._processor.tokenizer
+        if "pixel_values" not in processor_output:  # no image processed
+            return None
 
-
-
-            max_num_tiles = getattr(self.vision_config, "max_patches", 1)
+        image_processor = processor.image_processor
+        tokenizer = self._processor.tokenizer
 
-
-
-
+        # Calculate tile size and find supported resolutions
+        tile_size = self.vision_config.image_size
+        max_num_tiles = getattr(self.vision_config, "max_patches", 1)
+
+        possible_resolutions = find_supported_resolutions(
+            max_num_chunks=max_num_tiles,
+            patch_size=SizeDict(height=tile_size, width=tile_size),
+        )
+
+        # Find best fit for each image
+        best_fit_sizes = [
+            get_best_fit(
+                (image.size[1], image.size[0]),  # (height, width)
+                torch.tensor(possible_resolutions),
+                resize_to_max_canvas=image_processor.resize_to_max_canvas,
             )
+            for image in processed_data.images
+        ]
+
+        # Calculate aspect ratios and patches per image
+        aspect_ratios = [
+            (image_size[0] // tile_size, image_size[1] // tile_size)
+            for image_size in best_fit_sizes
+        ]
+
+        patches_per_image = [
+            1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
+        ]
+
+        # Add to image_inputs
+        processor_output["aspect_ratios"] = aspect_ratios
+        processor_output["patches_per_image"] = torch.tensor(patches_per_image)
+
+        # Process embed_is_patch
+        vocab = tokenizer.get_vocab()
+        patch_id = vocab.get(processor.img_patch_token, -1)
+        image_end_id = vocab.get(processor.end_of_img_token, -1)
+
+        if patch_id != -1 and image_end_id != -1:
+            input_ids = processor_output["input_ids"].view(-1)
+
+            # Remove BOS token if present
+            if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
+                input_ids = input_ids[1:]
+
+            # Find image end indices and split input_ids
+            image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
+
+            if image_end_indices.size(0) > 0:
+                # Split at image boundaries
+                split_indices = (image_end_indices + 1)[:-1]
+                split_input_ids = torch.tensor_split(input_ids, split_indices)
+                split_input_ids = [x for x in split_input_ids if x.numel() > 0]
+
+                # Create embed_is_patch for each image
+                embed_is_patch = []
+                for per_image_input_ids in split_input_ids:
+                    embed_is_patch.append(per_image_input_ids == patch_id)
 
-
-            best_fit_sizes = [
-                get_best_fit(
-                    (image.size[1], image.size[0]),  # (height, width)
-                    torch.tensor(possible_resolutions),
-                    resize_to_max_canvas=image_processor.resize_to_max_canvas,
-                )
-                for image in processed_data.images
-            ]
-
-            # Calculate aspect ratios and patches per image
-            aspect_ratios = [
-                (image_size[0] // tile_size, image_size[1] // tile_size)
-                for image_size in best_fit_sizes
-            ]
-
-            patches_per_image = [
-                1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
-            ]
-
-            # Add to image_inputs
-            processor_output["aspect_ratios"] = aspect_ratios
-            processor_output["patches_per_image"] = torch.tensor(patches_per_image)
-
-            # Process embed_is_patch
-            vocab = tokenizer.get_vocab()
-            patch_id = vocab.get(processor.img_patch_token, -1)
-            image_end_id = vocab.get(processor.end_of_img_token, -1)
-
-            if patch_id != -1 and image_end_id != -1:
-                input_ids = processor_output["input_ids"].view(-1)
-
-                # Remove BOS token if present
-                if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
-                    input_ids = input_ids[1:]
-
-                # Find image end indices and split input_ids
-                image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
-
-                if image_end_indices.size(0) > 0:
-                    # Split at image boundaries
-                    split_indices = (image_end_indices + 1)[:-1]
-                    split_input_ids = torch.tensor_split(input_ids, split_indices)
-                    split_input_ids = [x for x in split_input_ids if x.numel() > 0]
-
-                    # Create embed_is_patch for each image
-                    embed_is_patch = []
-                    for per_image_input_ids in split_input_ids:
-                        embed_is_patch.append(per_image_input_ids == patch_id)
-
-                    processor_output["embed_is_patch"] = embed_is_patch
+                processor_output["embed_is_patch"] = embed_is_patch
 
         # Convert to the format expected by SGLang
         processor_output["input_ids"] = processor_output["input_ids"].tolist()[0]
@@ -142,7 +144,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
             MultimodalDataItem(
                 pixel_values=processor_output["pixel_values"],
                 modality=Modality.IMAGE,
-
+                offsets=image_offsets,
             )
         ]
 
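The embed_is_patch construction above splits the flattened prompt at end-of-image tokens and marks which positions hold image patches. The following standalone Python sketch shows the same splitting trick in isolation; the token IDs are made up for illustration and are not taken from the package.

import torch

# Hypothetical token ids for illustration: 7 = image patch, 9 = end-of-image.
input_ids = torch.tensor([1, 7, 7, 9, 2, 7, 7, 7, 9, 3])
patch_id, image_end_id = 7, 9

# Positions of end-of-image tokens; split *after* each one (drop the last cut point).
image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
split_indices = (image_end_indices + 1)[:-1]
chunks = torch.tensor_split(input_ids, split_indices)

# One boolean mask per image: True where the token is an image patch.
embed_is_patch = [chunk == patch_id for chunk in chunks if chunk.numel() > 0]
print(embed_is_patch)
# Two masks: [False, True, True, False] and [False, True, True, True, False, False]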
sglang/srt/multimodal/processors/phi4mm.py
CHANGED
@@ -65,7 +65,7 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
                 pixel_values=res["input_image_embeds"],
                 image_sizes=res["image_sizes"],
                 image_emb_mask=res["image_attention_mask"],
-
+                offsets=image_offsets,
                 modality=Modality.IMAGE,
             )
         ]
sglang/srt/multimodal/processors/pixtral.py
CHANGED
@@ -106,7 +106,7 @@ class PixtralProcessor(BaseMultimodalProcessor):
                 pixel_values=processor_output["pixel_values"],
                 image_sizes=processor_output["image_sizes"],
                 modality=Modality.IMAGE,
-
+                offsets=image_offsets,
             )
         ]
 
sglang/srt/multimodal/processors/qwen_vl.py
CHANGED
@@ -1,9 +1,13 @@
 import asyncio
 import math
+import os
 import re
-from typing import
+from typing import List, Union
 
+import torch
+import torchvision
 from PIL import Image
+from torchvision.transforms import InterpolationMode
 
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
@@ -12,6 +16,185 @@ from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
 from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+from sglang.utils import logger
+
+IMAGE_FACTOR = 28
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+VIDEO_TOTAL_PIXELS = int(
+    float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
+)
+
+VIDEO_MIN_PIXELS = 128 * 28 * 28
+VIDEO_MAX_PIXELS = 768 * 28 * 28
+FRAME_FACTOR = 2
+FPS = 2.0
+FPS_MIN_FRAMES = 4
+FPS_MAX_FRAMES = 768
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
+) -> tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return h_bar, w_bar
+
+
+def resize_image(image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
+    width, height = image.size
+    min_pixels = MIN_PIXELS
+    max_pixels = MAX_PIXELS
+    resized_height, resized_width = smart_resize(
+        height,
+        width,
+        factor=size_factor,
+        min_pixels=min_pixels,
+        max_pixels=max_pixels,
+    )
+    image = image.resize((resized_width, resized_height))
+    return image
+
+
+def round_by_factor(number: int, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+
+async def resize_image_async(image):
+    return resize_image(image)
+
+
+def smart_nframes(
+    ele: dict,
+    total_frames: int,
+    video_fps: int | float,
+) -> int:
+    """calculate the number of frames for video used for model inputs.
+
+    Args:
+        ele (dict): a dict contains the configuration of video.
+            support either `fps` or `nframes`:
+                - nframes: the number of frames to extract for model inputs.
+                - fps: the fps to extract frames for model inputs.
+                    - min_frames: the minimum number of frames of the video, only used when fps is provided.
+                    - max_frames: the maximum number of frames of the video, only used when fps is provided.
+        total_frames (int): the original total number of frames of the video.
+        video_fps (int | float): the original fps of the video.
+
+    Raises:
+        ValueError: nframes should in interval [FRAME_FACTOR, total_frames].
+
+    Returns:
+        int: the number of frames for video used for model inputs.
+    """
+    assert not (
+        "fps" in ele and "nframes" in ele
+    ), "Only accept either `fps` or `nframes`"
+    if "nframes" in ele:
+        nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
+    else:
+        fps = ele.get("fps", FPS)
+        min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
+        max_frames = floor_by_factor(
+            ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR
+        )
+        nframes = total_frames / video_fps * fps
+        if nframes > total_frames:
+            logger.warning(
+                f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]"
+            )
+        nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
+        nframes = floor_by_factor(nframes, FRAME_FACTOR)
+    if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
+        raise ValueError(
+            f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}."
+        )
+    return nframes
+
+
+# process video, qwen-specific
+async def preprocess_video(
+    vr,
+    image_factor: int = IMAGE_FACTOR,
+    # vr: VideoReader, image_factor: int = IMAGE_FACTOR
+) -> torch.Tensor:
+    ele = {}
+    total_frames, video_fps = len(vr), vr.get_avg_fps()
+    nframes = smart_nframes({}, total_frames=total_frames, video_fps=video_fps)
+    idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
+    video = vr.get_batch(idx).asnumpy()
+    video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
+    nframes, _, height, width = video.shape
+    min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
+    total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
+    max_pixels = max(
+        min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR),
+        int(min_pixels * 1.05),
+    )
+    max_pixels_supposed = ele.get("max_pixels", max_pixels)
+    if max_pixels_supposed > max_pixels:
+        logger.warning(
+            f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}]."
+        )
+    max_pixels = min(max_pixels_supposed, max_pixels)
+    if "resized_height" in ele and "resized_width" in ele:
+        resized_height, resized_width = smart_resize(
+            ele["resized_height"],
+            ele["resized_width"],
+            factor=image_factor,
+        )
+    else:
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=image_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+    video = torchvision.transforms.functional.resize(
+        video,
+        [resized_height, resized_width],
+        interpolation=InterpolationMode.BICUBIC,
+        antialias=True,
+    ).float()
+    return video
 
 
 # Compatible with Qwen2VL and Qwen2_5VL
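As a quick illustration of the resizing rule introduced above (not part of the diff), the following self-contained Python sketch re-states the helpers and rounds a 1080x1920 frame to the nearest 28-pixel multiples; the aspect-ratio guard is omitted for brevity.

import math

FACTOR = 28                      # IMAGE_FACTOR in the hunk above
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28

def round_by_factor(n, f):
    return round(n / f) * f

def floor_by_factor(n, f):
    return math.floor(n / f) * f

def ceil_by_factor(n, f):
    return math.ceil(n / f) * f

def smart_resize(height, width, factor=FACTOR,
                 min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS):
    # Round each side to a multiple of `factor`, then rescale if the
    # total pixel count falls outside [min_pixels, max_pixels].
    h = max(factor, round_by_factor(height, factor))
    w = max(factor, round_by_factor(width, factor))
    if h * w > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h = floor_by_factor(height / beta, factor)
        w = floor_by_factor(width / beta, factor)
    elif h * w < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h = ceil_by_factor(height * beta, factor)
        w = ceil_by_factor(width * beta, factor)
    return h, w

print(smart_resize(1080, 1920))  # (1092, 1932): both sides divisible by 28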
@@ -37,104 +220,44 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.MIN_PIXELS = 4 * 28 * 28
         self.MAX_PIXELS = 16384 * 28 * 28
         self.MAX_RATIO = 200
+        # TODO(mick): move all MultimodalSpecialTokens initializations into processor init
+        self.mm_special_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_regex=self.IMAGE_TOKEN_REGEX,
+            video_token=self.VIDEO_TOKEN_ID,
+        )
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes
+        image_data: List[Union[str, bytes]],
         input_text,
         request_obj,
         max_req_input_len,
         *args,
         **kwargs,
     ):
+
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-
-
-            image_token_regex=self.IMAGE_TOKEN_REGEX,
-            ),
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_special_tokens,
             max_req_input_len=max_req_input_len,
         )
 
-        def smart_resize(
-            height: int,
-            width: int,
-            factor: int = self.IMAGE_FACTOR,
-            min_pixels: int = self.MIN_PIXELS,
-            max_pixels: int = self.MAX_PIXELS,
-        ) -> tuple[int, int]:
-            """
-            Rescales the image so that the following conditions are met:
-
-            1. Both dimensions (height and width) are divisible by 'factor'.
-
-            2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
-            3. The aspect ratio of the image is maintained as closely as possible.
-            """
-            if max(height, width) / min(height, width) > self.MAX_RATIO:
-                raise ValueError(
-                    f"absolute aspect ratio must be smaller than {self.MAX_RATIO}, got {max(height, width) / min(height, width)}"
-                )
-            h_bar = max(factor, round_by_factor(height, factor))
-            w_bar = max(factor, round_by_factor(width, factor))
-            if h_bar * w_bar > max_pixels:
-                beta = math.sqrt((height * width) / max_pixels)
-                h_bar = floor_by_factor(height / beta, factor)
-                w_bar = floor_by_factor(width / beta, factor)
-            elif h_bar * w_bar < min_pixels:
-                beta = math.sqrt(min_pixels / (height * width))
-                h_bar = ceil_by_factor(height * beta, factor)
-                w_bar = ceil_by_factor(width * beta, factor)
-            return h_bar, w_bar
-
-        def resize_image(image, size_factor: int = self.IMAGE_FACTOR) -> Image.Image:
-            width, height = image.size
-            min_pixels = self.MIN_PIXELS
-            max_pixels = self.MAX_PIXELS
-            resized_height, resized_width = smart_resize(
-                height,
-                width,
-                factor=size_factor,
-                min_pixels=min_pixels,
-                max_pixels=max_pixels,
-            )
-            image = image.resize((resized_width, resized_height))
-            return image
-
-        def round_by_factor(number: int, factor: int) -> int:
-            """Returns the closest integer to 'number' that is divisible by 'factor'."""
-            return round(number / factor) * factor
-
-        def ceil_by_factor(number: int, factor: int) -> int:
-            """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-            return math.ceil(number / factor) * factor
-
-        def floor_by_factor(number: int, factor: int) -> int:
-            """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-            return math.floor(number / factor) * factor
-
-        async def resize_image_async(image):
-            return resize_image(image)
-
         # Qwen-specific: resize images if they are raw Image objects
         if base_output.images and isinstance(base_output.images[0], Image.Image):
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
-
-
-
-
-        if not mm_items:
-            # Note(Xinyuan): This is the case where image loading fails.
-            return None
+        if base_output.videos:
+            base_output.videos = [
+                await preprocess_video(video) for video in base_output.videos
+            ]
 
-
-        video_grid_thw = None  # TODO
-        second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(base_output)
 
+        input_ids = input_ids.flatten()
         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
             spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
             image_token_id=self.IM_TOKEN_ID,
@@ -145,9 +268,9 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
             input_ids=input_ids.unsqueeze(0),
-            image_grid_thw=
-            video_grid_thw=video_grid_thw,
-            second_per_grid_ts=second_per_grid_ts,
+            image_grid_thw=getattr(ret, "image_grid_thw", None),
+            video_grid_thw=getattr(ret, "video_grid_thw", None),
+            second_per_grid_ts=getattr(ret, "second_per_grid_ts", None),
         )
         mrope_positions = mrope_positions.squeeze(1)
 
sglang/srt/multimodal/processors/vila.py
CHANGED
@@ -57,7 +57,7 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),
sglang/srt/server_args.py
CHANGED
@@ -217,11 +217,13 @@ class ServerArgs:
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
+    hicache_io_backend: str = ""
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
+    enable_triton_kernel_moe: bool = False
     warmups: Optional[str] = None
 
     # Debug tensor dumps
@@ -414,7 +416,7 @@
         if self.enable_dp_lm_head:
             assert (
                 self.enable_dp_attention
-            ), "Please enable dp attention when setting
+            ), "Please enable dp attention when setting enable_dp_lm_head. "
 
         # DeepEP MoE
         if self.enable_deepep_moe:
@@ -706,6 +708,7 @@
                 "w8a8_fp8",
                 "moe_wna16",
                 "qoq",
+                "w4afp8",
             ],
             help="The quantization method.",
         )
@@ -1045,9 +1048,16 @@
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=[
+            choices=[
+                "qwen25",
+                "mistral",
+                "llama3",
+                "deepseekv3",
+                "pythonic",
+                "kimi_k2",
+            ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and '
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and 'kimi_k2'.",
         )
 
         # Data parallelism
@@ -1529,6 +1539,13 @@
             default=ServerArgs.hicache_write_policy,
             help="The write policy of hierarchical cache.",
         )
+        parser.add_argument(
+            "--hicache-io-backend",
+            type=str,
+            choices=["direct", "kernel"],
+            default=ServerArgs.hicache_io_backend,
+            help="The IO backend for KV cache transfer between CPU and GPU",
+        )
         parser.add_argument(
             "--flashinfer-mla-disable-ragged",
             action="store_true",
@@ -1554,6 +1571,11 @@
             action="store_true",
             help="Enable returning hidden states with responses.",
        )
+        parser.add_argument(
+            "--enable-triton-kernel-moe",
+            action="store_true",
+            help="Use triton moe grouped gemm kernel.",
+        )
         parser.add_argument(
             "--warmups",
             type=str,
@@ -1599,7 +1621,7 @@
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
-            choices=["mooncake", "nixl"],
+            choices=["mooncake", "nixl", "ascend"],
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
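A minimal Python sketch (not part of the diff) of how the new server flags parse. It assumes ServerArgs.add_cli_args keeps its existing argparse-based signature, uses a placeholder model path, and only exercises argument parsing rather than server startup.

import argparse

from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)

# Flag names are taken from the hunks above; values are illustrative.
args = parser.parse_args(
    [
        "--model-path", "dummy/model",
        "--hicache-io-backend", "kernel",               # new in this release
        "--enable-triton-kernel-moe",                   # new in this release
        "--tool-call-parser", "kimi_k2",                # new parser choice
        "--quantization", "w4afp8",                     # new quantization choice
        "--disaggregation-transfer-backend", "ascend",  # new transfer backend
    ]
)
print(args.hicache_io_backend, args.enable_triton_kernel_moe, args.tool_call_parser)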
sglang/srt/two_batch_overlap.py
CHANGED
@@ -490,6 +490,7 @@ class TboForwardBatchPreparer:
         output_dict["spec_info"] = output_spec_info
         for key in [
             "forward_mode",
+            "is_extend_in_batch",
             "return_logprob",
             "req_to_token_pool",
             "token_to_kv_pool",
@@ -550,6 +551,8 @@ class TboForwardBatchPreparer:
                 top_p_normalized_logprobs=False,
                 top_p=None,
                 mm_inputs=None,
+                top_logprobs_nums=None,
+                token_ids_logprobs=None,
             )
         )
 