sglang 0.4.4.post3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +49 -7
- sglang/lang/chat_template.py +24 -0
- sglang/srt/_custom_ops.py +59 -92
- sglang/srt/configs/model_config.py +5 -0
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/conversation.py +29 -4
- sglang/srt/custom_op.py +5 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +27 -79
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/entrypoints/engine.py +0 -5
- sglang/srt/layers/attention/flashattention_backend.py +678 -83
- sglang/srt/layers/attention/flashinfer_backend.py +5 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
- sglang/srt/layers/moe/ep_moe/layer.py +79 -80
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
- sglang/srt/layers/moe/fused_moe_native.py +5 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +416 -50
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/topk.py +49 -3
- sglang/srt/layers/quantization/__init__.py +5 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
- sglang/srt/layers/quantization/fp8.py +3 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -4
- sglang/srt/layers/quantization/moe_wna16.py +503 -0
- sglang/srt/layers/quantization/utils.py +1 -1
- sglang/srt/layers/quantization/w8a8_int8.py +2 -0
- sglang/srt/layers/radix_attention.py +2 -0
- sglang/srt/layers/rotary_embedding.py +63 -12
- sglang/srt/managers/cache_controller.py +34 -11
- sglang/srt/managers/mm_utils.py +202 -156
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
- sglang/srt/managers/multimodal_processors/clip.py +7 -26
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
- sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
- sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
- sglang/srt/managers/multimodal_processors/llava.py +34 -14
- sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
- sglang/srt/managers/multimodal_processors/mlama.py +10 -23
- sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
- sglang/srt/managers/schedule_batch.py +185 -128
- sglang/srt/managers/scheduler.py +4 -4
- sglang/srt/managers/tokenizer_manager.py +1 -1
- sglang/srt/managers/utils.py +1 -6
- sglang/srt/mem_cache/hiradix_cache.py +62 -52
- sglang/srt/mem_cache/memory_pool.py +72 -6
- sglang/srt/mem_cache/paged_allocator.py +39 -0
- sglang/srt/metrics/collector.py +23 -53
- sglang/srt/model_executor/cuda_graph_runner.py +8 -6
- sglang/srt/model_executor/forward_batch_info.py +10 -10
- sglang/srt/model_executor/model_runner.py +60 -57
- sglang/srt/model_loader/loader.py +8 -0
- sglang/srt/models/clip.py +12 -7
- sglang/srt/models/deepseek_janus_pro.py +10 -15
- sglang/srt/models/deepseek_v2.py +212 -121
- sglang/srt/models/deepseek_vl2.py +105 -104
- sglang/srt/models/gemma3_mm.py +14 -80
- sglang/srt/models/llama.py +16 -5
- sglang/srt/models/llama4.py +420 -0
- sglang/srt/models/llava.py +31 -19
- sglang/srt/models/llavavid.py +16 -7
- sglang/srt/models/minicpmo.py +63 -147
- sglang/srt/models/minicpmv.py +17 -27
- sglang/srt/models/mllama.py +29 -14
- sglang/srt/models/mllama4.py +154 -0
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_5_vl.py +21 -31
- sglang/srt/models/qwen2_vl.py +20 -21
- sglang/srt/openai_api/adapter.py +18 -6
- sglang/srt/platforms/interface.py +371 -0
- sglang/srt/server_args.py +99 -14
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
- sglang/srt/speculative/eagle_utils.py +140 -28
- sglang/srt/speculative/eagle_worker.py +93 -24
- sglang/srt/utils.py +104 -51
- sglang/test/test_custom_ops.py +55 -0
- sglang/test/test_utils.py +13 -26
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/METADATA +4 -3
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/RECORD +99 -84
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/top_level.txt +0 -0
--- a/sglang/srt/managers/multimodal_processors/minicpm.py
+++ b/sglang/srt/managers/multimodal_processors/minicpm.py
@@ -1,13 +1,13 @@
-import asyncio
 from typing import List, Union
 
 import torch
+from transformers import BaseImageProcessorFast
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.minicpmo import MiniCPMO
 from sglang.srt.models.minicpmv import MiniCPMV
 
@@ -21,19 +21,23 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         self.image_token = "(<image>./</image>)"
         self.audio_token = "(<audio>./</audio>)"
 
-    @staticmethod
-    def _process_data_task(input_text, images=None, audios=None):
+    def process_data_task(self, input_text, images=None, audios=None):
 
         if isinstance(images, list) and len(images) == 0:
             images = None
         if isinstance(audios, list) and len(audios) == 0:
             audios = None
-        result = get_global_processor().__call__(
+        processor = self._processor
+        args = {}
+        if isinstance(processor, BaseImageProcessorFast):
+            args["device"] = "cuda"
+        result = self._processor.__call__(
             text=input_text,
             images=images,
             audios=audios,
             return_tensors="pt",
             chunk_input=True,
+            **args,
         )
         return {
             "input_ids": result.input_ids,
@@ -44,23 +48,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             "audio_bounds": getattr(result, "audio_bounds", None),
         }
 
-    async def _process_data(self, images, input_text, audios=None):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            multimodal_data_inputs = await loop.run_in_executor(
-                self.executor,
-                MiniCPMMultimodalProcessor._process_data_task,
-                input_text,
-                images,
-                audios,
-            )
-        else:
-            multimodal_data_inputs = self._processor(
-                images=images, text=input_text, audios=audios, return_tensors="pt"
-            )
-
-        return multimodal_data_inputs
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
@@ -77,7 +64,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             audio_data = [audio_data]
 
         base_output = self.load_mm_data(
-
+            prompt=input_ids,
             max_req_input_len=max_req_input_len,
             audio_data=audio_data,
             image_data=image_data,
@@ -88,9 +75,9 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         if base_output is None:
             return None
 
-        res = await self._process_data(
-            images=base_output.images,
+        res = self.process_mm_data(
             input_text=base_output.input_text,
+            images=base_output.images,
             audios=base_output.audios,
         )
 
@@ -142,23 +129,33 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
                 tgt_sizes_flat += [tgt_n]
 
         pixel_values = pixel_values_flat
-
-
-
-
-
-
+
+        items = []
+        if len(pixel_values) != 0:
+            item = MultimodalDataItem(
+                pixel_values=pixel_values,
+                tgt_size=tgt_sizes_flat,
+                modality=Modality.IMAGE,
+            )
+            items += [item]
+
+        if (
+            "audio_features" in res
+            and res["audio_features"] is not None
+            and len(res["audio_features"]) != 0
+        ):
+            item = MultimodalDataItem(
+                audio_features=[res["audio_features"]],
+                audio_feature_lens=res["audio_feature_lens"],
+                modality=Modality.AUDIO,
+            )
+            items += [item]
+
         return {
+            "mm_items": items,
             "input_ids": res["input_ids"].flatten().tolist(),
-            "pixel_values": pixel_values,
-            "tgt_sizes": tgt_sizes,
-            "data_hashes": base_output.mm_data_hashes,
-            "modalities": request_obj.modalities or ["image"],
             "audio_start_id": audio_start_id,
             "audio_end_id": audio_end_id,
-            "audio_features": res["audio_features"],
-            "audio_bounds": res["audio_bounds"],
-            "audio_feature_lens": res["audio_feature_lens"],
             "im_token_id": im_token_id,
             "im_start_id": tokenizer.im_start_id,
             "im_end_id": tokenizer.im_end_id,
--- a/sglang/srt/managers/multimodal_processors/mlama.py
+++ b/sglang/srt/managers/multimodal_processors/mlama.py
@@ -1,10 +1,9 @@
-import asyncio
 from typing import List, Union
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
 from sglang.srt.utils import load_image
 
@@ -15,25 +14,6 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
 
-    @staticmethod
-    def _process_single_image_task(images, input_text):
-        # input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'
-        return get_global_processor()(images, input_text, return_tensors="pt")
-
-    async def _process_single_image(self, images, input_text):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                MllamaImageProcessor._process_single_image_task,
-                images,
-                input_text,
-            )
-        else:
-            image_inputs = self._processor(images, input_text, return_tensors="pt")
-
-        return image_inputs
-
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
@@ -52,8 +32,15 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
         else:
             images = load_image(image_data[0])[0]
 
-        image_inputs = await self._process_single_image(images, input_text)
-        image_inputs["data_hashes"] = [hash(str(image_data))]
+        image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+        image_inputs["mm_items"] = [
+            MultimodalDataItem(
+                pixel_values=image_inputs["pixel_values"],
+                aspect_ratio_id=image_inputs["aspect_ratio_ids"],
+                aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
+                modality=Modality.IMAGE,
+            )
+        ]
 
         return image_inputs
--- /dev/null
+++ b/sglang/srt/managers/multimodal_processors/mllama4.py
@@ -0,0 +1,161 @@
+from typing import List, Mapping, Optional, Tuple, Union
+
+import torch
+from PIL import Image
+from transformers import Llama4Processor
+from transformers.image_utils import SizeDict
+from transformers.models.llama4.image_processing_llama4 import (
+    find_supported_resolutions,
+    get_best_fit,
+)
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
+from sglang.srt.utils import load_image
+
+
+class Mllama4ImageProcessor(BaseMultimodalProcessor):
+    models = [Llama4ForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.vision_config = hf_config.vision_config
+        self.text_config = hf_config.text_config
+        self.multimodal_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token
+        )
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        max_req_input_len=None,
+        *args,
+        **kwargs,
+    ):
+        if not image_data:
+            return None
+
+        if isinstance(input_text, list):
+            assert len(input_text) and isinstance(input_text[0], int)
+            input_text = self._processor.tokenizer.decode(input_text)
+
+        # Process images and text using the base processor's load_mm_data method
+        processed_data = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=self.multimodal_tokens,
+            max_req_input_len=max_req_input_len or 4096,
+            image_data=image_data,
+            return_text=True,
+        )
+
+        # Process the images using the processor
+        processor = Llama4Processor.from_pretrained(
+            self.server_args.model_path, **kwargs
+        )
+
+        # Process the prompt and images
+        image_inputs = processor(
+            text=processed_data.input_text,
+            images=processed_data.images,
+            return_tensors="pt",
+        )
+
+        # Handle image resolutions and aspect ratios
+        if "pixel_values" in image_inputs:
+            image_processor = processor.image_processor
+            tokenizer = self._processor.tokenizer
+
+            # Calculate tile size and find supported resolutions
+            tile_size = self.vision_config.image_size
+            max_num_tiles = getattr(self.vision_config, "max_patches", 1)
+
+            possible_resolutions = find_supported_resolutions(
+                max_num_chunks=max_num_tiles,
+                patch_size=SizeDict(height=tile_size, width=tile_size),
+            )
+
+            # Find best fit for each image
+            best_fit_sizes = [
+                get_best_fit(
+                    (image.size[1], image.size[0]),  # (height, width)
+                    torch.tensor(possible_resolutions),
+                    resize_to_max_canvas=image_processor.resize_to_max_canvas,
+                )
+                for image in processed_data.images
+            ]
+
+            # Calculate aspect ratios and patches per image
+            aspect_ratios = [
+                (image_size[0] // tile_size, image_size[1] // tile_size)
+                for image_size in best_fit_sizes
+            ]
+
+            patches_per_image = [
+                1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
+            ]
+
+            # Add to image_inputs
+            image_inputs["aspect_ratios"] = aspect_ratios
+            image_inputs["patches_per_image"] = torch.tensor(patches_per_image)
+
+            # Process embed_is_patch
+            vocab = tokenizer.get_vocab()
+            patch_id = vocab.get(processor.img_patch_token, -1)
+            image_end_id = vocab.get(processor.end_of_img_token, -1)
+
+            if patch_id != -1 and image_end_id != -1:
+                input_ids = image_inputs["input_ids"].view(-1)
+
+                # Remove BOS token if present
+                if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
+                    input_ids = input_ids[1:]
+
+                # Find image end indices and split input_ids
+                image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
+
+                if image_end_indices.size(0) > 0:
+                    # Split at image boundaries
+                    split_indices = (image_end_indices + 1)[:-1]
+                    split_input_ids = torch.tensor_split(input_ids, split_indices)
+                    split_input_ids = [x for x in split_input_ids if x.numel() > 0]
+
+                    # Create embed_is_patch for each image
+                    embed_is_patch = []
+                    for per_image_input_ids in split_input_ids:
+                        embed_is_patch.append(per_image_input_ids == patch_id)
+
+                    image_inputs["embed_is_patch"] = embed_is_patch
+
+        # Convert to the format expected by SGLang
+        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+
+        # Add metadata for image processing
+        image_inputs["mm_items"] = [
+            MultimodalDataItem(
+                pixel_values=image_inputs["pixel_values"],
+                modality=Modality.IMAGE,
+                # Add additional metadata needed for Llama4 vision processing
+                embed_is_patch=image_inputs.get("embed_is_patch", None),
+                aspect_ratios=image_inputs.get("aspect_ratios", None),
+                patches_per_image=image_inputs.get("patches_per_image", None),
+            )
+        ]
+
+        return image_inputs
+
+    def get_patch_per_chunk(self):
+        """Calculate patches per chunk based on vision config"""
+        image_size = self.vision_config.image_size
+        patch_size = self.vision_config.patch_size
+
+        assert (
+            image_size % patch_size == 0
+        ), f"chunk size {image_size} should be multiple of patch_size {patch_size}"
+
+        ds_ratio = int(round(1.0 / (self.vision_config.pixel_shuffle_ratio**2)))
+        return (image_size // patch_size) ** 2 // ds_ratio
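The patch count in the new `Mllama4ImageProcessor.get_patch_per_chunk` above is plain integer arithmetic: `(image_size // patch_size) ** 2` patches per chunk, divided by the downsampling factor implied by `pixel_shuffle_ratio`. A quick sanity check in Python, using illustrative config values that are assumptions for this example rather than values taken from the diff:

    # Hypothetical vision_config values, chosen only to illustrate the formula.
    image_size = 336
    patch_size = 14
    pixel_shuffle_ratio = 0.5

    ds_ratio = int(round(1.0 / (pixel_shuffle_ratio ** 2)))          # -> 4
    patches_per_chunk = (image_size // patch_size) ** 2 // ds_ratio  # -> 24 ** 2 // 4 = 144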
--- a/sglang/srt/managers/multimodal_processors/qwen_vl.py
+++ b/sglang/srt/managers/multimodal_processors/qwen_vl.py
@@ -1,18 +1,17 @@
 import asyncio
 import math
-import time
 from typing import List, Union
 
 import torch
 from PIL import Image
 
-from sglang.srt.managers.
+from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
 from sglang.srt.managers.multimodal_processors.base_processor import (
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
 
@@ -34,45 +33,15 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.MAX_PIXELS = 16384 * 28 * 28
         self.MAX_RATIO = 200
 
-    @staticmethod
-    def _process_images_task(images, input_text, _hf_config):
-        if isinstance(images, list) and len(images) == 0:
-            images = None
-        result = get_global_processor().__call__(
-            text=[input_text], images=images, padding=True, return_tensors="pt"
-        )
-
-        return {
-            "input_ids": result.input_ids,
-            "pixel_values": getattr(result, "pixel_values", None),
-            "image_grid_thw": getattr(result, "image_grid_thw", None),
-            "second_per_grid_ts": getattr(result, "second_per_grid_ts", None),
-            "video_grid_thws": getattr(result, "video_grid_thws", None),
-        }
-
-    async def _process_single_image(self, images, input_text) -> dict:
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            return await loop.run_in_executor(
-                self.executor,
-                Qwen2_5VLImageProcessor._process_images_task,
-                images,
-                input_text,
-                self.hf_config,
-            )
-        else:
-            return self._process_images_task(images, input_text, self.hf_config)
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
-
+        prompt,
         request_obj,
         max_req_input_len,
         *args,
         **kwargs,
     ):
-        start = time.time()
         if not image_data:
             return None
         if isinstance(image_data, str):
@@ -80,7 +49,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
-
+            prompt=prompt,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,
@@ -144,24 +113,32 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
            """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
            return math.floor(number / factor) * factor
 
-
+        async def resize_image_async(image):
+            return resize_image(image)
 
-
-
+        resize_tasks = [resize_image_async(image) for image in base_output.images]
+        resized_images = await asyncio.gather(*resize_tasks)
+
+        ret = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=resized_images,
         )
 
         image_grid_thws = torch.concat([ret["image_grid_thw"]])
-        video_grid_thws = None
         return {
             "input_ids": ret["input_ids"].flatten().tolist(),
-            "
-
-
-
-
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=ret["pixel_values"],
+                    image_grid_thws=image_grid_thws,
+                    # TODO
+                    video_grid_thws=None,
+                    second_per_grid_ts=ret.get("second_per_grid_ts", None),
+                    modality=Modality.IMAGE,
+                )
+            ],
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
             "im_token_id": self.image_token_id,
             "video_token_id": self.video_token_id,
-            "second_per_grid_ts": ret["second_per_grid_ts"],
         }