sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +72 -10
- sglang/srt/_custom_ops.py +59 -92
- sglang/srt/configs/deepseekvl2.py +10 -1
- sglang/srt/configs/model_config.py +6 -16
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/custom_op.py +5 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/parallel_state.py +32 -5
- sglang/srt/entrypoints/engine.py +0 -5
- sglang/srt/entrypoints/http_server.py +7 -1
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/attention/flashattention_backend.py +582 -125
- sglang/srt/layers/attention/flashinfer_backend.py +5 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/dp_attention.py +12 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
- sglang/srt/layers/moe/ep_moe/layer.py +79 -80
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
- sglang/srt/layers/moe/topk.py +79 -6
- sglang/srt/layers/quantization/__init__.py +137 -165
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
- sglang/srt/layers/quantization/fp8_kernel.py +2 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -4
- sglang/srt/layers/quantization/gptq.py +30 -40
- sglang/srt/layers/quantization/moe_wna16.py +501 -0
- sglang/srt/layers/quantization/utils.py +1 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +19 -33
- sglang/srt/lora/lora_manager.py +20 -7
- sglang/srt/lora/mem_pool.py +12 -6
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +6 -0
- sglang/srt/managers/cache_controller.py +34 -11
- sglang/srt/managers/io_struct.py +4 -2
- sglang/srt/managers/mm_utils.py +202 -156
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
- sglang/srt/managers/multimodal_processors/clip.py +44 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
- sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
- sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
- sglang/srt/managers/multimodal_processors/llava.py +34 -14
- sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
- sglang/srt/managers/multimodal_processors/mlama.py +10 -23
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
- sglang/srt/managers/schedule_batch.py +185 -127
- sglang/srt/managers/scheduler.py +29 -23
- sglang/srt/managers/tokenizer_manager.py +1 -2
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/utils.py +1 -6
- sglang/srt/mem_cache/hiradix_cache.py +62 -52
- sglang/srt/mem_cache/memory_pool.py +72 -6
- sglang/srt/mem_cache/paged_allocator.py +39 -0
- sglang/srt/metrics/collector.py +23 -53
- sglang/srt/model_executor/cuda_graph_runner.py +16 -13
- sglang/srt/model_executor/forward_batch_info.py +10 -10
- sglang/srt/model_executor/model_runner.py +64 -59
- sglang/srt/model_loader/loader.py +19 -1
- sglang/srt/model_loader/weight_utils.py +6 -3
- sglang/srt/models/clip.py +568 -0
- sglang/srt/models/deepseek_janus_pro.py +12 -17
- sglang/srt/models/deepseek_v2.py +339 -123
- sglang/srt/models/deepseek_vl2.py +105 -104
- sglang/srt/models/gemma3_causal.py +12 -2
- sglang/srt/models/gemma3_mm.py +20 -80
- sglang/srt/models/llama.py +4 -1
- sglang/srt/models/llava.py +31 -19
- sglang/srt/models/llavavid.py +16 -7
- sglang/srt/models/minicpmo.py +63 -147
- sglang/srt/models/minicpmv.py +17 -27
- sglang/srt/models/mllama.py +29 -14
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_5_vl.py +21 -31
- sglang/srt/models/qwen2_vl.py +20 -21
- sglang/srt/openai_api/adapter.py +106 -93
- sglang/srt/openai_api/protocol.py +10 -5
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/platforms/interface.py +371 -0
- sglang/srt/server_args.py +120 -25
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
- sglang/srt/speculative/eagle_utils.py +140 -28
- sglang/srt/speculative/eagle_worker.py +94 -25
- sglang/srt/utils.py +137 -51
- sglang/test/runners.py +27 -2
- sglang/test/test_custom_ops.py +55 -0
- sglang/test/test_utils.py +14 -27
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
```diff
--- a/sglang/srt/managers/multimodal_processors/base_processor.py
+++ b/sglang/srt/managers/multimodal_processors/base_processor.py
@@ -8,19 +8,10 @@ from typing import Optional
 
 import numpy as np
 import PIL
-import transformers
 from decord import VideoReader, cpu
-from openai import BadRequestError
 from PIL import Image
 
-from sglang.srt.utils import load_audio, load_image, logger
-
-global global_processor
-
-
-def get_global_processor():
-    global global_processor
-    return global_processor
+from sglang.srt.utils import encode_video, load_audio, load_image, logger
 
 
 @dataclasses.dataclass
@@ -28,9 +19,6 @@ class BaseMultiModalProcessorOutput:
     # input_text, with each frame of video/image represented with a image_token
     input_text: str
 
-    mm_data_hashes: Optional[list[int]]
-    # images
-    image_sizes: Optional[list[int]]
    # frames loaded from image and video, in given order
     images: Optional[list[PIL.Image]] = None
 
@@ -38,7 +26,7 @@ class BaseMultiModalProcessorOutput:
     audios: Optional[list[np.ndarray]] = None
 
     def normalize(self):
-        for field_name in ["mm_data_hashes", "image_sizes", "images", "audios"]:
+        for field_name in ["image_sizes", "images", "audios"]:
             field = getattr(self, field_name, None)
             if field is not None and isinstance(field, list) and len(field) == 0:
                 setattr(self, field_name, None)
@@ -68,28 +56,35 @@ class BaseMultimodalProcessor(ABC):
         # FIXME: not accurate, model and image specific
         self.NUM_TOKEN_PER_FRAME = 330
 
-
-
-
-        self.executor = concurrent.futures.ProcessPoolExecutor(
-            initializer=init_global_processor,
+        self.io_executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=int(os.environ.get("SGLANG_IO_WORKERS", 4))
+        )
+        self.cpu_executor = concurrent.futures.ProcessPoolExecutor(
             mp_context=mp.get_context("fork"),
-            initargs=(
-                self,
-                server_args,
-            ),
-            max_workers=int(os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())),
+            max_workers=int(os.environ.get("SGLANG_CPU_WORKERS", os.cpu_count())),
         )
 
-    def
-
-
-
-
-
-
-
+    def process_mm_data(
+        self, input_text, images=None, videos=None, audios=None, **kwargs
+    ):
+        """
+        process multimodal data with transformers AutoProcessor
+        """
+        if images is not None:
+            kwargs["images"] = images
+        if videos is not None:
+            kwargs["videos"] = videos
+        if audios is not None:
+            kwargs["audios"] = audios
+
+        processor = self._processor
+        result = processor.__call__(
+            text=[input_text],
+            padding=True,
+            return_tensors="pt",
+            **kwargs,
         )
+        return result
 
     @abstractmethod
     async def process_mm_data_async(
```
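The single `executor` pool (a forked `ProcessPoolExecutor` bootstrapped through `init_global_processor`) is replaced by two pools: an I/O thread pool sized by `SGLANG_IO_WORKERS` (default 4) and a CPU process pool sized by `SGLANG_CPU_WORKERS` (default `os.cpu_count()`), and the new `process_mm_data` helper calls the Hugging Face processor held on the instance instead of a per-process global. A minimal sketch of fanning async work out to two such pools; `fetch` and `decode` are hypothetical stand-ins, not sglang functions:

```python
import asyncio
import concurrent.futures
import multiprocessing as mp
import os

# Same pool setup as the diff: threads for blocking I/O,
# forked worker processes for CPU-bound preprocessing.
io_executor = concurrent.futures.ThreadPoolExecutor(
    max_workers=int(os.environ.get("SGLANG_IO_WORKERS", 4))
)
cpu_executor = concurrent.futures.ProcessPoolExecutor(
    mp_context=mp.get_context("fork"),
    max_workers=int(os.environ.get("SGLANG_CPU_WORKERS", os.cpu_count())),
)


def fetch(url: str) -> bytes:
    return b"fake image bytes"  # hypothetical blocking download


def decode(raw: bytes) -> int:
    return len(raw)  # hypothetical CPU-bound transform


async def handle(url: str) -> int:
    loop = asyncio.get_event_loop()
    raw = await loop.run_in_executor(io_executor, fetch, url)
    return await loop.run_in_executor(cpu_executor, decode, raw)
```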
```diff
@@ -116,33 +111,9 @@ class BaseMultimodalProcessor(ABC):
 
         return estimated_frames_list
 
-    @staticmethod
-    def encode_video(video_path, frame_count_limit=None):
-        if not os.path.exists(video_path):
-            logger.error(f"Video {video_path} does not exist")
-            return []
-
-        if frame_count_limit == 0:
-            return []
-
-        def uniform_sample(l, n):
-            gap = len(l) / n
-            idxs = [int(i * gap + gap / 2) for i in range(n)]
-            return [l[i] for i in idxs]
-
-        vr = VideoReader(video_path, ctx=cpu(0))
-        sample_fps = round(vr.get_avg_fps() / 1)  # FPS
-        frame_indices = [i for i in range(0, len(vr), sample_fps)]
-        if frame_count_limit is not None and len(frame_indices) > frame_count_limit:
-            frame_indices = uniform_sample(frame_indices, frame_count_limit)
-
-        frames = vr.get_batch(frame_indices).asnumpy()
-        frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-        return frames
-
     def load_mm_data(
         self,
-
+        prompt: str,
         multimodal_tokens: MultimodalSpecialTokens,
         max_req_input_len: int,
         image_data: Optional[list] = None,
```
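`encode_video` is deleted from the class body; per the import change in the first hunk, the same helper is now imported from `sglang.srt.utils`. Its frame capping is plain midpoint sampling, restated here as a dependency-free sketch:

```python
def uniform_sample(items, n):
    # Split items into n equal buckets and take each bucket's midpoint,
    # the same arithmetic as the deleted nested helper.
    gap = len(items) / n
    return [items[int(i * gap + gap / 2)] for i in range(n)]


# 120 one-per-second frame indices capped at a 30-frame limit:
frames = uniform_sample(list(range(120)), 30)
assert len(frames) == 30 and frames[:3] == [2, 6, 10]
```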
```diff
@@ -168,11 +139,11 @@ class BaseMultimodalProcessor(ABC):
         else:
             multimodal_tokens.image_token = multimodal_tokens.image_token
 
-        if isinstance(
-            assert len(
-
+        if isinstance(prompt, list) and return_text:
+            assert len(prompt) and isinstance(prompt[0], int)
+            prompt = self._processor.tokenizer.decode(prompt)
         else:
-
+            prompt = prompt
         if return_text:
             import re
 
@@ -182,7 +153,7 @@ class BaseMultimodalProcessor(ABC):
                 + ")"
             )
             # split text into list of normal text and special tokens
-            text_parts = re.split(pattern,
+            text_parts = re.split(pattern, prompt)
 
         # TODO(mick): load from server_args, env, or sampling_params
         MAX_NUM_FRAMES = 30
@@ -218,7 +189,7 @@ class BaseMultimodalProcessor(ABC):
                     ):
                         # video
                         path = image_file[len("video:") :]
-                        frames =
+                        frames = encode_video(
                             path, frame_count_limit=frames_to_process
                         )
                     else:
@@ -231,7 +202,16 @@ class BaseMultimodalProcessor(ABC):
                         continue
 
                     image_sizes += frames[0].size * len(frames)
-
+
+                    # Generate a hashable value for the image file
+                    if isinstance(image_file, Image.Image):
+                        # For PIL.Image objects, use the ID as a hashable value
+                        hash_value = hash(id(image_file))
+                    else:
+                        # For other types (strings, etc.), use the regular hash
+                        hash_value = hash(image_file)
+
+                    hashes += [hash_value] * len(frames)
                     images += frames
                     image_index += 1
                     if frames_to_process != 0:
```
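The new hashing branch separates in-memory `PIL.Image` objects from paths, URLs, and raw bytes. A likely reason: `PIL.Image.Image` defines `__eq__` without `__hash__`, so instances are not directly hashable, and `hash(id(...))` falls back to object identity. Restated standalone:

```python
from PIL import Image


def mm_hash_key(image_file):
    if isinstance(image_file, Image.Image):
        # Identity-based key for in-memory images; only stable
        # within the current process.
        return hash(id(image_file))
    # Paths, URLs, and bytes hash by content.
    return hash(image_file)
```

Note that identity keys deduplicate a frame repeated within one request, but the same image resubmitted later gets a fresh key.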
```diff
@@ -252,24 +232,12 @@ class BaseMultimodalProcessor(ABC):
 
         except Exception as e:
             logger.error(f"An exception occurred while loading images: {e}")
-            raise RuntimeError(
-                f"An exception occurred while loading images: {e}"
-            )
+            raise RuntimeError(f"An exception occurred while loading images: {e}")
 
         out = BaseMultiModalProcessorOutput(
-            mm_data_hashes=hashes,
-            image_sizes=image_sizes,
             images=images,
             audios=audios,
             input_text=new_text,
         )
         out.normalize()
         return out
-
-
-def init_global_processor(sglang_processor: BaseMultimodalProcessor, server_args):
-    """
-    Init the global processor for multimodal models."""
-    global global_processor
-    transformers.logging.set_verbosity_error()
-    global_processor = sglang_processor._build_processor(server_args=server_args)
```
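With `get_global_processor`/`init_global_processor` gone, subclasses reach the processor through `self._processor` and the shared `process_mm_data` wrapper, as each per-model processor below does. A sketch of the resulting subclass contract (the empty `models` list is a placeholder; real subclasses list the model classes they serve):

```python
from sglang.srt.managers.multimodal_processors.base_processor import (
    BaseMultimodalProcessor,
    MultimodalSpecialTokens,
)
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem


class SketchProcessor(BaseMultimodalProcessor):
    models = []  # placeholder; real subclasses list their model classes here

    async def process_mm_data_async(
        self, image_data, input_ids, request_obj, max_req_input_len, *args, **kwargs
    ):
        # Load and normalize the raw multimodal inputs.
        base = self.load_mm_data(
            prompt=input_ids,
            image_data=image_data,
            multimodal_tokens=MultimodalSpecialTokens(image_token="<image>"),
            max_req_input_len=max_req_input_len,
        )
        # Run the HF processor held on the instance.
        res = self.process_mm_data(input_text=base.input_text, images=base.images)
        return {
            "mm_items": [
                MultimodalDataItem(
                    pixel_values=res["pixel_values"], modality=Modality.IMAGE
                )
            ],
            "input_ids": res["input_ids"].flatten().tolist(),
        }
```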
```diff
--- /dev/null
+++ b/sglang/srt/managers/multimodal_processors/clip.py
@@ -0,0 +1,44 @@
+from typing import List, Union
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.clip import CLIPModel
+from sglang.srt.utils import load_image
+
+
+class ClipImageProcessor(BaseMultimodalProcessor):
+    models = [CLIPModel]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+
+    async def process_mm_data_async(
+        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
+    ):
+        if not image_data:
+            return None
+
+        if isinstance(input_text, list):
+            assert len(input_text) and isinstance(input_text[0], int)
+            input_text = self._processor.tokenizer.decode(input_text)
+
+        if not isinstance(image_data, list):
+            image_data = [image_data]
+
+        if len(image_data) > 0:
+            images = [load_image(image)[0] for image in image_data]
+        else:
+            images = load_image(image_data[0])[0]
+
+        image_inputs = self.process_mm_data(input_text=input_text, images=images)
+        image_inputs["data_hashes"] = [hash(str(image_data))]
+        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+        image_inputs["mm_items"] = [
+            MultimodalDataItem(
+                pixel_values=image_inputs["pixel_values"], modality=Modality.IMAGE
+            )
+        ]
+
+        return image_inputs
```
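The new CLIP processor advertises the model classes it serves through its `models` attribute, the registry convention shared by the other processors in this module. A sketch of resolving such a registry, assuming a simple scan (the real dispatch lives in `sglang/srt/managers/multimodal_processor.py` and may differ):

```python
from sglang.srt.managers.multimodal_processors.clip import ClipImageProcessor
from sglang.srt.managers.multimodal_processors.llava import LlavaImageProcessor

PROCESSORS = [ClipImageProcessor, LlavaImageProcessor]  # and the rest


def find_processor_cls(model_cls):
    # Pick the processor whose `models` list names the loaded model class.
    for proc_cls in PROCESSORS:
        if model_cls in proc_cls.models:
            return proc_cls
    return None
```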
```diff
--- a/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py
+++ b/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py
@@ -16,15 +16,14 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-import asyncio
 
 import torch
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
 
 
@@ -35,51 +34,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         super().__init__(hf_config, server_args, _processor)
         self.IMAGE_TOKEN = "<image>"
 
-    @staticmethod
-    def _process_images_task(image, input_text, max_req_input_len):
-        processor = get_global_processor()
-        res = processor.__call__(
-            conversations=input_text, images=image, max_req_input_len=max_req_input_len
-        )
-
-        image_token_id = processor.image_token_id
-
-        res["im_token_id"] = image_token_id
-        return res
-
-    async def _process_images(self, image_data, input_text, max_req_input_len):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                DeepseekVL2ImageProcessor._process_images_task,
-                image_data,
-                input_text,
-                max_req_input_len,
-            )
-        else:
-            image_inputs = self._process_images_task(
-                image_data, input_text, max_req_input_len
-            )
-
-        return image_inputs
-
-    async def _process_images(self, image_data, input_text, max_req_input_len):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                DeepseekVL2ImageProcessor._process_images_task,
-                image_data,
-                input_text,
-                max_req_input_len,
-            )
-        else:
-            image_inputs = self._process_images_task(
-                image_data, input_text, max_req_input_len
-            )
-        return image_inputs
-
     async def process_mm_data_async(
         self, image_data, input_ids, request_obj, max_req_input_len, *args, **kwargs
     ):
@@ -89,8 +43,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         if not isinstance(image_data, list):
             image_data = [image_data]
 
-        images, image_sizes = [], []
-
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
             input_ids,
@@ -98,8 +50,11 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,
         )
-        res =
-            base_output.
+        res = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=base_output.images,
+            max_req_input_len=max_req_input_len,
+            conversations=base_output.input_text,
         )
         images_seq_mask = res["images_seq_mask"]
         images_spatial_crop = res["images_spatial_crop"]
@@ -107,13 +62,17 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
             batched_images_spatial_crop.append(images_spatial_crop)
         batched_images_spatial_crop = torch.stack(batched_images_spatial_crop, dim=0)
 
+        items = []
+        item = MultimodalDataItem(
+            pixel_values=res["images"],
+            modality=Modality.IMAGE,
+            image_emb_mask=images_seq_mask,
+            image_spatial_crop=batched_images_spatial_crop,
+        )
+        items += [item]
+
         return {
+            "mm_items": items,
             "input_ids": res["input_ids"].tolist(),
-            "
-            "im_token_id": res["im_token_id"],
-            "data_hashes": base_output.mm_data_hashes,
-            "image_sizes": image_sizes,
-            "images_emb_mask": images_seq_mask,
-            "image_spatial_crop": batched_images_spatial_crop,
-            "modalities": request_obj.modalities or ["image"],
+            "im_token_id": self._processor.image_token_id,
         }
```
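The loose result keys this processor used to return (`images_emb_mask`, `image_spatial_crop`, `image_sizes`, `data_hashes`, `modalities`) are folded into a single `MultimodalDataItem` under `mm_items`. The construction in isolation, with placeholder tensors of arbitrary shape:

```python
import torch

from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem

# Placeholder tensors; shapes are illustrative only.
pixel_values = torch.zeros(2, 3, 384, 384)
images_seq_mask = torch.zeros(1, 32, dtype=torch.bool)
images_spatial_crop = torch.zeros(1, 2, 2, dtype=torch.long)

item = MultimodalDataItem(
    pixel_values=pixel_values,
    modality=Modality.IMAGE,
    image_emb_mask=images_seq_mask,
    image_spatial_crop=images_spatial_crop,
)
```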
```diff
--- a/sglang/srt/managers/multimodal_processors/gemma3.py
+++ b/sglang/srt/managers/multimodal_processors/gemma3.py
@@ -7,8 +7,8 @@ from sglang.srt.managers.multimodal_processor import (
 )
 from sglang.srt.managers.multimodal_processors.base_processor import (
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
 
 # Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py
@@ -25,28 +25,6 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         self.IM_START_TOKEN_ID = hf_config.boi_token_index
         self.IM_END_TOKEN_ID = hf_config.eoi_token_index
 
-    async def _process_single_image(self, images, input_text) -> dict:
-        if isinstance(images, list) and len(images) == 0:
-            images = None
-        processor = get_global_processor()
-        result = processor.__call__(
-            text=[input_text],
-            images=images,
-            padding=True,
-            return_tensors="pt",
-            # if RGBA, this needs to be set
-            # images_kwargs={
-            #     "input_data_format": ChannelDimension.FIRST
-            # }
-        )
-
-        pixel_values = getattr(result, "pixel_values", None)
-
-        return {
-            "input_ids": result.input_ids,
-            "pixel_values": pixel_values,
-        }
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
@@ -63,21 +41,28 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
 
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
-
+            prompt=input_ids,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,
             discard_alpha_channel=True,
         )
 
-        ret =
+        ret = self.process_mm_data(
             input_text=base_output.input_text, images=base_output.images
         )
 
+        items = []
+        for i, image in enumerate(base_output.images):
+            item = MultimodalDataItem(
+                pixel_values=ret["pixel_values"][i],
+                modality=Modality.IMAGE,
+            )
+            items += [item]
+
         return {
+            "mm_items": items,
             "input_ids": ret["input_ids"].flatten().tolist(),
-            "pixel_values": ret["pixel_values"],
-            "data_hashes": base_output.mm_data_hashes,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }
```
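Where the DeepSeek-VL2 hunk packs all images into one item, this processor emits one `MultimodalDataItem` per input image by indexing the stacked `pixel_values`, assuming the processor stacks them in the same order as `base_output.images`. The same construction as a comprehension over placeholder data:

```python
import torch

from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem

pixel_values = torch.zeros(2, 3, 896, 896)  # placeholder: two stacked images

items = [
    MultimodalDataItem(pixel_values=pixel_values[i], modality=Modality.IMAGE)
    for i in range(pixel_values.shape[0])
]
assert len(items) == 2
```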
```diff
--- a/sglang/srt/managers/multimodal_processors/janus_pro.py
+++ b/sglang/srt/managers/multimodal_processors/janus_pro.py
@@ -1,11 +1,10 @@
-import asyncio
 from typing import List, Union
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
 
 
@@ -15,37 +14,6 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
 
-    @staticmethod
-    def _process_images_task(images, input_text):
-        processor = get_global_processor()
-        result = processor.__call__(
-            prompt=input_text, images=images, return_tensors="pt"
-        )
-        return {
-            "input_ids": result["input_ids"],
-            "pixel_values": result["pixel_values"],
-            "images_emb_mask": result["images_emb_mask"],
-            "im_start_id": processor.image_start_id,
-            "im_end_id": processor.image_end_id,
-            "im_token_id": processor.image_id,
-        }
-
-    async def _process_images(self, images, input_text):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                JanusProImageProcessor._process_images_task,
-                images,
-                input_text,
-            )
-        else:
-            image_inputs = self._processor(
-                images=images, text=input_text, return_tensors="pt"
-            )
-
-        return image_inputs
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
@@ -60,25 +28,31 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
         if not isinstance(image_data, list):
             image_data = [image_data]
 
+        processor = self._processor
+
         base_out = self.load_mm_data(
-
+            prompt=input_ids,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token="<image_placeholder>"
-            ),
+            multimodal_tokens=MultimodalSpecialTokens(image_token=processor.image_tag),
             max_req_input_len=max_req_input_len,
         )
+
         images = base_out.images
-        res =
-
-
-
+        res = self.process_mm_data(
+            input_text=base_out.input_text,
+            prompt=base_out.input_text,
+            images=images,
+        )
         return {
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=res["pixel_values"],
+                    image_emb_mask=res["images_emb_mask"],
+                    modality=Modality.IMAGE,
+                )
+            ],
             "input_ids": res["input_ids"].flatten().tolist(),
-            "
-            "
-            "
-            "im_start_id": res["im_start_id"],
-            "im_end_id": res["im_end_id"],
-            "im_token_id": res["im_token_id"],
+            "im_start_id": processor.image_start_id,
+            "im_end_id": processor.image_end_id,
+            "im_token_id": processor.image_id,
         }
```
```diff
--- a/sglang/srt/managers/multimodal_processors/llava.py
+++ b/sglang/srt/managers/multimodal_processors/llava.py
@@ -5,17 +5,26 @@ import numpy as np
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.mm_utils import expand2square, process_anyres_image
-from sglang.srt.models.llava import
+from sglang.srt.models.llava import (
+    LlavaLlamaForCausalLM,
+    LlavaMistralForCausalLM,
+    LlavaQwenForCausalLM,
+)
 from sglang.srt.models.llavavid import LlavaVidForCausalLM
 from sglang.srt.utils import load_image, logger
 from sglang.utils import get_exception_traceback
 
 
 class LlavaImageProcessor(BaseMultimodalProcessor):
-    models = [
+    models = [
+        LlavaLlamaForCausalLM,
+        LlavaVidForCausalLM,
+        LlavaQwenForCausalLM,
+        LlavaMistralForCausalLM,
+    ]
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
@@ -25,11 +34,10 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         image_data: Union[str, bytes],
         image_aspect_ratio: Optional[str] = None,
         image_grid_pinpoints: Optional[str] = None,
-
+        processor=None,
     ):
-        processor = get_global_processor()
 
-        image_processor =
+        image_processor = processor.image_processor
 
         try:
             image, image_size = load_image(image_data)
@@ -72,18 +80,22 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
     async def _process_single_image(
         self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str
     ):
-        if self.executor is not None:
+        if self.cpu_executor is not None:
             loop = asyncio.get_event_loop()
             return await loop.run_in_executor(
-                self.executor,
+                self.cpu_executor,
                 LlavaImageProcessor._process_single_image_task,
                 image_data,
                 aspect_ratio,
                 grid_pinpoints,
+                self._processor,
             )
         else:
             return self._process_single_image_task(
-                image_data,
+                image_data,
+                aspect_ratio,
+                grid_pinpoints,
+                self._processor.image_processor,
             )
 
     async def process_mm_data_async(
@@ -134,14 +146,22 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
             pixel_values, image_hash, image_size = await self._process_single_image(
                 image_data[0], aspect_ratio, grid_pinpoints
             )
-            data_hashes = [image_hash]
             image_sizes = [image_size]
         else:
             raise ValueError(f"Invalid image data: {image_data}")
+        modality = Modality.IMAGE
+        if isinstance(request_obj.modalities, list):
+            if request_obj.modalities[0] == "multi-images":
+                modality = Modality.MULTI_IMAGES
+            elif request_obj.modalities[0] == "video":
+                modality = Modality.VIDEO
 
         return {
-            "
-
-
-
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=pixel_values,
+                    image_sizes=image_sizes,
+                    modality=modality,
+                )
+            ],
         }
```
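The final hunk replaces the old `modalities` pass-through with an explicit mapping from the request's modalities list to a `Modality` stored on the item. The same selection as a standalone function:

```python
from sglang.srt.managers.schedule_batch import Modality


def pick_modality(modalities):
    # First entry of the request's modalities list upgrades the default.
    modality = Modality.IMAGE
    if isinstance(modalities, list):
        if modalities[0] == "multi-images":
            modality = Modality.MULTI_IMAGES
        elif modalities[0] == "video":
            modality = Modality.VIDEO
    return modality
```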