sglang 0.4.9.post1__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/configs/model_config.py +24 -1
- sglang/srt/conversation.py +21 -2
- sglang/srt/disaggregation/ascend/__init__.py +6 -0
- sglang/srt/disaggregation/ascend/conn.py +44 -0
- sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
- sglang/srt/disaggregation/mooncake/conn.py +15 -14
- sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
- sglang/srt/disaggregation/utils.py +25 -3
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/http_server.py +1 -0
- sglang/srt/entrypoints/openai/protocol.py +11 -0
- sglang/srt/entrypoints/openai/serving_chat.py +7 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/kimik2_detector.py +220 -0
- sglang/srt/hf_transformers_utils.py +18 -0
- sglang/srt/jinja_template_utils.py +8 -0
- sglang/srt/layers/communicator.py +17 -4
- sglang/srt/layers/linear.py +12 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -2
- sglang/srt/layers/moe/topk.py +8 -2
- sglang/srt/layers/parameter.py +19 -3
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/moe_wna16.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +738 -14
- sglang/srt/managers/io_struct.py +27 -2
- sglang/srt/managers/mm_utils.py +55 -94
- sglang/srt/managers/schedule_batch.py +16 -5
- sglang/srt/managers/scheduler.py +21 -1
- sglang/srt/managers/tokenizer_manager.py +16 -0
- sglang/srt/mem_cache/memory_pool.py +65 -40
- sglang/srt/model_executor/forward_batch_info.py +13 -1
- sglang/srt/model_loader/loader.py +23 -12
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +62 -17
- sglang/srt/models/deepseek_vl2.py +1 -1
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +6 -3
- sglang/srt/models/internvl.py +8 -2
- sglang/srt/models/kimi_vl.py +8 -2
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llava.py +3 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpmo.py +1 -2
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral_quant.py +4 -0
- sglang/srt/models/mllama4.py +13 -4
- sglang/srt/models/phi4mm.py +8 -2
- sglang/srt/models/phimoe.py +553 -0
- sglang/srt/models/qwen2.py +2 -0
- sglang/srt/models/qwen2_5_vl.py +10 -7
- sglang/srt/models/qwen2_vl.py +12 -1
- sglang/srt/models/vila.py +8 -2
- sglang/srt/multimodal/processors/base_processor.py +197 -137
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
- sglang/srt/multimodal/processors/gemma3.py +4 -2
- sglang/srt/multimodal/processors/gemma3n.py +1 -1
- sglang/srt/multimodal/processors/internvl.py +1 -1
- sglang/srt/multimodal/processors/janus_pro.py +1 -1
- sglang/srt/multimodal/processors/kimi_vl.py +1 -1
- sglang/srt/multimodal/processors/minicpm.py +4 -3
- sglang/srt/multimodal/processors/mllama4.py +1 -1
- sglang/srt/multimodal/processors/phi4mm.py +1 -1
- sglang/srt/multimodal/processors/pixtral.py +1 -1
- sglang/srt/multimodal/processors/qwen_vl.py +203 -80
- sglang/srt/multimodal/processors/vila.py +1 -1
- sglang/srt/server_args.py +11 -4
- sglang/srt/utils.py +154 -31
- sglang/version.py +1 -1
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +4 -3
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +75 -70
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0
```diff
--- a/sglang/srt/multimodal/processors/base_processor.py
+++ b/sglang/srt/multimodal/processors/base_processor.py
@@ -5,7 +5,7 @@ import multiprocessing as mp
 import os
 import re
 from abc import ABC, abstractmethod
-from …
+from functools import lru_cache
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
@@ -14,7 +14,7 @@ from PIL import Image
 from transformers import BaseImageProcessorFast
 
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.utils import …
+from sglang.srt.utils import load_audio, load_image, load_video, logger
 
 
 @dataclasses.dataclass
@@ -25,14 +25,22 @@ class BaseMultiModalProcessorOutput:
     # frames loaded from image and video, in given order
     images: Optional[list[Union[Image.Image, dict]]] = None
 
+    # videos
+    videos: Optional[list[Union[torch.Tensor, dict]]] = None
+
     # audios
     audios: Optional[list[Union[np.ndarray, dict]]] = None
 
-    def …
-… (old lines 32-35 not recoverable)
+    def organize_results(self) -> List[Tuple[Modality, Any]]:
+        """
+
+        :return: a list of results, with their corresponding modalities
+        """
+        return (
+            [(Modality.IMAGE, data) for data in self.images]
+            + [(Modality.VIDEO, data) for data in self.videos]
+            + [(Modality.AUDIO, data) for data in self.audios]
+        )
 
 
 @dataclasses.dataclass
@@ -41,6 +49,10 @@ class MultimodalSpecialTokens:
     video_token: Optional[Union[int, str, List[str]]] = None
     audio_token: Optional[Union[int, str, List[str]]] = None
 
+    image_token_regex: Optional[re.Pattern] = None
+    video_token_regex: Optional[re.Pattern] = None
+    audio_token_regex: Optional[re.Pattern] = None
+
     def convert_to_str(self, token: Union[str, int], processor) -> str:
         if token is None:
             return token
@@ -53,11 +65,29 @@ class MultimodalSpecialTokens:
         self.video_token = self.convert_to_str(self.video_token, processor)
         self.audio_token = self.convert_to_str(self.audio_token, processor)
 
-… (old lines 56-60 not recoverable)
+    def get_modality_of_token(self, token) -> Optional[Modality]:
+        """
+        :return: the modality associated with the given token, if the token is a special_token or matches with the multimodal token regex
+        """
+        modality = {
+            self.image_token: Modality.IMAGE,
+            self.video_token: Modality.VIDEO,
+            self.audio_token: Modality.AUDIO,
+        }.get(token)
+        if modality:
+            return modality
+
+        for regex, modality in [
+            (self.image_token_regex, Modality.IMAGE),
+            (self.video_token_regex, Modality.VIDEO),
+            (self.audio_token_regex, Modality.AUDIO),
+        ]:
+            if regex and regex.match(token):
+                return modality
+
+        return None
+
+    def parse_regex(self):
         if self.image_token_regex is None and self.image_token is not None:
             self.image_token_regex = re.compile(re.escape(self.image_token))
         if self.video_token_regex is None and self.video_token is not None:
@@ -65,7 +95,7 @@ class MultimodalSpecialTokens:
         if self.audio_token_regex is None and self.audio_token is not None:
             self.audio_token_regex = re.compile(re.escape(self.audio_token))
 
-    def …
+    def combine_regex(self) -> re.Pattern:
         tokens = [
             self.image_token_regex,
             self.video_token_regex,
@@ -105,6 +135,7 @@ class BaseMultimodalProcessor(ABC):
         self.ATTR_NAME_TO_MODALITY = {
             # Image-related attributes
             "pixel_values": Modality.IMAGE,
+            "pixel_values_videos": Modality.VIDEO,
             "image_sizes": Modality.IMAGE,
             "image_grid_thw": Modality.IMAGE,
             "image_emb_mask": Modality.IMAGE,
@@ -120,7 +151,7 @@ class BaseMultimodalProcessor(ABC):
             "input_features": Modality.AUDIO,
             "input_features_mask": Modality.AUDIO,
             # Video-related attributes
-            "…
+            "video_grid_thw": Modality.VIDEO,
             # Generic attributes that could apply to multiple modalities
             # "precomputed_features" - handled specially as it can be any modality
         }
```
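The most load-bearing piece of the hunks above is the new token-to-modality resolution on `MultimodalSpecialTokens`: an exact match against the configured special tokens is tried first, then the optional per-modality regexes. Below is a minimal, self-contained sketch of that lookup; the `Modality` enum and the `<image>`/`<video>`/`<audio>` token strings are illustrative stand-ins, not sglang's actual classes or defaults.

```python
import re
from enum import Enum, auto
from typing import Optional


class Modality(Enum):  # stand-in for sglang.srt.managers.schedule_batch.Modality
    IMAGE = auto()
    VIDEO = auto()
    AUDIO = auto()


def get_modality_of_token(
    token: str,
    image_token: Optional[str] = "<image>",  # assumed placeholder tokens
    video_token: Optional[str] = "<video>",
    audio_token: Optional[str] = "<audio>",
    image_token_regex: Optional[re.Pattern] = None,
    video_token_regex: Optional[re.Pattern] = None,
    audio_token_regex: Optional[re.Pattern] = None,
) -> Optional[Modality]:
    # 1) exact special-token match
    modality = {
        image_token: Modality.IMAGE,
        video_token: Modality.VIDEO,
        audio_token: Modality.AUDIO,
    }.get(token)
    if modality:
        return modality
    # 2) fall back to the per-modality regexes (the real class compiles these in
    #    parse_regex when only plain token strings were supplied)
    for regex, modality in [
        (image_token_regex, Modality.IMAGE),
        (video_token_regex, Modality.VIDEO),
        (audio_token_regex, Modality.AUDIO),
    ]:
        if regex and regex.match(token):
            return modality
    return None


print(get_modality_of_token("<video>"))    # Modality.VIDEO
print(get_modality_of_token("just text"))  # None
print(get_modality_of_token("<img_7>", image_token_regex=re.compile(r"<img_\d+>")))  # Modality.IMAGE
```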
sglang/srt/multimodal/processors/base_processor.py (continued)

```diff
@@ -196,20 +227,25 @@ class BaseMultimodalProcessor(ABC):
 
     @staticmethod
     def _load_single_item(
-        data, …
+        data, modality: Modality, frame_count_limit=None, discard_alpha_channel=True
     ):
-        """…
+        """
+        Load a single multimodal data.
+
+        If data is precomputed, returns directly.
+
+        Static method that can be pickled for multiprocessing"""
         if isinstance(data, dict):
             return data
         try:
-            if …
-                return load_audio(data)
-            elif is_video:
-                path = data[len("video:") :]
-                return encode_video(path, frame_count_limit)
-            else:
+            if modality == Modality.IMAGE:
                 img, _ = load_image(data)
                 return img.convert("RGB") if discard_alpha_channel else img
+            elif modality == Modality.VIDEO:
+                return load_video(data, frame_count_limit)
+            elif modality == Modality.AUDIO:
+                return load_audio(data)
+
         except Exception as e:
             raise RuntimeError(f"Error while loading data {data}: {e}")
 
@@ -217,75 +253,78 @@ class BaseMultimodalProcessor(ABC):
         self,
         text_parts: List[str],
         multimodal_tokens: MultimodalSpecialTokens,
-
-        audio_data: Optional[list] = None,
+        data_iterators: dict,
         discard_alpha_channel: bool = True,
-
+        image_estimated_frames_iter: Optional[iter] = None,
+        image_scaling_factor: float = 1.0,
+        max_image_frames: int = 30,
+    ) -> Tuple[List, List]:
         """
-        load multimodal data parallelly
+        load multimodal data parallelly using iterators.
         """
-
-        # TODO(mick): load from server_args, env, or sampling_params
-        MAX_NUM_FRAMES = 30
-        estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
-        total_frame_count = sum(estimated_frames_list)
-        # a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
-        # e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
-        scaling_factor = min(1.0, MAX_NUM_FRAMES / max(1, total_frame_count))
-
-        assert len(image_data) == len(estimated_frames_list)
-        # Submit all tasks
         futures = []
         task_info = []
-        image_index, audio_index = 0, 0
 
         for text_part in text_parts:
-… (old lines 243-250 not recoverable)
+            modality = multimodal_tokens.get_modality_of_token(text_part)
+            if modality is not None:
+                data_iterator = data_iterators.get(modality)
+                if data_iterator is None:
+                    raise ValueError(f"No data iterator found for token: {text_part}")
+
+                try:
+                    data = next(data_iterator)
+                except StopIteration:
+                    raise ValueError(
+                        f"Mismatch: More '{text_part}' tokens found than corresponding data items provided."
+                    )
+
+                frame_count_limit = None
+                if modality == Modality.IMAGE and image_estimated_frames_iter:
+                    try:
+                        estimated_frames = next(image_estimated_frames_iter)
+                        # Use the pre-calculated scaling factor and max frames
+                        frame_count_limit = max(
+                            1, int(estimated_frames * image_scaling_factor)
+                        )
+                        # Ensure we don't exceed the absolute max (redundant if scaling_factor handles it)
+                        # frame_count_limit = min(frame_count_limit, max_image_frames)
+                    except StopIteration:
+                        raise ValueError(
+                            "Mismatch between image tokens and estimated frame counts."
+                        )
+
                 futures.append(
                     self.io_executor.submit(
                         BaseMultimodalProcessor._load_single_item,
                         data,
-
-                        False,
+                        modality,
                         frame_count_limit,
                         discard_alpha_channel,
                     )
                 )
-                task_info.append((…
-… (old lines 262-267 not recoverable)
-                futures.append(
-                    self.io_executor.submit(
-                        BaseMultimodalProcessor._load_single_item,
-                        data,
-                        False,
-                        True,
-                        None,
-                        discard_alpha_channel,
-                    )
+                task_info.append((modality, data, frame_count_limit))
+
+        for modality, iterator in data_iterators.items():
+            try:
+                next(iterator)
+                logger.warning(
+                    f"Warning: More {modality.name.lower()} data items provided than corresponding tokens found in the prompt."
                 )
-
-
+            except StopIteration:
+                pass
+            except Exception:
+                pass
 
         return futures, task_info
 
     def load_mm_data(
         self,
-        prompt: str…
+        prompt: str,
         multimodal_tokens: MultimodalSpecialTokens,
         max_req_input_len: int,
         image_data: Optional[list] = None,
+        video_data: Optional[list] = None,
         audio_data: Optional[list] = None,
         return_text: Optional[bool] = True,
         discard_alpha_channel: bool = True,
```
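To make the new iterator-based bookkeeping concrete: `submit_data_loading_tasks` now walks the split prompt, and every special token consumes the next item from its modality's iterator, so too few items raises a `ValueError` while leftover items only trigger a warning. The sketch below is a self-contained illustration of that pairing under assumed token strings, prompt, and file names; the real method additionally submits each item to `self.io_executor` and attaches a per-image `frame_count_limit`.

```python
import re
from enum import Enum, auto


class Modality(Enum):
    IMAGE = auto()
    VIDEO = auto()
    AUDIO = auto()


# Illustrative token strings (assumptions, not sglang defaults).
TOKEN_TO_MODALITY = {
    "<image>": Modality.IMAGE,
    "<video>": Modality.VIDEO,
    "<audio>": Modality.AUDIO,
}
# Capture group so re.split() keeps the matched tokens in the output list.
pattern = re.compile("(" + "|".join(re.escape(t) for t in TOKEN_TO_MODALITY) + ")")

prompt = "Describe <image> and then summarize <video>."
data_iterators = {
    Modality.IMAGE: iter(["img0.png"]),
    Modality.VIDEO: iter(["clip0.mp4"]),
}

task_info = []
for text_part in pattern.split(prompt):
    modality = TOKEN_TO_MODALITY.get(text_part)
    if modality is None:
        continue  # normal text
    try:
        data = next(data_iterators[modality])
    except StopIteration:
        raise ValueError(f"More '{text_part}' tokens than data items provided.")
    task_info.append((modality, data))

# Anything left in an iterator means more data was supplied than tokens appear.
_SENTINEL = object()
for modality, iterator in data_iterators.items():
    if next(iterator, _SENTINEL) is not _SENTINEL:
        print(f"Warning: extra {modality.name.lower()} data items were provided.")

print(task_info)  # [(Modality.IMAGE, 'img0.png'), (Modality.VIDEO, 'clip0.mp4')]
```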
sglang/srt/multimodal/processors/base_processor.py (continued)

```diff
@@ -299,14 +338,9 @@ class BaseMultimodalProcessor(ABC):
             discard_alpha_channel: if True, discards the alpha channel in the returned images
 
         """
-        if not return_text:
-            raise NotImplementedError()
-        if image_data is None:
-            image_data = []
-
         multimodal_tokens.convert_to_strs(self._processor)
-
-
+        multimodal_tokens.parse_regex()
+        multimodal_tokens_pattern = multimodal_tokens.combine_regex()
         if isinstance(prompt, list) and return_text:
             assert len(prompt) and isinstance(prompt[0], int)
             prompt = self._processor.tokenizer.decode(prompt)
@@ -317,59 +351,84 @@ class BaseMultimodalProcessor(ABC):
         # split text into list of normal text and special tokens
         text_parts = re.split(multimodal_tokens_pattern, prompt)
 
+        # collect all data
+        data_iterators = {}
+        if multimodal_tokens.image_token and image_data:
+            data_iterators[Modality.IMAGE] = iter(image_data)
+        if multimodal_tokens.video_token and video_data:
+            data_iterators[Modality.VIDEO] = iter(video_data)
+        if multimodal_tokens.audio_token and audio_data:
+            data_iterators[Modality.AUDIO] = iter(audio_data)
+
+        # futures: the futures of loaded data
+        # task_info: modality, raw_data, and other metadata of each data
         futures, task_info = self.submit_data_loading_tasks(
             text_parts=text_parts,
             multimodal_tokens=multimodal_tokens,
-
-            audio_data=audio_data,
+            data_iterators=data_iterators,
             discard_alpha_channel=discard_alpha_channel,
         )
-
-
-        new_text = ""
-        task_ptr = 0
+        task_info_iter = iter(task_info)
+        futures_iter = iter(futures)
 
+        # Process results
+        images, videos, audios = [], [], []
+        new_text_parts = []
         for text_part in text_parts:
-… (old lines 333-367 not recoverable)
+            try:
+                if multimodal_tokens_pattern.match(text_part):
+                    modality, raw_data, frame_limit = next(task_info_iter)
+                    is_precomputed = isinstance(raw_data, dict)
+                    result = next(futures_iter).result()
+
+                    if modality == Modality.IMAGE:
+                        # If data is already processed it will be a
+                        # dictionary(precomputed). In this case we want to keep the
+                        # expanded tokens in text_part. Otherwise, we will
+                        # call the processor code, so keep only a single image
+                        # token.
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.image_token
+                        )
+                        frames = [result] if not isinstance(result, list) else result
+                        if frames:
+                            # only for minicpmv
+                            images += frames
+                            new_text_parts += mm_tokens * len(frames)
+                    elif modality == Modality.VIDEO:
+                        # load as video
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.video_token
+                        )
+                        videos += [result]
+                        new_text_parts += mm_tokens
+                    elif modality == Modality.AUDIO:
+                        # audio
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.audio_token
+                        )
+                        audios += [result]
+                        new_text_parts += mm_tokens
+                else:
+                    # normal text
+                    new_text_parts += [text_part]
+
+            except Exception as e:
+                raise RuntimeError(
+                    f"An exception occurred while loading multimodal data: {e}"
+                )
+        return BaseMultiModalProcessorOutput(
             images=images,
             audios=audios,
+            videos=videos,
+            input_text="".join(new_text_parts),
         )
-        out.normalize()
-        return out
 
     @staticmethod
     def get_mm_items_offset(
@@ -460,21 +519,19 @@ class BaseMultimodalProcessor(ABC):
                 )
             except ValueError:
                 modality = Modality.IMAGE
-
             if modality:
                 # Create item if needed
                 if modality not in items:
                     items[modality] = MultimodalDataItem(modality=modality)
 
                 # Set attribute
-
-                setattr(items[modality], attr_name, value)
+                setattr(items[modality], attr_name, value)
 
         return list(items.values())
 
     def _process_and_collect_mm_items(
         self, input_text: str, images=None, audios=None, videos=None, **kwargs
-    ) -> Tuple[List[MultimodalDataItem], torch.Tensor]:
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor, dict]:
         """
         Helper method to process multimodal data and create mm_items in one step.
 
@@ -488,11 +545,11 @@ class BaseMultimodalProcessor(ABC):
         input_ids = ret["input_ids"].flatten()
         collected_items = self.collect_mm_items_from_processor_output(ret)
 
-        return collected_items, input_ids
+        return collected_items, input_ids, ret
 
     def process_and_combine_mm_data(
         self, base_output: BaseMultiModalProcessorOutput
-    ) -> Tuple[List[MultimodalDataItem], torch.Tensor]:
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor, dict]:
         """
         Process multimodal data and return the combined multimodal items and input_ids.
         Supports mixed modalities (images and audio in the same request).
@@ -501,8 +558,7 @@ class BaseMultimodalProcessor(ABC):
             Tuple of (list of mm_items, input_ids)
         """
         # Collect all items and categorize them
-        all_items = …
-
+        all_items = base_output.organize_results()
         # Handle text-only case
         if not all_items:
             input_ids = self._processor.tokenizer(
@@ -510,19 +566,20 @@ class BaseMultimodalProcessor(ABC):
                 return_tensors="pt",
                 add_special_tokens=True,
             ).input_ids.flatten()
-            return [], input_ids
+            return [], input_ids, {}
 
-        dict_items, raw_images, raw_audios = [], [], []
-        for item in all_items:
+        dict_items, raw_images, raw_audios, raw_videos = [], [], [], []
+        for modality, item in all_items:
             if isinstance(item, dict):
                 dict_items.append(item)
-            elif …
+            elif modality == Modality.IMAGE:
                 raw_images.append(item)
-            elif …
+            elif modality == Modality.AUDIO:
                 raw_audios.append(item)
+            elif modality == Modality.VIDEO:
+                raw_videos.append(item)
             else:
                 raise ValueError(f"Unknown multimodal item type: {type(item)}")
-
         # Process items and get input_ids
         all_collected_items = []
         input_ids = None
@@ -534,13 +591,16 @@ class BaseMultimodalProcessor(ABC):
             )
 
         # Handle raw items (need processing)
-        if raw_images or raw_audios:
-            collected_items, input_ids = self._process_and_collect_mm_items(
+        if raw_images or raw_audios or raw_videos:
+            collected_items, input_ids, ret = self._process_and_collect_mm_items(
                 input_text=base_output.input_text,
                 images=raw_images,
                 audios=raw_audios,
+                videos=raw_videos,
            )
             all_collected_items.extend(collected_items)
+        else:
+            ret = None
 
         # Fallback tokenization if no raw items were processed
         if input_ids is None:
@@ -553,21 +613,21 @@ class BaseMultimodalProcessor(ABC):
         # Add offsets to all items
         for mm_item in all_collected_items:
             if mm_item.modality in [Modality.IMAGE, Modality.MULTI_IMAGES]:
-                mm_item.…
+                mm_item.offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.IM_TOKEN_ID,
                 )
             elif mm_item.modality == Modality.AUDIO:
-                mm_item.…
+                mm_item.offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.AUDIO_TOKEN_ID,
                 )
             elif mm_item.modality == Modality.VIDEO:
-                mm_item.…
+                mm_item.offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.VIDEO_TOKEN_ID,
                 )
             else:
                 raise ValueError(f"Unknown modality: {mm_item.modality}")
 
-        return all_collected_items, input_ids
+        return all_collected_items, input_ids, ret
```
```diff
--- a/sglang/srt/multimodal/processors/deepseek_vl_v2.py
+++ b/sglang/srt/multimodal/processors/deepseek_vl_v2.py
@@ -69,7 +69,7 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         )
         item = MultimodalDataItem(
             pixel_values=res["images"],
-
+            offsets=image_offsets,
             modality=Modality.IMAGE,
             image_emb_mask=images_seq_mask,
             image_spatial_crop=batched_images_spatial_crop,
--- a/sglang/srt/multimodal/processors/gemma3.py
+++ b/sglang/srt/multimodal/processors/gemma3.py
@@ -36,6 +36,7 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
+        print(f"{image_data=}")
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -46,8 +47,9 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
             discard_alpha_channel=True,
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
-
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
+        print(f"{base_output=}")
+        print(f"{mm_items=}")
         return {
             "input_ids": input_ids.tolist(),
             "mm_items": mm_items,
--- a/sglang/srt/multimodal/processors/gemma3n.py
+++ b/sglang/srt/multimodal/processors/gemma3n.py
@@ -72,7 +72,7 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
             ),
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),
--- a/sglang/srt/multimodal/processors/kimi_vl.py
+++ b/sglang/srt/multimodal/processors/kimi_vl.py
@@ -39,7 +39,7 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
             max_req_input_len=max_req_input_len,
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),
--- a/sglang/srt/multimodal/processors/minicpm.py
+++ b/sglang/srt/multimodal/processors/minicpm.py
@@ -19,6 +19,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         super().__init__(hf_config, server_args, _processor)
         self.image_token = "(<image>./</image>)"
         self.audio_token = "(<audio>./</audio>)"
+        self.video_token = "(<video>./</video>)"
 
     async def process_mm_data_async(
         self,
@@ -36,6 +37,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(
                 image_token=self.image_token,
+                video_token=self.video_token,
                 audio_token=self.audio_token,
             ),
         )
@@ -113,7 +115,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         if len(pixel_values) != 0:
             item = MultimodalDataItem(
                 pixel_values=pixel_values,
-
+                offsets=image_offsets,
                 tgt_size=tgt_sizes_flat,
                 modality=Modality.IMAGE,
             )
@@ -135,11 +137,10 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             item = MultimodalDataItem(
                 audio_features=[res["audio_features"]],
                 audio_feature_lens=res["audio_feature_lens"],
-
+                offsets=audio_offsets,
                 modality=Modality.AUDIO,
             )
             items += [item]
-
         return {
             "mm_items": items,
             "input_ids": input_ids.tolist(),
--- a/sglang/srt/multimodal/processors/phi4mm.py
+++ b/sglang/srt/multimodal/processors/phi4mm.py
@@ -65,7 +65,7 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
                 pixel_values=res["input_image_embeds"],
                 image_sizes=res["image_sizes"],
                 image_emb_mask=res["image_attention_mask"],
-
+                offsets=image_offsets,
                 modality=Modality.IMAGE,
             )
         ]
```