sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -8
- sglang/bench_one_batch.py +7 -6
- sglang/bench_one_batch_server.py +157 -21
- sglang/bench_serving.py +137 -59
- sglang/compile_deep_gemm.py +5 -5
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +40 -28
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +49 -44
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +129 -135
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +3 -13
- sglang/srt/disaggregation/kv_events.py +357 -0
- sglang/srt/disaggregation/mini_lb.py +57 -24
- sglang/srt/disaggregation/mooncake/conn.py +238 -122
- sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
- sglang/srt/disaggregation/nixl/conn.py +10 -19
- sglang/srt/disaggregation/prefill.py +132 -47
- sglang/srt/disaggregation/utils.py +123 -6
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +5 -0
- sglang/srt/entrypoints/engine.py +44 -9
- sglang/srt/entrypoints/http_server.py +23 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +250 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +157 -0
- sglang/srt/function_call/ebnf_composer.py +234 -0
- sglang/srt/function_call/function_call_parser.py +175 -0
- sglang/srt/function_call/llama32_detector.py +74 -0
- sglang/srt/function_call/mistral_detector.py +84 -0
- sglang/srt/function_call/pythonic_detector.py +163 -0
- sglang/srt/function_call/qwen25_detector.py +67 -0
- sglang/srt/function_call/utils.py +35 -0
- sglang/srt/hf_transformers_utils.py +46 -7
- sglang/srt/layers/attention/aiter_backend.py +513 -0
- sglang/srt/layers/attention/flashattention_backend.py +64 -18
- sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/triton_backend.py +3 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +6 -4
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +451 -0
- sglang/srt/layers/dp_attention.py +61 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/cutlass_moe.py +207 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
- sglang/srt/layers/moe/ep_moe/layer.py +105 -51
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
- sglang/srt/layers/moe/topk.py +67 -10
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +8 -3
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +77 -74
- sglang/srt/layers/quantization/fp8.py +92 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/fp8_utils.py +6 -0
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +20 -7
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +2 -4
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/deepseek_eplb.py +278 -0
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/eplb_manager.py +55 -0
- sglang/srt/managers/expert_distribution.py +704 -56
- sglang/srt/managers/expert_location.py +394 -0
- sglang/srt/managers/expert_location_dispatch.py +91 -0
- sglang/srt/managers/io_struct.py +19 -4
- sglang/srt/managers/mm_utils.py +294 -140
- sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
- sglang/srt/managers/multimodal_processors/internvl.py +14 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
- sglang/srt/managers/schedule_batch.py +122 -42
- sglang/srt/managers/schedule_policy.py +1 -5
- sglang/srt/managers/scheduler.py +205 -138
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +232 -58
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +76 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +314 -39
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +29 -19
- sglang/srt/model_executor/expert_location_updater.py +422 -0
- sglang/srt/model_executor/forward_batch_info.py +5 -1
- sglang/srt/model_executor/model_runner.py +163 -68
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +308 -351
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_mm.py +70 -33
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +15 -8
- sglang/srt/models/llava.py +258 -7
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +5 -12
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2.py +95 -26
- sglang/srt/models/qwen2_5_vl.py +8 -0
- sglang/srt/models/qwen2_moe.py +330 -60
- sglang/srt/models/qwen2_vl.py +6 -0
- sglang/srt/models/qwen3.py +52 -10
- sglang/srt/models/qwen3_moe.py +411 -48
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +58 -20
- sglang/srt/openai_api/protocol.py +6 -8
- sglang/srt/operations.py +154 -0
- sglang/srt/operations_strategy.py +31 -0
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +162 -22
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +138 -7
- sglang/srt/speculative/eagle_worker.py +69 -21
- sglang/srt/utils.py +74 -17
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +55 -14
- sglang/utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/base_processor.py

@@ -3,16 +3,16 @@ import concurrent.futures
 import dataclasses
 import multiprocessing as mp
 import os
+import re
 from abc import ABC, abstractmethod
-from typing import List, Optional
+from typing import List, Optional, Tuple, Union

 import numpy as np
-import PIL
 import torch
 from PIL import Image
 from transformers import BaseImageProcessorFast

-from sglang.srt.managers.schedule_batch import Modality
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.utils import encode_video, load_audio, load_image


@@ -22,13 +22,13 @@ class BaseMultiModalProcessorOutput:
     input_text: str

     # frames loaded from image and video, in given order
-    images: Optional[list[
+    images: Optional[list[Union[Image.Image, MultimodalDataItem]]] = None

     # audios
-    audios: Optional[list[np.ndarray]] = None
+    audios: Optional[list[Union[np.ndarray, MultimodalDataItem]]] = None

     def normalize(self):
-        for field_name in ["
+        for field_name in ["images", "audios"]:
             field = getattr(self, field_name, None)
             if field is not None and isinstance(field, list) and len(field) == 0:
                 setattr(self, field_name, None)
@@ -36,16 +36,48 @@ class BaseMultiModalProcessorOutput:

 @dataclasses.dataclass
 class MultimodalSpecialTokens:
-    image_token: Optional[str] = None
-    video_token: Optional[str] = None
-    audio_token: Optional[str] = None
-
-    def
-
-    token
-
-
+    image_token: Optional[Union[int, str, List[str]]] = None
+    video_token: Optional[Union[int, str, List[str]]] = None
+    audio_token: Optional[Union[int, str, List[str]]] = None
+
+    def convert_to_str(self, token: Union[str, int], processor) -> str:
+        if token is None:
+            return token
+        if isinstance(token, str):
+            return token
+        return processor.tokenizer.convert_ids_to_tokens([token])[0]
+
+    def convert_to_strs(self, processor):
+        self.image_token = self.convert_to_str(self.image_token, processor)
+        self.video_token = self.convert_to_str(self.video_token, processor)
+        self.audio_token = self.convert_to_str(self.audio_token, processor)
+
+    image_token_regex: Optional[re.Pattern] = None
+    video_token_regex: Optional[re.Pattern] = None
+    audio_token_regex: Optional[re.Pattern] = None
+
+    def __post_init__(self):
+        if self.image_token_regex is None and self.image_token is not None:
+            self.image_token_regex = re.compile(re.escape(self.image_token))
+        if self.video_token_regex is None and self.video_token is not None:
+            self.video_token_regex = re.compile(re.escape(self.video_token))
+        if self.audio_token_regex is None and self.audio_token is not None:
+            self.audio_token_regex = re.compile(re.escape(self.audio_token))
+
+    def collect(self) -> re.Pattern:
+        tokens = [
+            self.image_token_regex,
+            self.video_token_regex,
+            self.audio_token_regex,
         ]
+        patterns = []
+        flags = 0
+        for t in tokens:
+            if t is not None:
+                patterns.append(t.pattern)
+                flags |= t.flags
+        combined = "(" + "|".join(f"(?:{p})" for p in patterns) + ")"
+        return re.compile(combined, flags)


 class BaseMultimodalProcessor(ABC):
@@ -54,6 +86,7 @@ class BaseMultimodalProcessor(ABC):
     def __init__(self, hf_config, server_args, _processor):
         self.hf_config = hf_config
         self._processor = _processor
+        self.arch = hf_config.architectures[0]
         self.server_args = server_args
         # FIXME: not accurate, model and image specific
         self.NUM_TOKEN_PER_FRAME = 330
@@ -136,6 +169,10 @@ class BaseMultimodalProcessor(ABC):
         data, is_video, is_audio, frame_count_limit=None, discard_alpha_channel=True
     ):
         """Static method that can be pickled for multiprocessing"""
+        if isinstance(data, dict):
+            return MultimodalDataItem.from_dict(data)
+        if isinstance(data, MultimodalDataItem):
+            return data
         try:
             if is_audio:
                 return load_audio(data)
@@ -175,7 +212,10 @@ class BaseMultimodalProcessor(ABC):
         image_index, audio_index = 0, 0

         for text_part in text_parts:
-            if
+            if (
+                multimodal_tokens.image_token_regex
+                and multimodal_tokens.image_token_regex.match(text_part)
+            ):
                 data = image_data[image_index]
                 is_video = isinstance(data, str) and data.startswith("video:")
                 estimated_frames = estimated_frames_list[image_index]
@@ -192,7 +232,10 @@ class BaseMultimodalProcessor(ABC):
                 )
                 task_info.append((Modality.IMAGE, data, frame_count_limit))
                 image_index += 1
-            elif
+            elif (
+                multimodal_tokens.audio_token_regex
+                and multimodal_tokens.audio_token_regex.match(text_part)
+            ):
                 data = audio_data[audio_index]
                 futures.append(
                     self.io_executor.submit(
@@ -228,17 +271,13 @@ class BaseMultimodalProcessor(ABC):
             discard_alpha_channel: if True, discards the alpha channel in the returned images

         """
-
+        if not return_text:
+            raise NotImplementedError()
         if image_data is None:
             image_data = []
-
-
-
-                    multimodal_tokens.image_token
-                )
-            )
-        else:
-            multimodal_tokens.image_token = multimodal_tokens.image_token
+
+        multimodal_tokens.convert_to_strs(self._processor)
+        multimodal_tokens_pattern = multimodal_tokens.collect()

         if isinstance(prompt, list) and return_text:
             assert len(prompt) and isinstance(prompt[0], int)
@@ -247,16 +286,8 @@ class BaseMultimodalProcessor(ABC):
             prompt = prompt

         assert isinstance(prompt, str)
-
-
-
-        pattern = (
-            "("
-            + "|".join(re.escape(sep) for sep in multimodal_tokens.collect())
-            + ")"
-        )
-        # split text into list of normal text and special tokens
-        text_parts = re.split(pattern, prompt)
+        # split text into list of normal text and special tokens
+        text_parts = re.split(multimodal_tokens_pattern, prompt)

         futures, task_info = self.submit_data_loading_tasks(
             text_parts=text_parts,
@@ -266,34 +297,88 @@ class BaseMultimodalProcessor(ABC):
             discard_alpha_channel=discard_alpha_channel,
         )
         # Process results
-
+        images, audios = [], []
         new_text = ""
         task_ptr = 0

         for text_part in text_parts:
-            if
+            if multimodal_tokens_pattern.match(text_part):
                 task_type, data, frame_limit = task_info[task_ptr]
                 result = futures[task_ptr].result()
                 task_ptr += 1

                 if task_type == Modality.IMAGE:
+                    # If data is already processed it will be a
+                    # dictionary. In this case we want to keep the
+                    # expanded tokens in text_part. Otherwise, we will
+                    # call the processor code, so keep only a single image
+                    # token.
+                    mm_tokens = (
+                        text_part
+                        if isinstance(data, dict)
+                        else multimodal_tokens.image_token
+                    )
                     frames = [result] if not isinstance(result, list) else result
                     if frames:
-                        image_sizes += frames[0].size * len(frames)
                         images += frames
-                        new_text +=
+                        new_text += mm_tokens * len(frames)
                 elif task_type == Modality.AUDIO:
                     # audio
+                    mm_tokens = (
+                        text_part
+                        if isinstance(data, dict)
+                        else multimodal_tokens.audio_token
+                    )
                     audios.append(result)
-                    new_text +=
+                    new_text += mm_tokens
                 # TODO: handle video
             else:
                 new_text += text_part

         out = BaseMultiModalProcessorOutput(
+            input_text=new_text,
             images=images,
             audios=audios,
-            input_text=new_text,
         )
         out.normalize()
         return out
+
+    @staticmethod
+    def get_mm_items_offset(
+        input_ids: torch.Tensor, mm_token_id: int
+    ) -> List[Tuple[int, int]]:
+        """
+        Get a set of range for mm_items from input_ids
+        Example:
+            input_ids = [1, 2, 3, 3, 3, 4, 3, 3]
+            mm_token_id = 3
+            return result = [(2,4),(6,7)]
+        """
+        mask = input_ids == mm_token_id
+
+        start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
+        end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
+
+        return list(zip(start_positions.tolist(), end_positions.tolist()))
+
+    @staticmethod
+    def get_mm_items_offset_by_pair(
+        input_ids: torch.Tensor, mm_start_id: int, mm_end_id: int
+    ) -> List[Tuple[int, int]]:
+        indices_start = (input_ids == mm_start_id).nonzero(as_tuple=True)[0] + 1
+        indices_end = (input_ids == mm_end_id).nonzero(as_tuple=True)[0] - 1
+
+        return list(zip(indices_start.tolist(), indices_end.tolist()))
+
+    def mm_inputs_are_preprocessed(self, mm_inputs: Optional[list]):
+        """Returns true if all images are preprocessed, false if all are not, and error otherwise."""
+        if not mm_inputs:
+            return True
+        ret = any(isinstance(mm_input, MultimodalDataItem) for mm_input in mm_inputs)
+        if ret and not all(
+            isinstance(mm_input, MultimodalDataItem) for mm_input in mm_inputs
+        ):
+            raise ValueError(
+                "Unsupported: mixture of multimodal inputs where some but not all are preprocessed."
+            )
+        return ret
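The `MultimodalSpecialTokens` rework above boils down to one compiled pattern per prompt: each configured token (or its pre-compiled regex) is escaped, OR-ed together inside a single capturing group, and `load_mm_data` splits the prompt with it so the special tokens survive as their own list entries. A minimal standalone sketch of that mechanism, using made-up `<image>`/`<audio>` placeholders rather than the sglang classes themselves:

```python
import re

# Hypothetical placeholder tokens standing in for MultimodalSpecialTokens fields.
image_token = "<image>"
audio_token = "<audio>"

# collect(): escape each token, wrap it in a non-capturing group, and join the
# alternatives inside ONE capturing group so re.split keeps the separators.
pattern = re.compile(
    "(" + "|".join(f"(?:{re.escape(t)})" for t in (image_token, audio_token)) + ")"
)

prompt = "Describe <image> then transcribe <audio>."
text_parts = re.split(pattern, prompt)
# ['Describe ', '<image>', ' then transcribe ', '<audio>', '.']

# load_mm_data then walks text_parts: parts matching the pattern are routed to
# the image/audio loaders, everything else is copied into the rebuilt prompt.
assert pattern.match(text_parts[1]) and not pattern.match(text_parts[0])
```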
sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py

@@ -70,8 +70,13 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         batched_images_spatial_crop = torch.stack(batched_images_spatial_crop, dim=0)

         items = []
+        input_ids = res["input_ids"]
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids, mm_token_id=self._processor.image_token_id
+        )
         item = MultimodalDataItem(
             pixel_values=res["images"],
+            image_offsets=image_offsets,
             modality=Modality.IMAGE,
             image_emb_mask=images_seq_mask,
             image_spatial_crop=batched_images_spatial_crop,
@@ -80,6 +85,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):

         return {
             "mm_items": items,
-            "input_ids":
+            "input_ids": input_ids.tolist(),
             "im_token_id": self._processor.image_token_id,
         }
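The `image_offsets` computed here (and in the processors below) come from the new `get_mm_items_offset` helper in `base_processor.py`, which locates each contiguous run of the image-token id with `torch.roll`. A small self-contained sketch of that computation, reproducing the docstring's example (like the original, it assumes the sequence does not both begin and end with the token id, since `torch.roll` wraps around):

```python
import torch

def mm_items_offset(input_ids: torch.Tensor, mm_token_id: int):
    # Each contiguous run of mm_token_id marks one multimodal item; return the
    # inclusive (start, end) index pairs, mirroring the diff's roll-based trick.
    mask = input_ids == mm_token_id
    starts = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
    ends = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
    return list(zip(starts.tolist(), ends.tolist()))

# The example from the new docstring: token id 3 appears in runs [2..4] and [6..7].
ids = torch.tensor([1, 2, 3, 3, 3, 4, 3, 3])
print(mm_items_offset(ids, mm_token_id=3))  # [(2, 4), (6, 7)]
```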
sglang/srt/managers/multimodal_processors/gemma3.py

@@ -1,4 +1,5 @@
-
+import re
+from typing import Dict, List, Union

 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -18,13 +19,18 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        # The single, pre-expanded image token.
         self.IMAGE_TOKEN = "<start_of_image>"
+        # The regex that matches expanded image tokens.
+        self.IMAGE_TOKEN_REGEX = re.compile(
+            r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+        )
         self.IM_START_TOKEN_ID = hf_config.boi_token_index
         self.IM_END_TOKEN_ID = hf_config.eoi_token_index

     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes]],
+        image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
         max_req_input_len,
@@ -37,29 +43,48 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
             image_data = [image_data]

         image_token = self.IMAGE_TOKEN
+        image_token_regex = self.IMAGE_TOKEN_REGEX
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=image_token, image_token_regex=image_token_regex
+            ),
             max_req_input_len=max_req_input_len,
             discard_alpha_channel=True,
         )

+        images_are_preprocessed = self.mm_inputs_are_preprocessed(base_output.images)
         ret = self.process_mm_data(
-            input_text=base_output.input_text,
+            input_text=base_output.input_text,
+            images=None if images_are_preprocessed else base_output.images,
         )

         items = []
+        input_ids = ret["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids,
+            mm_token_id=self.hf_config.image_token_index,
+        )
         for i, image in enumerate(base_output.images):
+            if images_are_preprocessed:
+                pixel_values = image.pixel_values
+                precomputed_features = image.precomputed_features
+            else:
+                pixel_values = ret["pixel_values"][i]
+                precomputed_features = None
+
             item = MultimodalDataItem(
-                pixel_values=
+                pixel_values=pixel_values,
+                precomputed_features=precomputed_features,
                 modality=Modality.IMAGE,
+                image_offsets=image_offsets[i],
             )
             items += [item]

         return {
             "mm_items": items,
-            "input_ids":
+            "input_ids": input_ids.tolist(),
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }
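Worth noting about the Gemma3 change above: `IMAGE_TOKEN_REGEX` matches both the bare `<start_of_image>` placeholder and an already-expanded `<start_of_image>…<image_soft_token>…<end_of_image>` run, which is what lets preprocessed inputs keep their expanded tokens while raw prompts are still collapsed to a single image token. A quick check of that behavior (pattern copied from the hunk; the soft-token count below is arbitrary):

```python
import re

IMAGE_TOKEN_REGEX = re.compile(
    r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
)

# Raw prompt: just the placeholder.
assert IMAGE_TOKEN_REGEX.match("<start_of_image>")

# Pre-expanded prompt: the full soft-token run also matches, so load_mm_data
# keeps the expanded text_part verbatim instead of re-inserting one token.
expanded = "<start_of_image>" + "<image_soft_token>" * 4 + "<end_of_image>"
assert IMAGE_TOKEN_REGEX.match(expanded)
```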
sglang/srt/managers/multimodal_processors/internvl.py

@@ -3,7 +3,6 @@
 import numpy as np
 import torch
 from decord import VideoReader, cpu
-from numpy.distutils.cpuinfo import cpu
 from PIL import Image

 from sglang.srt.managers.multimodal_processors.base_processor import (
@@ -210,7 +209,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             return None

         pixel_values = torch.cat(pixel_values, dim=0)
-        items = [MultimodalDataItem(pixel_values=pixel_values, modality=Modality.IMAGE)]

         for idx, num_patches in enumerate(num_patches_list):
             image_tokens = (
@@ -221,10 +219,21 @@
             input_text = input_text.replace("<image>", image_tokens, 1)

         tokenizer = self._processor
+        input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids,
+            mm_token_id=self.img_context_token_id,
+        )
+        items = [
+            MultimodalDataItem(
+                pixel_values=pixel_values,
+                modality=Modality.IMAGE,
+                image_offsets=image_offsets,
+            )
+        ]
+
         return {
-            "input_ids":
-            .flatten()
-            .tolist(),
+            "input_ids": input_ids.tolist(),
             "mm_items": items,
             "im_start_id": self.img_start_token_id,
             "im_end_id": self.img_end_token_id,
sglang/srt/managers/multimodal_processors/janus_pro.py

@@ -45,15 +45,21 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
             prompt=base_out.input_text,
             images=images,
         )
+
+        input_ids = res["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids, mm_token_id=processor.image_id
+        )
         return {
             "mm_items": [
                 MultimodalDataItem(
                     pixel_values=res["pixel_values"],
                     image_emb_mask=res["images_emb_mask"],
+                    image_offsets=image_offsets,
                     modality=Modality.IMAGE,
                 )
             ],
-            "input_ids":
+            "input_ids": input_ids.tolist(),
             "im_start_id": processor.image_start_id,
             "im_end_id": processor.image_end_id,
             "im_token_id": processor.image_id,
sglang/srt/managers/multimodal_processors/kimi_vl.py

@@ -1,10 +1,5 @@
-import asyncio
-import math
 from typing import List, Union

-import torch
-from PIL import Image
-
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
@@ -57,13 +52,19 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
             input_text=base_output.input_text,
             images=base_output.images,
         )
+        input_ids = ret["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids,
+            mm_token_id=self.im_token_id,
+        )
         return {
-            "input_ids":
+            "input_ids": input_ids.tolist(),
             "mm_items": [
                 MultimodalDataItem(
                     pixel_values=ret["pixel_values"],
                     image_grid_thws=ret["image_grid_hws"],
                     modality=Modality.IMAGE,
+                    image_offsets=image_offsets,
                 )
             ],
             "im_token_id": self.im_token_id,
sglang/srt/managers/multimodal_processors/llava.py

@@ -2,18 +2,24 @@ import asyncio
 from typing import List, Optional, Union

 import numpy as np
+from transformers.models.auto.processing_auto import (
+    PROCESSOR_MAPPING_NAMES as HF_MAPPING_NAMES,
+)

+import sglang.srt.managers.multimodal_processor as sgl_mm_processor_utils
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
 )
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.mm_utils import expand2square, process_anyres_image
 from sglang.srt.models.llava import (
+    LlavaForConditionalGeneration,
     LlavaLlamaForCausalLM,
     LlavaMistralForCausalLM,
     LlavaQwenForCausalLM,
 )
 from sglang.srt.models.llavavid import LlavaVidForCausalLM
+from sglang.srt.models.mistral import Mistral3ForConditionalGeneration
 from sglang.srt.utils import load_image, logger
 from sglang.utils import get_exception_traceback

@@ -133,6 +139,7 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
                     img_data, aspect_ratio, grid_pinpoints
                 )
             )
+
         res = await asyncio.gather(*res)
         for pixel_v, image_h, image_s in res:
             pixel_values.append(pixel_v)
@@ -165,3 +172,42 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
                 )
             ],
         }
+
+
+class LlavaMultimodalProcessor(BaseMultimodalProcessor):
+    """
+    This is a wrapper class used to identify the multimodal processor for Llava architectures' vision model.
+    """
+
+    models = [LlavaForConditionalGeneration, Mistral3ForConditionalGeneration]
+
+    def _get_sgl_processor_cls(self, model_type: str):
+        if hf_name := HF_MAPPING_NAMES.get(model_type):
+            sgl_mm_processor_set = sgl_mm_processor_utils.PROCESSOR_MAPPING.values()
+            sgl_processor_cls = list(
+                filter(lambda p: p.__name__ == hf_name, sgl_mm_processor_set)
+            )
+            if sgl_processor_cls:
+                return sgl_processor_cls[0]
+        raise ValueError(
+            f"Cannot find corresponding multimodal processor registered in sglang for model type `{model_type}`"
+        )
+
+    def __init__(self, hf_config, server_args, _processor):
+        assert hasattr(hf_config, "vision_config")
+        assert hasattr(hf_config, "text_config")
+        self.vision_config = hf_config.vision_config
+        self.text_config = hf_config.text_config
+        self.hf_config = hf_config
+
+        if vision_type := getattr(self.vision_config, "model_type"):
+            self.inner = self._get_sgl_processor_cls(vision_type)(
+                hf_config, server_args, _processor
+            )
+        else:
+            raise ValueError(
+                f"Required `vision_config.model_type` is not found in hf_config: `{hf_config}`"
+            )
+
+    async def process_mm_data_async(self, *args, **kwargs):
+        return await self.inner.process_mm_data_async(*args, **kwargs)
sglang/srt/managers/multimodal_processors/minicpm.py

@@ -1,7 +1,6 @@
 from typing import List, Union

 import torch
-from transformers import BaseImageProcessorFast

 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
@@ -21,33 +20,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         self.image_token = "(<image>./</image>)"
         self.audio_token = "(<audio>./</audio>)"

-    def process_data_task(self, input_text, images=None, audios=None):
-
-        if isinstance(images, list) and len(images) == 0:
-            images = None
-        if isinstance(audios, list) and len(audios) == 0:
-            audios = None
-        processor = self._processor
-        args = {}
-        if isinstance(processor, BaseImageProcessorFast):
-            args["device"] = "cuda"
-        result = self._processor.__call__(
-            text=input_text,
-            images=images,
-            audios=audios,
-            return_tensors="pt",
-            chunk_input=True,
-            **args,
-        )
-        return {
-            "input_ids": result.input_ids,
-            "pixel_values": getattr(result, "pixel_values", None),
-            "tgt_sizes": getattr(result, "tgt_sizes", None),
-            "audio_features": getattr(result, "audio_features", None),
-            "audio_feature_lens": getattr(result, "audio_feature_lens", None),
-            "audio_bounds": getattr(result, "audio_bounds", None),
-        }
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
@@ -97,6 +69,8 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         audio_start_id = tokenizer.audio_start_id
         audio_end_id = tokenizer.audio_end_id

+        im_start_id = tokenizer.im_start_id
+        im_end_id = tokenizer.im_end_id
         im_token_id = tokenizer.unk_id
         pixel_values = res["pixel_values"]
         tgt_sizes = res["tgt_sizes"]
@@ -132,9 +106,20 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             pixel_values = pixel_values_flat

         items = []
+        input_ids = res["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset_by_pair(
+            input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+        )
+        slice_offsets = self.get_mm_items_offset_by_pair(
+            input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
+        )
+        image_offsets.extend(slice_offsets)
+        image_offsets = sorted(image_offsets)
+
         if len(pixel_values) != 0:
             item = MultimodalDataItem(
                 pixel_values=pixel_values,
+                image_offsets=image_offsets,
                 tgt_size=tgt_sizes_flat,
                 modality=Modality.IMAGE,
             )
@@ -145,21 +130,30 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             and res["audio_features"] is not None
             and len(res["audio_features"]) != 0
         ):
+            if audio_start_id is not None and audio_end_id is not None:
+                audio_offsets = self.get_mm_items_offset_by_pair(
+                    input_ids=input_ids,
+                    mm_start_id=audio_start_id,
+                    mm_end_id=audio_end_id,
+                )
+            else:
+                audio_offsets = None
             item = MultimodalDataItem(
                 audio_features=[res["audio_features"]],
                 audio_feature_lens=res["audio_feature_lens"],
+                audio_offsets=audio_offsets,
                 modality=Modality.AUDIO,
             )
             items += [item]

         return {
             "mm_items": items,
-            "input_ids":
+            "input_ids": input_ids.tolist(),
             "audio_start_id": audio_start_id,
             "audio_end_id": audio_end_id,
             "im_token_id": im_token_id,
-            "im_start_id":
-            "im_end_id":
+            "im_start_id": im_start_id,
+            "im_end_id": im_end_id,
             "slice_start_id": slice_start_id,
             "slice_end_id": slice_end_id,
         }
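The MiniCPM hunks above rely on the paired variant, `get_mm_items_offset_by_pair`, because its image, slice, and audio content sits between explicit start/end marker ids rather than forming a run of one token id. A minimal sketch of that indexing, with hypothetical ids (7 = start marker, 8 = end marker, 5 = content tokens):

```python
import torch

def mm_items_offset_by_pair(input_ids, mm_start_id, mm_end_id):
    # Content lies strictly between each start/end marker, so shift the marker
    # indices inward by one to get inclusive (start, end) pairs, as in the diff.
    starts = (input_ids == mm_start_id).nonzero(as_tuple=True)[0] + 1
    ends = (input_ids == mm_end_id).nonzero(as_tuple=True)[0] - 1
    return list(zip(starts.tolist(), ends.tolist()))

ids = torch.tensor([1, 7, 5, 5, 8, 2, 7, 5, 8])
print(mm_items_offset_by_pair(ids, mm_start_id=7, mm_end_id=8))  # [(2, 3), (7, 7)]
```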
sglang/srt/managers/multimodal_processors/mllama4.py

@@ -135,11 +135,17 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         processor_output["im_end_id"] = self.eoi_token_index
         processor_output["im_token_id"] = self.image_token_index

+        image_offsets = self.get_mm_items_offset(
+            input_ids=torch.tensor(processor_output["input_ids"]),
+            mm_token_id=self.image_token_index,
+        )
+
         # Add metadata for image processing
         processor_output["mm_items"] = [
             MultimodalDataItem(
                 pixel_values=processor_output["pixel_values"],
                 modality=Modality.IMAGE,
+                image_offsets=image_offsets,
             )
         ]