sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry. A sketch for reproducing a comparison like this locally follows the file list below.
- sglang/bench_one_batch_server.py +17 -2
- sglang/bench_serving.py +170 -24
- sglang/srt/configs/internvl.py +4 -2
- sglang/srt/configs/janus_pro.py +1 -1
- sglang/srt/configs/model_config.py +60 -1
- sglang/srt/configs/update_config.py +119 -0
- sglang/srt/conversation.py +69 -1
- sglang/srt/disaggregation/decode.py +21 -5
- sglang/srt/disaggregation/mooncake/conn.py +35 -4
- sglang/srt/disaggregation/nixl/conn.py +6 -6
- sglang/srt/disaggregation/prefill.py +2 -2
- sglang/srt/disaggregation/utils.py +1 -1
- sglang/srt/distributed/parallel_state.py +44 -17
- sglang/srt/entrypoints/EngineBase.py +8 -0
- sglang/srt/entrypoints/engine.py +40 -6
- sglang/srt/entrypoints/http_server.py +111 -24
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +4 -2
- sglang/srt/eplb/__init__.py +0 -0
- sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
- sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
- sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
- sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
- sglang/srt/{managers → eplb}/expert_location.py +1 -1
- sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
- sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/activation.py +2 -2
- sglang/srt/layers/amx_utils.py +86 -0
- sglang/srt/layers/attention/ascend_backend.py +219 -0
- sglang/srt/layers/attention/flashattention_backend.py +32 -9
- sglang/srt/layers/attention/tbo_backend.py +37 -9
- sglang/srt/layers/communicator.py +20 -2
- sglang/srt/layers/dp_attention.py +9 -3
- sglang/srt/layers/elementwise.py +76 -12
- sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
- sglang/srt/layers/layernorm.py +26 -0
- sglang/srt/layers/linear.py +84 -14
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
- sglang/srt/layers/moe/ep_moe/layer.py +176 -15
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/moe/router.py +60 -22
- sglang/srt/layers/moe/topk.py +10 -28
- sglang/srt/layers/parameter.py +67 -7
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
- sglang/srt/layers/quantization/fp8.py +72 -7
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -2
- sglang/srt/layers/quantization/gptq.py +5 -1
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -1
- sglang/srt/layers/quantization/quant_utils.py +166 -0
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/quantization/w8a8_int8.py +52 -1
- sglang/srt/layers/rotary_embedding.py +2 -2
- sglang/srt/layers/vocab_parallel_embedding.py +20 -10
- sglang/srt/lora/lora.py +4 -5
- sglang/srt/lora/lora_manager.py +73 -20
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/configure_logging.py +1 -1
- sglang/srt/managers/io_struct.py +58 -14
- sglang/srt/managers/mm_utils.py +77 -61
- sglang/srt/managers/multimodal_processor.py +2 -6
- sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
- sglang/srt/managers/schedule_batch.py +78 -85
- sglang/srt/managers/scheduler.py +130 -64
- sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
- sglang/srt/managers/session_controller.py +12 -3
- sglang/srt/managers/tokenizer_manager.py +314 -103
- sglang/srt/managers/tp_worker.py +13 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
- sglang/srt/mem_cache/allocator.py +290 -0
- sglang/srt/mem_cache/chunk_cache.py +34 -2
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +402 -66
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/multimodal_cache.py +3 -0
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/model_executor/cuda_graph_runner.py +2 -1
- sglang/srt/model_executor/forward_batch_info.py +17 -4
- sglang/srt/model_executor/model_runner.py +297 -56
- sglang/srt/model_loader/loader.py +41 -0
- sglang/srt/model_loader/weight_utils.py +72 -4
- sglang/srt/models/deepseek_nextn.py +1 -3
- sglang/srt/models/deepseek_v2.py +195 -45
- sglang/srt/models/deepseek_vl2.py +3 -5
- sglang/srt/models/gemma3_causal.py +1 -2
- sglang/srt/models/gemma3n_causal.py +4 -3
- sglang/srt/models/gemma3n_mm.py +4 -20
- sglang/srt/models/hunyuan.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -2
- sglang/srt/models/llama.py +10 -4
- sglang/srt/models/llama4.py +32 -45
- sglang/srt/models/llama_eagle3.py +61 -11
- sglang/srt/models/llava.py +5 -5
- sglang/srt/models/minicpmo.py +2 -2
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mllama4.py +402 -89
- sglang/srt/models/phi4mm.py +1 -3
- sglang/srt/models/pixtral.py +3 -7
- sglang/srt/models/qwen2.py +31 -3
- sglang/srt/models/qwen2_5_vl.py +1 -3
- sglang/srt/models/qwen2_audio.py +200 -0
- sglang/srt/models/qwen2_moe.py +32 -6
- sglang/srt/models/qwen2_vl.py +1 -4
- sglang/srt/models/qwen3.py +94 -25
- sglang/srt/models/qwen3_moe.py +68 -21
- sglang/srt/models/vila.py +3 -8
- sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
- sglang/srt/operations_strategy.py +6 -2
- sglang/srt/reasoning_parser.py +26 -0
- sglang/srt/sampling/sampling_batch_info.py +39 -1
- sglang/srt/server_args.py +84 -22
- sglang/srt/speculative/build_eagle_tree.py +57 -18
- sglang/srt/speculative/eagle_worker.py +6 -4
- sglang/srt/two_batch_overlap.py +203 -27
- sglang/srt/utils.py +343 -163
- sglang/srt/warmup.py +12 -3
- sglang/test/runners.py +10 -1
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/test/test_utils.py +15 -3
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
- sglang/math_utils.py +0 -8
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
- /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
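The per-file counts above come from comparing the contents of the two wheels. As a rough illustration of how such a summary can be produced (a minimal sketch using only the Python standard library, not the tool the registry diff service actually uses), the snippet below compares the `.py` members of two locally downloaded wheel files. Unlike the listing above it does not detect renames such as the `{managers → eplb}` moves, and the wheel file names in the final comment are hypothetical local paths.

```python
import difflib
import zipfile


def wheel_py_sources(path: str) -> dict:
    """Map each .py member of a wheel (wheels are zip archives) to its lines."""
    with zipfile.ZipFile(path) as wheel:
        return {
            name: wheel.read(name).decode("utf-8", errors="replace").splitlines(keepends=True)
            for name in wheel.namelist()
            if name.endswith(".py")
        }


def summarize(old_wheel: str, new_wheel: str) -> None:
    """Print a per-file +added/-removed summary, similar to the listing above."""
    old, new = wheel_py_sources(old_wheel), wheel_py_sources(new_wheel)
    for name in sorted(set(old) | set(new)):
        diff = difflib.unified_diff(old.get(name, []), new.get(name, []), lineterm="")
        added = removed = 0
        for line in diff:
            if line.startswith("+") and not line.startswith("+++"):
                added += 1
            elif line.startswith("-") and not line.startswith("---"):
                removed += 1
        if added or removed:
            print(f"{name} +{added} -{removed}")


# Hypothetical local file names; download with e.g. `pip download sglang==<version> --no-deps`.
# summarize("sglang-0.4.8.post1-py3-none-any.whl", "sglang-0.4.9.post1-py3-none-any.whl")
```

The expanded hunks that follow are the ones this page shows in full, reconstructed below as unified diffs; they all belong to the multimodal-processor refactor (base_processor.py plus the per-model processors).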
--- a/sglang/srt/managers/multimodal_processors/base_processor.py
+++ b/sglang/srt/multimodal/processors/base_processor.py
@@ -17,15 +17,6 @@ from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.utils import encode_video, load_audio, load_image


-class MultimodalInputFormat(Enum):
-    """Enum for different multimodal input formats."""
-
-    RAW_IMAGES = "raw_images"
-    PRECOMPUTED_FEATURES = "precomputed_features"
-    PIXEL_VALUES = "pixel_values"
-    AUDIO = "audio"
-
-
 @dataclasses.dataclass
 class BaseMultiModalProcessorOutput:
     # input_text, with each frame of video/image represented with a image_token
@@ -98,6 +89,7 @@ class BaseMultimodalProcessor(ABC):
         self._processor = _processor
         self.arch = hf_config.architectures[0]
         self.server_args = server_args
+
         # FIXME: not accurate, model and image specific
         self.NUM_TOKEN_PER_FRAME = 330

@@ -109,18 +101,45 @@ class BaseMultimodalProcessor(ABC):
             max_workers=int(os.environ.get("SGLANG_CPU_WORKERS", os.cpu_count())),
         )

+        # Mapping from attribute names to modality types
+        self.ATTR_NAME_TO_MODALITY = {
+            # Image-related attributes
+            "pixel_values": Modality.IMAGE,
+            "image_sizes": Modality.IMAGE,
+            "image_grid_thw": Modality.IMAGE,
+            "image_emb_mask": Modality.IMAGE,
+            "image_spatial_crop": Modality.IMAGE,
+            "tgt_size": Modality.IMAGE,
+            "image_grid_hws": Modality.IMAGE,
+            "aspect_ratio_id": Modality.IMAGE,
+            "aspect_ratio_mask": Modality.IMAGE,
+            "second_per_grid_ts": Modality.IMAGE,
+            # Audio-related attributes
+            "audio_features": Modality.AUDIO,
+            "audio_feature_lens": Modality.AUDIO,
+            "input_features": Modality.AUDIO,
+            "input_features_mask": Modality.AUDIO,
+            # Video-related attributes
+            "video_grid_thws": Modality.VIDEO,
+            # Generic attributes that could apply to multiple modalities
+            # "precomputed_features" - handled specially as it can be any modality
+        }
+
     def process_mm_data(
         self, input_text, images=None, videos=None, audios=None, **kwargs
     ):
         """
         process multimodal data with transformers AutoProcessor
         """
-        if images
+        if images:
             kwargs["images"] = images
-        if videos
+        if videos:
             kwargs["videos"] = videos
-        if audios
+        if audios:
             kwargs["audios"] = audios
+            if self.__class__.__name__ == "Gemma3nSGLangProcessor":
+                # Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
+                kwargs["audio"] = audios

         processor = self._processor
         if hasattr(processor, "image_processor") and isinstance(
@@ -143,6 +162,7 @@ class BaseMultimodalProcessor(ABC):
     async def process_mm_data_async(
         self,
         image_data,
+        audio_data,
         input_text,
         request_obj,
         max_req_input_len,
@@ -417,175 +437,137 @@ class BaseMultimodalProcessor(ABC):
             values[k] = v
         return values

+    def collect_mm_items_from_processor_output(
+        self, data_dict: dict
+    ) -> List[MultimodalDataItem]:
+        """Create mm_items directly from processor output."""
+        items = {}  # modality -> MultimodalDataItem
+
+        for attr_name, value in data_dict.items():
+            if attr_name == "input_ids":
+                continue
+
+            # Get modality for this attribute
+            modality = self.ATTR_NAME_TO_MODALITY.get(attr_name)
+
+            if not modality and attr_name == "precomputed_features":
+                modality_str = data_dict.get("modality")
+                try:
+                    modality = (
+                        Modality.from_str(modality_str)
+                        if modality_str
+                        else Modality.IMAGE
+                    )
+                except ValueError:
+                    modality = Modality.IMAGE
+
+            if modality:
+                # Create item if needed
+                if modality not in items:
+                    items[modality] = MultimodalDataItem(modality=modality)
+
+                # Set attribute
+                if hasattr(items[modality], attr_name):
+                    setattr(items[modality], attr_name, value)
+
+        return list(items.values())
+
+    def _process_and_collect_mm_items(
+        self, input_text: str, images=None, audios=None, videos=None, **kwargs
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor]:
+        """
+        Helper method to process multimodal data and create mm_items in one step.
+
+        Returns:
+            Tuple of (created mm_items, input_ids)
+        """
+        ret = self.process_mm_data(
+            input_text=input_text, images=images, audios=audios, videos=videos, **kwargs
+        )
+
+        input_ids = ret["input_ids"].flatten()
+        collected_items = self.collect_mm_items_from_processor_output(ret)
+
+        return collected_items, input_ids
+
     def process_and_combine_mm_data(
         self, base_output: BaseMultiModalProcessorOutput
-    ) -> Tuple[
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor]:
         """
-        Process multimodal data and return the combined multimodal
-
+        Process multimodal data and return the combined multimodal items and input_ids.
+        Supports mixed modalities (images and audio in the same request).

         Returns:
-            Tuple of (
+            Tuple of (list of mm_items, input_ids)
         """
+        # Collect all items and categorize them
+        all_items = (base_output.images or []) + (base_output.audios or [])

-
-
-
-                input_text,
+        # Handle text-only case
+        if not all_items:
+            input_ids = self._processor.tokenizer(
+                base_output.input_text,
                 return_tensors="pt",
                 add_special_tokens=True,
             ).input_ids.flatten()
+            return [], input_ids
+
+        dict_items, raw_images, raw_audios = [], [], []
+        for item in all_items:
+            if isinstance(item, dict):
+                dict_items.append(item)
+            elif isinstance(item, Image.Image):
+                raw_images.append(item)
+            elif isinstance(item, np.ndarray):
+                raw_audios.append(item)
+            else:
+                raise ValueError(f"Unknown multimodal item type: {type(item)}")

-
-
-
-                has_image = False
-                has_pixel_values = False
-                has_precomputed_features = False
-                has_audio = False
-
-                for mm_input in mm_inputs:
-                    if isinstance(mm_input, Image.Image):
-                        has_image = True
-                    elif isinstance(mm_input, np.ndarray):
-                        has_audio = True
-                    elif isinstance(mm_input, dict):
-                        if mm_input.get("precomputed_features", None) is not None:
-                            has_precomputed_features = True
-                        elif mm_input.get("pixel_values", None) is not None:
-                            has_pixel_values = True
-                        else:
-                            raise ValueError(
-                                f"Invalid multimodal input: {mm_input}, expected dict with pixel_values or precomputed_features"
-                            )
-                    else:
-                        raise ValueError(
-                            f"Invalid multimodal input: {mm_input}, expected Image.Image or dict"
-                        )
+        # Process items and get input_ids
+        all_collected_items = []
+        input_ids = None

-
-
-
-                )
-                if format_count > 1:
-                    raise ValueError(
-                        "Unsupported: mixture of multimodal input formats. "
-                        f"Found formats: image={has_image}, pixel_values={has_pixel_values}, "
-                        f"precomputed_features={has_precomputed_features}, audio={has_audio}"
-                    )
-
-                if has_image:
-                    return MultimodalInputFormat.RAW_IMAGES
-                elif has_precomputed_features:
-                    return MultimodalInputFormat.PRECOMPUTED_FEATURES
-                elif has_pixel_values:
-                    return MultimodalInputFormat.PIXEL_VALUES
-                elif has_audio:
-                    return MultimodalInputFormat.AUDIO
-                else:
-                    raise ValueError("No valid multimodal input format found")
-            except Exception as e:
-                raise ValueError(f"Failed to categorize inputs: {e}")
-
-        def process_raw_images(
-            base_output: BaseMultiModalProcessorOutput,
-        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
-            """Process raw Image.Image objects using transformers processor."""
-            ret = self.process_mm_data(
-                input_text=base_output.input_text,
-                images=base_output.images,
-            )
-            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
-
-            # Copy all fields from processor output except input_ids
-            for key, value in ret.items():
-                if key != "input_ids" and hasattr(combined_mm_item, key):
-                    setattr(combined_mm_item, key, value)
-
-            input_ids = ret["input_ids"].flatten()
-            return combined_mm_item, input_ids
-
-        def process_precomputed_features(
-            base_output: BaseMultiModalProcessorOutput,
-        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
-            """Process inputs with precomputed features."""
-            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
-            combined_mm_item.precomputed_features = self._extract_processor_features(
-                base_output.images, "precomputed_features"
+        # Handle dict items (already processed)
+        for dict_item in dict_items:
+            all_collected_items.extend(
+                self.collect_mm_items_from_processor_output(dict_item)
             )
-
-
-
-
-            base_output: BaseMultiModalProcessorOutput,
-        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
-            """Process inputs with pixel values."""
-            values = self._extract_processor_features_from_all_attributes(
-                base_output.images
-            )
-            combined_mm_item = MultimodalDataItem.from_dict(values)
-            input_ids = tokenize_text(base_output.input_text)
-            return combined_mm_item, input_ids
-
-        def process_audio(
-            base_output: BaseMultiModalProcessorOutput,
-        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
-            """Process inputs with audio."""
-            ret = self.process_mm_data(
+
+        # Handle raw items (need processing)
+        if raw_images or raw_audios:
+            collected_items, input_ids = self._process_and_collect_mm_items(
                 input_text=base_output.input_text,
-
+                images=raw_images,
+                audios=raw_audios,
             )
-
-
-
-
-            input_ids =
-
-
-
-
-
-
-
-
+            all_collected_items.extend(collected_items)
+
+        # Fallback tokenization if no raw items were processed
+        if input_ids is None:
+            input_ids = self._processor.tokenizer(
+                base_output.input_text,
+                return_tensors="pt",
+                add_special_tokens=True,
+            ).input_ids.flatten()
+
+        # Add offsets to all items
+        for mm_item in all_collected_items:
+            if mm_item.modality in [Modality.IMAGE, Modality.MULTI_IMAGES]:
+                mm_item.image_offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.IM_TOKEN_ID,
                 )
-            elif
-
+            elif mm_item.modality == Modality.AUDIO:
+                mm_item.audio_offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.AUDIO_TOKEN_ID,
                 )
-            elif
-
+            elif mm_item.modality == Modality.VIDEO:
+                mm_item.video_offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.VIDEO_TOKEN_ID,
                 )
             else:
-                raise ValueError(f"Unknown modality: {
-            return combined_mm_item
-
-        # Main logic - determine input type and handle text-only case
-        mm_inputs = base_output.images or base_output.audios
-        if not mm_inputs:
-            input_ids = tokenize_text(base_output.input_text)
-            return None, input_ids
-
-        # Categorize input formats
-        input_format = categorize_mm_inputs(mm_inputs)
-
-        # Process based on format
-        if input_format == MultimodalInputFormat.RAW_IMAGES:
-            combined_mm_item, input_ids = process_raw_images(base_output)
-        elif input_format == MultimodalInputFormat.PRECOMPUTED_FEATURES:
-            combined_mm_item, input_ids = process_precomputed_features(base_output)
-        elif input_format == MultimodalInputFormat.PIXEL_VALUES:
-            combined_mm_item, input_ids = process_pixel_values(base_output)
-        elif input_format == MultimodalInputFormat.AUDIO:
-            combined_mm_item, input_ids = process_audio(base_output)
-        else:
-            raise ValueError(f"Unknown input format: {input_format}")
+                raise ValueError(f"Unknown modality: {mm_item.modality}")

-
-        combined_mm_item = finalize_mm_item(combined_mm_item, input_ids)
-        return combined_mm_item, input_ids
+        return all_collected_items, input_ids
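The heart of the base_processor.py change above is the grouping step: instead of classifying an entire request into a single MultimodalInputFormat, the new collect_mm_items_from_processor_output walks the HuggingFace processor output and buckets each tensor into one item per modality via ATTR_NAME_TO_MODALITY. The self-contained sketch below illustrates that pattern with simplified stand-in names (Modality, MMItem, and ATTR_TO_MODALITY here are illustrative placeholders, not the actual sglang classes):

```python
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List


class Modality(Enum):
    IMAGE = "image"
    AUDIO = "audio"
    VIDEO = "video"


@dataclass
class MMItem:  # simplified stand-in for sglang's MultimodalDataItem
    modality: Modality
    attrs: Dict[str, Any] = field(default_factory=dict)


# Mirrors the idea of ATTR_NAME_TO_MODALITY: each processor-output key
# implies which modality's item it belongs to.
ATTR_TO_MODALITY = {
    "pixel_values": Modality.IMAGE,
    "image_grid_thw": Modality.IMAGE,
    "input_features": Modality.AUDIO,
    "video_grid_thws": Modality.VIDEO,
}


def collect_items(processor_output: Dict[str, Any]) -> List[MMItem]:
    """Bucket processor outputs into one item per modality (input_ids excluded)."""
    items: Dict[Modality, MMItem] = {}
    for key, value in processor_output.items():
        if key == "input_ids":
            continue
        modality = ATTR_TO_MODALITY.get(key)
        if modality is None:
            continue  # the real code additionally special-cases "precomputed_features"
        items.setdefault(modality, MMItem(modality)).attrs[key] = value
    return list(items.values())


# A request mixing an image and an audio clip yields two separate items:
print(collect_items({"input_ids": [1, 2], "pixel_values": "<img tensor>", "input_features": "<audio tensor>"}))
```

Because items are keyed by modality rather than by one request-level format, a request that mixes images and audio no longer has to hit the removed "Unsupported: mixture of multimodal input formats" error path.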
--- a/sglang/srt/managers/multimodal_processors/clip.py
+++ b/sglang/srt/multimodal/processors/clip.py
@@ -1,10 +1,8 @@
 from typing import List, Union

-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.clip import CLIPModel
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.utils import load_image


@@ -17,20 +15,11 @@ class ClipImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)

-
-            image_data = [image_data]
-
-        if len(image_data) > 0:
-            images = [load_image(image)[0] for image in image_data]
-        else:
-            images = load_image(image_data[0])[0]
+        images = [load_image(image)[0] for image in image_data]

         image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["data_hashes"] = [hash(str(image_data))]
--- a/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py
+++ b/sglang/srt/multimodal/processors/deepseek_vl_v2.py
@@ -20,12 +20,12 @@ from typing import List, Union

 import torch

-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM


 class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
@@ -44,17 +44,10 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs
     ):
-        if not image_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
             input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=
+            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMAGE_TOKEN),
             max_req_input_len=max_req_input_len,
         )
         res = self.process_mm_data(
--- a/sglang/srt/managers/multimodal_processors/gemma3.py
+++ b/sglang/srt/multimodal/processors/gemma3.py
@@ -4,11 +4,9 @@ from typing import Dict, List, Union
 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens

 # Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py
 # will be removed in the future
@@ -38,11 +36,6 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -53,11 +46,11 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
             discard_alpha_channel=True,
         )

-
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)

         return {
             "input_ids": input_ids.tolist(),
-            "mm_items":
+            "mm_items": mm_items,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }
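The gemma3.py hunks above show the call shape that the other per-model processors in this diff converge on as well: delegate loading to load_mm_data, call the new process_and_combine_mm_data, and return the flattened token ids together with the list of mm_items. Below is a condensed sketch of that shape using a hypothetical subclass; the imports are taken from the hunks above, while the token attributes and returned keys are illustrative and vary per model.

```python
from sglang.srt.managers.multimodal_processor import (
    BaseMultimodalProcessor as SGLangBaseProcessor,
)
from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens


class ExampleImageProcessor(SGLangBaseProcessor):  # hypothetical subclass, for illustration only
    async def process_mm_data_async(
        self, image_data, input_text, request_obj, max_req_input_len, **kwargs
    ):
        # IMAGE_TOKEN / IM_TOKEN_ID are assumed to be defined on the subclass,
        # as in the real processors.
        base_output = self.load_mm_data(
            prompt=input_text,
            image_data=image_data,
            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMAGE_TOKEN),
            max_req_input_len=max_req_input_len,
        )
        # New in 0.4.9: a list of per-modality items plus the flattened token ids.
        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
        return {
            "input_ids": input_ids.tolist(),
            "mm_items": mm_items,
            "im_token_id": self.IM_TOKEN_ID,
        }
```

The exact keys in the returned dict differ per model, for example im_start_id/im_end_id for Gemma3 and im_token_id/audio_token_id for Gemma3n, as the next hunks show.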
--- a/sglang/srt/managers/multimodal_processors/gemma3n.py
+++ b/sglang/srt/multimodal/processors/gemma3n.py
@@ -18,10 +18,8 @@ from typing import Dict, List, Optional, Union
 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
 from sglang.srt.models.gemma3n_mm import Gemma3nForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens


 class Gemma3nSGLangProcessor(SGLangBaseProcessor):
@@ -61,17 +59,6 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
         **kwargs,
     ):
         """Process multimodal data including images and audio."""
-
-        audio_data = request_obj.audio_data
-        if not image_data and not audio_data:
-            return None
-
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
-        if isinstance(audio_data, str):
-            audio_data = [audio_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -85,13 +72,11 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
             ),
         )

-
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)

         return {
             "input_ids": input_ids.tolist(),
-            "mm_items":
-            "
-            "
-            "audio_start_id": self.AUDIO_START_TOKEN_ID,
-            "audio_end_id": self.AUDIO_END_TOKEN_ID,
+            "mm_items": mm_items,
+            "im_token_id": self.IM_TOKEN_ID,
+            "audio_token_id": self.AUDIO_TOKEN_ID,
         }
--- a/sglang/srt/managers/multimodal_processors/internvl.py
+++ b/sglang/srt/multimodal/processors/internvl.py
@@ -5,12 +5,12 @@ import torch
 from decord import VideoReader, cpu
 from PIL import Image

-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.internvl import InternVLChatModel
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.internvl import InternVLChatModel


 class InternVLImageProcessor(BaseMultimodalProcessor):
@@ -172,13 +172,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self, image_data, input_text, request_obj, max_req_input_len, **kwargs
     ):
-        if not image_data:
-            return None
-
-        # Ensure image_data is a list
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
--- a/sglang/srt/managers/multimodal_processors/janus_pro.py
+++ b/sglang/srt/multimodal/processors/janus_pro.py
@@ -1,11 +1,11 @@
 from typing import List, Union

-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM


 class JanusProImageProcessor(BaseMultimodalProcessor):
@@ -22,12 +22,6 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
         max_req_input_len,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
         processor = self._processor

         base_out = self.load_mm_data(
--- a/sglang/srt/managers/multimodal_processors/kimi_vl.py
+++ b/sglang/srt/multimodal/processors/kimi_vl.py
@@ -3,14 +3,12 @@ from typing import Any, Dict, List, Optional, Union

 import torch

-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor as SGLangBaseProcessor,
-)
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens


 # Compatible with KimiVLForConditionalGeneration
@@ -32,11 +30,6 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -46,10 +39,10 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
             max_req_input_len=max_req_input_len,
         )

-
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)

         return {
             "input_ids": input_ids.tolist(),
-            "mm_items":
+            "mm_items": mm_items,
             "im_token_id": self.IM_TOKEN_ID,
         }
--- a/sglang/srt/managers/multimodal_processors/llava.py
+++ b/sglang/srt/multimodal/processors/llava.py
@@ -7,11 +7,7 @@ from transformers.models.auto.processing_auto import (
 )

 import sglang.srt.managers.multimodal_processor as sgl_mm_processor_utils
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.mm_utils import expand2square, process_anyres_image
 from sglang.srt.models.llava import (
     LlavaForConditionalGeneration,
     LlavaLlamaForCausalLM,
@@ -20,6 +16,8 @@ from sglang.srt.models.llava import (
 )
 from sglang.srt.models.llavavid import LlavaVidForCausalLM
 from sglang.srt.models.mistral import Mistral3ForConditionalGeneration
+from sglang.srt.multimodal.mm_utils import expand2square, process_anyres_image
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.utils import load_image, logger
 from sglang.utils import get_exception_traceback

@@ -112,9 +110,6 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
         modalities = request_obj.modalities or ["image"]
         aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
         grid_pinpoints = (
@@ -124,9 +119,6 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
             else None
         )

-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         if isinstance(image_data, list) and len(image_data) > 0:
             if "multi-images" in modalities or "video" in modalities:
                 # Multiple images