sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -1
- sglang/eval/loogle_eval.py +7 -0
- sglang/srt/_custom_ops.py +29 -1
- sglang/srt/configs/deepseekvl2.py +11 -2
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +10 -8
- sglang/srt/configs/update_config.py +3 -1
- sglang/srt/conversation.py +2 -1
- sglang/srt/custom_op.py +5 -2
- sglang/srt/disaggregation/common/conn.py +34 -6
- sglang/srt/disaggregation/decode.py +9 -1
- sglang/srt/disaggregation/mini_lb.py +3 -2
- sglang/srt/disaggregation/mooncake/conn.py +93 -76
- sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
- sglang/srt/disaggregation/nixl/conn.py +17 -13
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
- sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
- sglang/srt/distributed/parallel_state.py +103 -15
- sglang/srt/entrypoints/engine.py +31 -33
- sglang/srt/entrypoints/http_server.py +20 -32
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +48 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -1
- sglang/srt/function_call/base_format_detector.py +74 -12
- sglang/srt/function_call/deepseekv3_detector.py +26 -11
- sglang/srt/function_call/ebnf_composer.py +95 -63
- sglang/srt/function_call/function_call_parser.py +4 -2
- sglang/srt/function_call/kimik2_detector.py +41 -16
- sglang/srt/function_call/llama32_detector.py +6 -3
- sglang/srt/function_call/mistral_detector.py +11 -3
- sglang/srt/function_call/pythonic_detector.py +16 -14
- sglang/srt/function_call/qwen25_detector.py +12 -3
- sglang/srt/function_call/qwen3_coder_detector.py +151 -0
- sglang/srt/hf_transformers_utils.py +0 -1
- sglang/srt/layers/activation.py +24 -3
- sglang/srt/layers/attention/base_attn_backend.py +3 -1
- sglang/srt/layers/attention/flashattention_backend.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +40 -1
- sglang/srt/layers/communicator.py +12 -12
- sglang/srt/layers/dp_attention.py +72 -24
- sglang/srt/layers/linear.py +13 -102
- sglang/srt/layers/logits_processor.py +34 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
- sglang/srt/layers/moe/ep_moe/layer.py +23 -402
- sglang/srt/layers/moe/fused_moe_native.py +7 -47
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
- sglang/srt/layers/moe/topk.py +190 -23
- sglang/srt/layers/quantization/__init__.py +20 -134
- sglang/srt/layers/quantization/awq.py +578 -11
- sglang/srt/layers/quantization/awq_triton.py +339 -0
- sglang/srt/layers/quantization/base_config.py +85 -10
- sglang/srt/layers/quantization/blockwise_int8.py +17 -55
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
- sglang/srt/layers/quantization/fp8.py +273 -62
- sglang/srt/layers/quantization/fp8_kernel.py +210 -46
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gptq.py +501 -143
- sglang/srt/layers/quantization/marlin_utils.py +790 -0
- sglang/srt/layers/quantization/modelopt_quant.py +34 -112
- sglang/srt/layers/quantization/moe_wna16.py +45 -49
- sglang/srt/layers/quantization/petit.py +252 -0
- sglang/srt/layers/quantization/petit_utils.py +104 -0
- sglang/srt/layers/quantization/qoq.py +7 -6
- sglang/srt/layers/quantization/scalar_type.py +352 -0
- sglang/srt/layers/quantization/unquant.py +422 -0
- sglang/srt/layers/quantization/utils.py +340 -9
- sglang/srt/layers/quantization/w4afp8.py +8 -4
- sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
- sglang/srt/layers/quantization/w8a8_int8.py +51 -115
- sglang/srt/layers/radix_attention.py +5 -3
- sglang/srt/layers/vocab_parallel_embedding.py +1 -41
- sglang/srt/lora/lora.py +0 -4
- sglang/srt/lora/lora_manager.py +162 -164
- sglang/srt/lora/lora_registry.py +124 -0
- sglang/srt/lora/mem_pool.py +83 -35
- sglang/srt/lora/utils.py +12 -5
- sglang/srt/managers/cache_controller.py +288 -0
- sglang/srt/managers/io_struct.py +60 -30
- sglang/srt/managers/mm_utils.py +7 -8
- sglang/srt/managers/schedule_batch.py +163 -113
- sglang/srt/managers/schedule_policy.py +68 -27
- sglang/srt/managers/scheduler.py +256 -86
- sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
- sglang/srt/managers/tokenizer_manager.py +38 -27
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/allocator.py +74 -23
- sglang/srt/mem_cache/base_prefix_cache.py +14 -2
- sglang/srt/mem_cache/chunk_cache.py +5 -2
- sglang/srt/mem_cache/hicache_storage.py +168 -0
- sglang/srt/mem_cache/hiradix_cache.py +194 -5
- sglang/srt/mem_cache/memory_pool.py +16 -1
- sglang/srt/mem_cache/memory_pool_host.py +44 -2
- sglang/srt/mem_cache/radix_cache.py +26 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +66 -31
- sglang/srt/model_executor/forward_batch_info.py +210 -25
- sglang/srt/model_executor/model_runner.py +147 -42
- sglang/srt/model_loader/loader.py +7 -1
- sglang/srt/model_loader/utils.py +4 -4
- sglang/srt/models/clip.py +1 -1
- sglang/srt/models/deepseek.py +9 -6
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +192 -173
- sglang/srt/models/deepseek_vl2.py +5 -5
- sglang/srt/models/gemma.py +48 -0
- sglang/srt/models/gemma2.py +52 -0
- sglang/srt/models/gemma3_causal.py +63 -0
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -4
- sglang/srt/models/granitemoe.py +385 -0
- sglang/srt/models/grok.py +9 -3
- sglang/srt/models/hunyuan.py +63 -16
- sglang/srt/models/internvl.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -1
- sglang/srt/models/llama.py +41 -0
- sglang/srt/models/llama4.py +11 -11
- sglang/srt/models/llava.py +2 -2
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +0 -2
- sglang/srt/models/minicpmo.py +3 -7
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mixtral.py +9 -2
- sglang/srt/models/mllama.py +3 -5
- sglang/srt/models/mllama4.py +13 -6
- sglang/srt/models/olmoe.py +8 -5
- sglang/srt/models/persimmon.py +330 -0
- sglang/srt/models/phi.py +321 -0
- sglang/srt/models/phi4mm.py +44 -4
- sglang/srt/models/phi4mm_audio.py +1260 -0
- sglang/srt/models/phi4mm_utils.py +1917 -0
- sglang/srt/models/phimoe.py +9 -3
- sglang/srt/models/qwen.py +37 -0
- sglang/srt/models/qwen2.py +41 -0
- sglang/srt/models/qwen2_5_vl.py +4 -4
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +53 -9
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/qwen3.py +65 -1
- sglang/srt/models/qwen3_moe.py +57 -24
- sglang/srt/models/vila.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +91 -97
- sglang/srt/multimodal/processors/clip.py +21 -19
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
- sglang/srt/multimodal/processors/gemma3.py +13 -17
- sglang/srt/multimodal/processors/gemma3n.py +19 -23
- sglang/srt/multimodal/processors/internvl.py +9 -10
- sglang/srt/multimodal/processors/janus_pro.py +12 -27
- sglang/srt/multimodal/processors/kimi_vl.py +12 -14
- sglang/srt/multimodal/processors/llava.py +4 -2
- sglang/srt/multimodal/processors/minicpm.py +35 -44
- sglang/srt/multimodal/processors/mlama.py +21 -18
- sglang/srt/multimodal/processors/mllama4.py +4 -5
- sglang/srt/multimodal/processors/phi4mm.py +63 -39
- sglang/srt/multimodal/processors/pixtral.py +14 -35
- sglang/srt/multimodal/processors/qwen_audio.py +65 -0
- sglang/srt/multimodal/processors/qwen_vl.py +16 -21
- sglang/srt/multimodal/processors/vila.py +14 -14
- sglang/srt/reasoning_parser.py +46 -4
- sglang/srt/sampling/sampling_batch_info.py +6 -5
- sglang/srt/sampling/sampling_params.py +8 -1
- sglang/srt/server_args.py +454 -270
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
- sglang/srt/speculative/eagle_utils.py +51 -23
- sglang/srt/speculative/eagle_worker.py +59 -44
- sglang/srt/two_batch_overlap.py +10 -5
- sglang/srt/utils.py +44 -69
- sglang/test/runners.py +14 -3
- sglang/test/test_activation.py +50 -1
- sglang/test/test_block_fp8.py +8 -3
- sglang/test/test_block_fp8_ep.py +1 -1
- sglang/test/test_custom_ops.py +12 -7
- sglang/test/test_cutlass_w4a8_moe.py +1 -3
- sglang/test/test_fp4_moe.py +1 -3
- sglang/test/test_marlin_moe.py +286 -0
- sglang/test/test_marlin_utils.py +171 -0
- sglang/test/test_utils.py +35 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
- sglang/srt/layers/quantization/quant_utils.py +0 -166
- sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
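Most of the churn under sglang/srt/multimodal/processors/ follows one refactor: each processor now builds a `MultimodalSpecialTokens` descriptor once in `__init__` (token string plus token id), passes it to `load_mm_data`, and delegates offset computation and item construction to `process_and_combine_mm_data`, while the `max_req_input_len` parameter disappears from `process_mm_data_async`. Below is a minimal sketch of the new shape, inferred only from the call sites visible in the hunks that follow; the helper signatures and the `<image>` token string are assumptions, not the library's documented API.

```python
# Hedged sketch of the post-0.4.9.post4 processor pattern, mirroring the hunks
# below. MultimodalSpecialTokens / load_mm_data / process_and_combine_mm_data
# signatures are assumptions based only on these call sites.
from sglang.srt.multimodal.processors.base_processor import (
    BaseMultimodalProcessor,
    MultimodalSpecialTokens,
)


class ExampleProcessor(BaseMultimodalProcessor):
    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)
        # Token metadata is declared once, up front, instead of per request.
        self.mm_tokens = MultimodalSpecialTokens(
            image_token="<image>",  # placeholder string; model-specific in practice
            image_token_id=_processor.tokenizer.convert_tokens_to_ids("<image>"),
        ).build(_processor)

    async def process_mm_data_async(self, image_data, input_text, request_obj, **kwargs):
        # Note: max_req_input_len is no longer part of the signature.
        base_output = self.load_mm_data(
            prompt=input_text,
            image_data=image_data,
            multimodal_tokens=self.mm_tokens,
        )
        mm_items, input_ids, _ = self.process_and_combine_mm_data(
            base_output, self.mm_tokens
        )
        return {
            "mm_items": mm_items,
            "input_ids": input_ids.tolist(),
            "im_token_id": self.mm_tokens.image_token_id,
        }
```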
--- a/sglang/srt/multimodal/processors/internvl.py
+++ b/sglang/srt/multimodal/processors/internvl.py
@@ -24,7 +24,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         self.IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
         self.IMG_START_TOKEN = "<img>"
         self.IMG_END_TOKEN = "</img>"
-        self.IMG_TOKEN = "<image>"
         self.num_image_token = int(
             (image_size // patch_size) ** 2 * (hf_config.downsample_ratio**2)
         )
@@ -32,9 +31,10 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         tokenizer = self._processor
         self.img_start_token_id = tokenizer.convert_tokens_to_ids(self.IMG_START_TOKEN)
         self.img_end_token_id = tokenizer.convert_tokens_to_ids(self.IMG_END_TOKEN)
-        self.
-
-
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<image>",
+            image_token_id=tokenizer.convert_tokens_to_ids(self.IMG_CONTEXT_TOKEN),
+        ).build(_image_processor)

     @staticmethod
     def build_transform(input_size):
@@ -170,13 +170,12 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         return pixel_values, num_patches_list

     async def process_mm_data_async(
-        self, image_data, input_text, request_obj,
+        self, image_data, input_text, request_obj, **kwargs
     ):
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
             discard_alpha_channel=True,
         )

@@ -219,11 +218,11 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].flatten()
         image_offsets = self.get_mm_items_offset(
             input_ids=input_ids,
-            mm_token_id=self.
+            mm_token_id=self.mm_tokens.image_token_id,
         )
         items = [
             MultimodalDataItem(
-
+                feature=pixel_values,
                 modality=Modality.IMAGE,
                 offsets=image_offsets,
             )
@@ -234,5 +233,5 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             "mm_items": items,
             "im_start_id": self.img_start_token_id,
             "im_end_id": self.img_end_token_id,
-            "im_token_id": self.
+            "im_token_id": self.mm_tokens.image_token_id,
         }
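The InternVL hunks swap the previously hard-coded context-token id for `self.mm_tokens.image_token_id` when computing image offsets. The helper below is not sglang's `get_mm_items_offset`; it is a small, hedged stand-in showing the kind of lookup such a call implies, i.e. positions of a placeholder id inside the flattened `input_ids` (the real helper may return a different structure, e.g. start/end pairs).

```python
# Illustrative only: a stand-in for an offset lookup like get_mm_items_offset.
import torch


def placeholder_offsets(input_ids: torch.Tensor, token_id: int) -> list:
    # Positions where the multimodal placeholder token appears.
    return (input_ids == token_id).nonzero(as_tuple=True)[0].tolist()


ids = torch.tensor([101, 7, 92546, 92546, 92546, 8, 102])  # 92546 is an arbitrary id
print(placeholder_offsets(ids, 92546))  # [2, 3, 4]
```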
--- a/sglang/srt/multimodal/processors/janus_pro.py
+++ b/sglang/srt/multimodal/processors/janus_pro.py
@@ -14,47 +14,32 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)

+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token,
+            image_token_id=_processor.image_id,
+        ).build(_processor)
+
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
         input_text,
         request_obj,
-        max_req_input_len,
         **kwargs,
     ):
-        processor = self._processor
-
         base_out = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=
-                image_token=processor.image_token
-            ),
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
         )

-
-
-            input_text=base_out.input_text,
-            prompt=base_out.input_text,
-            images=images,
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_out, self.mm_tokens, prompt=base_out.input_text
         )

-        input_ids = res["input_ids"].flatten()
-        image_offsets = self.get_mm_items_offset(
-            input_ids=input_ids, mm_token_id=processor.image_id
-        )
         return {
-            "mm_items":
-                MultimodalDataItem(
-                    pixel_values=res["pixel_values"],
-                    image_emb_mask=res["images_emb_mask"],
-                    offsets=image_offsets,
-                    modality=Modality.IMAGE,
-                )
-            ],
+            "mm_items": mm_items,
             "input_ids": input_ids.tolist(),
-            "im_start_id":
-            "im_end_id":
-            "im_token_id":
+            "im_start_id": self._processor.image_start_id,
+            "im_end_id": self._processor.image_end_id,
+            "im_token_id": self.mm_tokens.image_token_id,
         }
--- a/sglang/srt/multimodal/processors/kimi_vl.py
+++ b/sglang/srt/multimodal/processors/kimi_vl.py
@@ -1,9 +1,6 @@
 import re
-from typing import
+from typing import Dict, List, Union

-import torch
-
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -17,32 +14,33 @@ class KimiVLImageProcessor(SGLangBaseProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        self.
-
-
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<|media_pad|>",
+            # TODO: could we convert in MultimodalSpecialTokens?
+            image_token_id=hf_config.media_placeholder_token_id,
+            image_token_regex=re.compile(r"(?:<\|media_pad\|>)+"),
+        ).build(_processor)

     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
-        max_req_input_len,
         *args,
         **kwargs,
     ):
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=
-                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
-            ),
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
         )

-        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )

         return {
             "input_ids": input_ids.tolist(),
             "mm_items": mm_items,
-            "im_token_id": self.
+            "im_token_id": self.mm_tokens.image_token_id,
         }
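The Kimi-VL processor now registers `image_token_regex=re.compile(r"(?:<\|media_pad\|>)+")`, so a run of consecutive pad tokens is treated as a single image placeholder when the prompt is split. A quick standalone check of what that regex does (the sample prompt is invented for illustration):

```python
import re

# Same pattern as in the hunk above.
pattern = re.compile(r"(?:<\|media_pad\|>)+")
prompt = "Describe <|media_pad|><|media_pad|><|media_pad|> briefly."

print(pattern.split(prompt))    # ['Describe ', ' briefly.']
print(pattern.findall(prompt))  # ['<|media_pad|><|media_pad|><|media_pad|>']
```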
--- a/sglang/srt/multimodal/processors/llava.py
+++ b/sglang/srt/multimodal/processors/llava.py
@@ -158,8 +158,10 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         return {
             "mm_items": [
                 MultimodalDataItem(
-
-
+                    feature=pixel_values,
+                    model_specific_data={
+                        "image_sizes": image_sizes,
+                    },
                     modality=modality,
                 )
             ],
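Across these processors, `MultimodalDataItem` call sites now pass the tensor as `feature=` and push model-only extras (here `image_sizes`) into a `model_specific_data` dict instead of ad-hoc keyword arguments. A hedged sketch of what such an item looks like as a plain dataclass; the real `MultimodalDataItem` lives in sglang/srt/managers/schedule_batch.py and has more fields.

```python
# Not sglang's class: a minimal dataclass mirroring only the fields these
# call sites pass (feature, modality, offsets, model_specific_data).
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import Any, Dict, List, Optional


class Modality(Enum):
    IMAGE = auto()
    AUDIO = auto()


@dataclass
class ItemSketch:
    feature: Any
    modality: Modality
    offsets: Optional[List[int]] = None
    model_specific_data: Dict[str, Any] = field(default_factory=dict)


item = ItemSketch(
    feature="pixel_values tensor goes here",
    modality=Modality.IMAGE,
    offsets=[3, 4, 5],
    model_specific_data={"image_sizes": [(336, 336)]},
)
print(item.model_specific_data["image_sizes"])
```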
--- a/sglang/srt/multimodal/processors/minicpm.py
+++ b/sglang/srt/multimodal/processors/minicpm.py
@@ -17,9 +17,22 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-
-
-        self.
+        # Collect special token ids
+        tokenizer = self._processor.tokenizer
+        self.slice_start_id = getattr(tokenizer, "slice_start_id", None)
+        self.slice_end_id = getattr(tokenizer, "slice_end_id", None)
+        self.audio_start_id = getattr(tokenizer, "audio_start_id", None)
+        self.audio_end_id = getattr(tokenizer, "audio_end_id", None)
+        self.im_start_id = getattr(tokenizer, "im_start_id", None)
+        self.im_end_id = getattr(tokenizer, "im_end_id", None)
+        self.im_token_id = getattr(tokenizer, "unk_id", None)
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="(<image>./</image>)",
+            audio_token="(<audio>./</audio>)",
+            video_token="(<video>./</video>)",
+            image_token_id=self.im_token_id,
+        ).build(_processor)

     async def process_mm_data_async(
         self,
@@ -27,19 +40,13 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         audio_data: List[Union[str, bytes]],
         input_text,
         request_obj,
-        max_req_input_len,
         **kwargs,
     ):
         base_output = self.load_mm_data(
             prompt=input_text,
-            max_req_input_len=max_req_input_len,
             audio_data=audio_data,
             image_data=image_data,
-            multimodal_tokens=
-                image_token=self.image_token,
-                video_token=self.video_token,
-                audio_token=self.audio_token,
-            ),
+            multimodal_tokens=self.mm_tokens,
         )
         if base_output is None:
             return None
@@ -50,24 +57,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             audios=base_output.audios,
         )

-        # Collect special token ids
-        tokenizer = self._processor.tokenizer
-        slice_start_id, slice_end_id, audio_start_id, audio_end_id = (
-            None,
-            None,
-            None,
-            None,
-        )
-        if tokenizer.slice_start_id:
-            slice_start_id = tokenizer.slice_start_id
-            slice_end_id = tokenizer.slice_end_id
-        if hasattr(tokenizer, "audio_start_id"):
-            audio_start_id = tokenizer.audio_start_id
-            audio_end_id = tokenizer.audio_end_id
-
-        im_start_id = tokenizer.im_start_id
-        im_end_id = tokenizer.im_end_id
-        im_token_id = tokenizer.unk_id
         pixel_values = res["pixel_values"]
         tgt_sizes = res["tgt_sizes"]

@@ -104,19 +93,21 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         items = []
         input_ids = res["input_ids"].flatten()
         image_offsets = self.get_mm_items_offset_by_pair(
-            input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+            input_ids=input_ids, mm_start_id=self.im_start_id, mm_end_id=self.im_end_id
         )
         slice_offsets = self.get_mm_items_offset_by_pair(
-            input_ids=input_ids,
+            input_ids=input_ids,
+            mm_start_id=self.slice_start_id,
+            mm_end_id=self.slice_end_id,
         )
         image_offsets.extend(slice_offsets)
         image_offsets = sorted(image_offsets)

         if len(pixel_values) != 0:
             item = MultimodalDataItem(
-
+                feature=pixel_values,
                 offsets=image_offsets,
-                tgt_size
+                model_specific_data={"tgt_size": tgt_sizes_flat},
                 modality=Modality.IMAGE,
             )
             items += [item]
@@ -126,17 +117,17 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             and res["audio_features"] is not None
             and len(res["audio_features"]) != 0
         ):
-            if audio_start_id is not None and audio_end_id is not None:
+            if self.audio_start_id is not None and self.audio_end_id is not None:
                 audio_offsets = self.get_mm_items_offset_by_pair(
                     input_ids=input_ids,
-                    mm_start_id=audio_start_id,
-                    mm_end_id=audio_end_id,
+                    mm_start_id=self.audio_start_id,
+                    mm_end_id=self.audio_end_id,
                 )
             else:
                 audio_offsets = None
             item = MultimodalDataItem(
-
-                audio_feature_lens
+                feature=[res["audio_features"]],
+                model_specific_data={"audio_feature_lens": res["audio_feature_lens"]},
                 offsets=audio_offsets,
                 modality=Modality.AUDIO,
             )
@@ -144,11 +135,11 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         return {
             "mm_items": items,
             "input_ids": input_ids.tolist(),
-            "audio_start_id": audio_start_id,
-            "audio_end_id": audio_end_id,
-            "im_token_id": im_token_id,
-            "im_start_id": im_start_id,
-            "im_end_id": im_end_id,
-            "slice_start_id": slice_start_id,
-            "slice_end_id": slice_end_id,
+            "audio_start_id": self.audio_start_id,
+            "audio_end_id": self.audio_end_id,
+            "im_token_id": self.im_token_id,
+            "im_start_id": self.im_start_id,
+            "im_end_id": self.im_end_id,
+            "slice_start_id": self.slice_start_id,
+            "slice_end_id": self.slice_end_id,
         }
--- a/sglang/srt/multimodal/processors/mlama.py
+++ b/sglang/srt/multimodal/processors/mlama.py
@@ -1,9 +1,10 @@
 from typing import List, Union

-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
-from sglang.srt.multimodal.processors.base_processor import
-
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)


 class MllamaImageProcessor(BaseMultimodalProcessor):
@@ -11,24 +12,26 @@ class MllamaImageProcessor(BaseMultimodalProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self._processor.image_token,
+            image_token_id=self._processor.image_token_id,
+        ).build(_processor)

     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
-
-
-
+        base_out = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )

-
-
-
-        image_inputs["mm_items"] = [
-            MultimodalDataItem(
-                pixel_values=image_inputs["pixel_values"],
-                aspect_ratio_id=image_inputs["aspect_ratio_ids"],
-                aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
-                modality=Modality.IMAGE,
-            )
-        ]
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_out, self.mm_tokens
+        )

-        return
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
--- a/sglang/srt/multimodal/processors/mllama4.py
+++ b/sglang/srt/multimodal/processors/mllama4.py
@@ -26,14 +26,14 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         self.eoi_token_index = hf_config.eoi_token_index
         self.image_token_index = hf_config.image_token_index
         self.multimodal_tokens = MultimodalSpecialTokens(
-            image_token=_processor.image_token
-
+            image_token=_processor.image_token,
+            image_token_id=self.image_token_index,
+        ).build(_processor)

     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
         input_text,
-        max_req_input_len=None,
         *args,
         **kwargs,
     ):
@@ -45,7 +45,6 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         processed_data = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.multimodal_tokens,
-            max_req_input_len=max_req_input_len or 4096,
             image_data=image_data,
             return_text=True,
         )
@@ -142,7 +141,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         # Add metadata for image processing
         processor_output["mm_items"] = [
             MultimodalDataItem(
-
+                feature=processor_output["pixel_values"],
                 modality=Modality.IMAGE,
                 offsets=image_offsets,
             )
--- a/sglang/srt/multimodal/processors/phi4mm.py
+++ b/sglang/srt/multimodal/processors/phi4mm.py
@@ -1,6 +1,8 @@
 import logging
 from typing import List, Union

+from transformers.processing_utils import ProcessorMixin
+
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.phi4mm import Phi4MMForCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
@@ -10,18 +12,59 @@ from sglang.srt.multimodal.processors.base_processor import (

 logger = logging.getLogger(__name__)

-
-
+
+# It is an adapter of hf phi4 mm processor to make it work for sglang
+# Ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py#L693
+class Phi4MMProcessorAdapter(ProcessorMixin):
+    def __init__(self, _processor) -> None:
+        self._processor = _processor
+
+    def __call__(self, **kwargs):
+        result = self._processor(**kwargs)
+
+        # Map HuggingFace output keys to sglang standard keys
+        key_mapping = {
+            "input_image_embeds": "pixel_values",
+            "input_audio_embeds": "audio_features",
+            "audio_embed_sizes": "audio_feature_lens",
+        }
+        for hf_key, sglang_key in key_mapping.items():
+            if hf_key in result:
+                result[sglang_key] = result[hf_key]
+                del result[hf_key]
+
+        # Filter out None or empty tensors from the result.
+        # This prevents the sglang function base_processor.collect_mm_items_from_processor_output()
+        # from misclassifying audio content as image content, and vice versa.
+        filtered_result = {
+            k: v
+            for k, v in result.items()
+            if v is not None and (not hasattr(v, "numel") or v.numel() > 0)
+        }
+        return filtered_result


-class
+class Phi4MMMultimodalProcessor(BaseMultimodalProcessor):
     models = [Phi4MMForCausalLM]

     def __init__(self, hf_config, server_args, _processor):
-
-
-
-
+        self.processor = Phi4MMProcessorAdapter(_processor)
+        super().__init__(hf_config, server_args, self.processor)
+
+        # the following CONSTANTS come from hugging-face microsoft/Phi-4-multimodal-instruct's processing_phi4mm.py file
+        # ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
+        self.IMAGE_TOKEN = "<|endoftext10|>"
+        self.AUDIO_TOKEN = "<|endoftext11|>"
+        self.IM_TOKEN_ID = 200010
+        self.AUDIO_TOKEN_ID = 200011
+        self.AUDIO_SAMPLE_RATE = 16000
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_id=self.IM_TOKEN_ID,
+            audio_token=self.AUDIO_TOKEN,
+            audio_token_id=self.AUDIO_TOKEN_ID,
+        ).build(self.processor)

     async def process_mm_data_async(
         self,
@@ -29,49 +72,30 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
         audio_data,
         input_text,
         request_obj,
-        max_req_input_len,
         **kwargs,
     ):
-        if audio_data:
-            logger.warning(
-                "Currently SGLang does not support audio data for Phi4MM. We are working on it. You can file an issue to help us prioritize."
-            )
-            audio_data = []
-
         base_output = self.load_mm_data(
             prompt=input_text,
-            max_req_input_len=max_req_input_len,
             audio_data=audio_data,
             image_data=image_data,
-            multimodal_tokens=self.
+            multimodal_tokens=self.mm_tokens,
+            audio_sample_rate=self.AUDIO_SAMPLE_RATE,
         )
-        if base_output is None:
-            return None

-
-
-
-
-
+        if base_output.audios is not None:
+            # hugging-face microsoft/Phi-4-multimodal-instruct's processing_phi4mm.py file requires the audio input to be tuple of (audio, sample_rate)
+            # ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
+            base_output.audios = [
+                (audio, self.AUDIO_SAMPLE_RATE) for audio in base_output.audios
+            ]

-        input_ids =
-
-            input_ids=input_ids,
-            mm_token_id=_IMAGE_SPECIAL_TOKEN_ID,
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
         )

-        items = [
-            MultimodalDataItem(
-                pixel_values=res["input_image_embeds"],
-                image_sizes=res["image_sizes"],
-                image_emb_mask=res["image_attention_mask"],
-                offsets=image_offsets,
-                modality=Modality.IMAGE,
-            )
-        ]
-
         return {
-            "mm_items": items,
             "input_ids": input_ids.tolist(),
-            "
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+            "audio_token_id": self.mm_tokens.audio_token_id,
         }
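The new `Phi4MMProcessorAdapter` renames the HF processor's output keys to sglang's standard names and drops empty outputs so `collect_mm_items_from_processor_output` does not misclassify modalities. Its filter predicate is worth spelling out; this standalone check mirrors the comprehension in the hunk above.

```python
import torch


def keep(value) -> bool:
    # Mirrors the adapter's filter: drop None and zero-element tensors,
    # keep everything else (including non-tensor values).
    return value is not None and (not hasattr(value, "numel") or value.numel() > 0)


print(keep(None))               # False
print(keep(torch.empty(0)))     # False
print(keep(torch.zeros(2, 3)))  # True
print(keep([16000]))            # True (plain lists have no .numel)
```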
--- a/sglang/srt/multimodal/processors/pixtral.py
+++ b/sglang/srt/multimodal/processors/pixtral.py
@@ -6,7 +6,6 @@ from transformers.models.pixtral.image_processing_pixtral import (
     _num_image_tokens as _get_pixtral_hf_num_image_tokens,
 )

-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.pixtral import PixtralVisionModel
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -45,7 +44,7 @@ class PixtralProcessor(BaseMultimodalProcessor):

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        self.
+        self.IM_TOKEN_ID = getattr(
             hf_config, "image_token_index", PixtralVisionModel.DEFAULT_IMAGE_TOKEN_ID
         )
         # Instantiate the patcher logic helper using the class defined above
@@ -53,9 +52,10 @@ class PixtralProcessor(BaseMultimodalProcessor):
         self.vision_config = hf_config.vision_config
         self.image_size = self.vision_config.image_size
         self.patch_size = self.vision_config.patch_size
-        self.
-            image_token=_processor.image_token
-
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token,
+            image_token_id=self.IM_TOKEN_ID,
+        ).build(_processor)
         _processor.tokenizer.add_special_tokens(
             {
                 "pad_token": getattr(hf_config, "pad_token", self.PAD_TOKEN),
@@ -80,42 +80,21 @@ class PixtralProcessor(BaseMultimodalProcessor):
     ):
         mm_data = self.load_mm_data(
             prompt=input_text,
-            multimodal_tokens=self.
-            max_req_input_len=kwargs.get("max_req_input_len", 4096),
+            multimodal_tokens=self.mm_tokens,
             image_data=image_data,
             return_text=True,
         )
-
         if mm_data.images:
             resize_tasks = [self._resize(image) for image in mm_data.images]
             mm_data.images = await asyncio.gather(*resize_tasks)

-
-
-            images=mm_data.images,
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            mm_data, self.mm_tokens
         )

-
-
-
-
-
-
-        mm_items = [
-            MultimodalDataItem(
-                pixel_values=processor_output["pixel_values"],
-                image_sizes=processor_output["image_sizes"],
-                modality=Modality.IMAGE,
-                offsets=image_offsets,
-            )
-        ]
-
-        input_ids = input_ids.tolist()
-        processor_output.update(
-            input_ids=input_ids,
-            mm_items=mm_items,
-            # there's no im_start_id for pixtral, only im_token and im_end_token
-            im_end_id=self.IMG_END_TOKEN_ID,
-            im_token_id=self.image_token_id,
-        )
-        return processor_output
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self.IM_TOKEN_ID,
+            "im_token": self._processor.image_token,
+        }