sglang-0.4.8-py3-none-any.whl → sglang-0.4.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +17 -2
- sglang/bench_serving.py +168 -22
- sglang/srt/configs/internvl.py +4 -2
- sglang/srt/configs/janus_pro.py +1 -1
- sglang/srt/configs/model_config.py +49 -0
- sglang/srt/configs/update_config.py +119 -0
- sglang/srt/conversation.py +35 -0
- sglang/srt/custom_op.py +7 -1
- sglang/srt/disaggregation/base/conn.py +2 -0
- sglang/srt/disaggregation/decode.py +22 -6
- sglang/srt/disaggregation/mooncake/conn.py +289 -48
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
- sglang/srt/disaggregation/nixl/conn.py +100 -52
- sglang/srt/disaggregation/prefill.py +5 -4
- sglang/srt/disaggregation/utils.py +13 -12
- sglang/srt/distributed/parallel_state.py +44 -17
- sglang/srt/entrypoints/EngineBase.py +8 -0
- sglang/srt/entrypoints/engine.py +45 -9
- sglang/srt/entrypoints/http_server.py +111 -24
- sglang/srt/entrypoints/openai/protocol.py +51 -6
- sglang/srt/entrypoints/openai/serving_chat.py +52 -76
- sglang/srt/entrypoints/openai/serving_completions.py +1 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/eplb/__init__.py +0 -0
- sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
- sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
- sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
- sglang/srt/{managers → eplb}/expert_distribution.py +18 -1
- sglang/srt/{managers → eplb}/expert_location.py +1 -1
- sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
- sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/activation.py +7 -0
- sglang/srt/layers/amx_utils.py +86 -0
- sglang/srt/layers/attention/ascend_backend.py +219 -0
- sglang/srt/layers/attention/flashattention_backend.py +56 -23
- sglang/srt/layers/attention/tbo_backend.py +37 -9
- sglang/srt/layers/communicator.py +18 -2
- sglang/srt/layers/dp_attention.py +9 -3
- sglang/srt/layers/elementwise.py +76 -12
- sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
- sglang/srt/layers/layernorm.py +41 -0
- sglang/srt/layers/linear.py +99 -12
- sglang/srt/layers/logits_processor.py +15 -6
- sglang/srt/layers/moe/ep_moe/kernels.py +23 -8
- sglang/srt/layers/moe/ep_moe/layer.py +115 -25
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +42 -19
- sglang/srt/layers/moe/fused_moe_native.py +7 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -4
- sglang/srt/layers/moe/fused_moe_triton/layer.py +129 -10
- sglang/srt/layers/moe/router.py +60 -22
- sglang/srt/layers/moe/topk.py +36 -28
- sglang/srt/layers/parameter.py +67 -7
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
- sglang/srt/layers/quantization/fp8.py +44 -0
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +6 -6
- sglang/srt/layers/quantization/gptq.py +5 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -1
- sglang/srt/layers/quantization/quant_utils.py +166 -0
- sglang/srt/layers/quantization/w8a8_int8.py +52 -1
- sglang/srt/layers/rotary_embedding.py +105 -13
- sglang/srt/layers/vocab_parallel_embedding.py +19 -2
- sglang/srt/lora/lora.py +4 -5
- sglang/srt/lora/lora_manager.py +73 -20
- sglang/srt/managers/configure_logging.py +1 -1
- sglang/srt/managers/io_struct.py +60 -15
- sglang/srt/managers/mm_utils.py +73 -59
- sglang/srt/managers/multimodal_processor.py +2 -6
- sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
- sglang/srt/managers/schedule_batch.py +80 -79
- sglang/srt/managers/scheduler.py +153 -63
- sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
- sglang/srt/managers/session_controller.py +12 -3
- sglang/srt/managers/tokenizer_manager.py +314 -103
- sglang/srt/managers/tp_worker.py +13 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
- sglang/srt/mem_cache/allocator.py +290 -0
- sglang/srt/mem_cache/chunk_cache.py +34 -2
- sglang/srt/mem_cache/memory_pool.py +289 -3
- sglang/srt/mem_cache/multimodal_cache.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +3 -2
- sglang/srt/model_executor/forward_batch_info.py +17 -4
- sglang/srt/model_executor/model_runner.py +302 -58
- sglang/srt/model_loader/loader.py +86 -10
- sglang/srt/model_loader/weight_utils.py +160 -3
- sglang/srt/models/deepseek_nextn.py +5 -4
- sglang/srt/models/deepseek_v2.py +305 -26
- sglang/srt/models/deepseek_vl2.py +3 -5
- sglang/srt/models/gemma3_causal.py +1 -2
- sglang/srt/models/gemma3n_audio.py +949 -0
- sglang/srt/models/gemma3n_causal.py +1010 -0
- sglang/srt/models/gemma3n_mm.py +495 -0
- sglang/srt/models/hunyuan.py +771 -0
- sglang/srt/models/kimi_vl.py +1 -2
- sglang/srt/models/llama.py +10 -4
- sglang/srt/models/llama4.py +32 -45
- sglang/srt/models/llama_eagle3.py +61 -11
- sglang/srt/models/llava.py +5 -5
- sglang/srt/models/minicpmo.py +2 -2
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mllama4.py +43 -11
- sglang/srt/models/phi4mm.py +1 -3
- sglang/srt/models/pixtral.py +3 -7
- sglang/srt/models/qwen2.py +31 -3
- sglang/srt/models/qwen2_5_vl.py +1 -3
- sglang/srt/models/qwen2_audio.py +200 -0
- sglang/srt/models/qwen2_moe.py +32 -6
- sglang/srt/models/qwen2_vl.py +1 -4
- sglang/srt/models/qwen3.py +94 -25
- sglang/srt/models/qwen3_moe.py +68 -21
- sglang/srt/models/vila.py +3 -8
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +150 -133
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
- sglang/srt/multimodal/processors/gemma3n.py +82 -0
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +3 -6
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
- sglang/srt/operations_strategy.py +6 -2
- sglang/srt/reasoning_parser.py +26 -0
- sglang/srt/sampling/sampling_batch_info.py +39 -1
- sglang/srt/server_args.py +85 -24
- sglang/srt/speculative/build_eagle_tree.py +57 -18
- sglang/srt/speculative/eagle_worker.py +6 -4
- sglang/srt/two_batch_overlap.py +204 -28
- sglang/srt/utils.py +369 -138
- sglang/srt/warmup.py +12 -3
- sglang/test/runners.py +10 -1
- sglang/test/test_utils.py +15 -3
- sglang/version.py +1 -1
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/METADATA +9 -6
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/RECORD +149 -137
- sglang/math_utils.py +0 -8
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
- /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
- /sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/gemma3.py
CHANGED
@@ -4,11 +4,9 @@ from typing import Dict, List, Union
 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens

 # Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py
 # will be removed in the future
@@ -38,11 +36,6 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -53,11 +46,11 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
             discard_alpha_channel=True,
         )

-
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)

         return {
             "input_ids": input_ids.tolist(),
-            "mm_items":
+            "mm_items": mm_items,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }
sglang/srt/multimodal/processors/gemma3n.py
ADDED
@@ -0,0 +1,82 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import re
+from typing import Dict, List, Optional, Union
+
+from sglang.srt.managers.multimodal_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.models.gemma3n_mm import Gemma3nForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+
+
+class Gemma3nSGLangProcessor(SGLangBaseProcessor):
+    """Multimodal processor for Gemma3n supporting image and audio inputs."""
+
+    models = [Gemma3nForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+
+        self.IMAGE_TOKEN = "<image_soft_token>"
+        self.IMAGE_TOKEN_REGEX = re.compile(
+            r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+        )
+
+        self.AUDIO_TOKEN = "<audio_soft_token>"
+        self.AUDIO_TOKEN_REGEX = re.compile(
+            r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
+        )
+
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.IM_START_TOKEN_ID = hf_config.boi_token_id
+        self.IM_END_TOKEN_ID = hf_config.eoi_token_id
+
+        self.AUDIO_TOKEN_ID = hf_config.audio_token_id
+        self.AUDIO_START_TOKEN_ID = hf_config.boa_token_id
+        self.AUDIO_END_TOKEN_ID = hf_config.eoa_token_id
+
+    async def process_mm_data_async(
+        self,
+        image_data: Optional[List[Union[str, bytes, Dict]]] = None,
+        audio_data: Optional[List[Union[str, bytes, Dict]]] = None,
+        input_text: str = "",
+        request_obj=None,
+        max_req_input_len: int = 0,
+        *args,
+        **kwargs,
+    ):
+        """Process multimodal data including images and audio."""
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            audio_data=audio_data,
+            max_req_input_len=max_req_input_len,
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN,
+                image_token_regex=self.IMAGE_TOKEN_REGEX,
+                audio_token=self.AUDIO_TOKEN,
+                audio_token_regex=self.AUDIO_TOKEN_REGEX,
+            ),
+        )
+
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.IM_TOKEN_ID,
+            "audio_token_id": self.AUDIO_TOKEN_ID,
+        }
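To make the placeholder handling of the new Gemma3n processor concrete, the short standalone Python check below (not part of the package) exercises the image-token regex added in the new file above. The pattern and token strings are copied verbatim from the diff; everything else is illustrative.

import re

# The regex added in gemma3n.py above: a <start_of_image> marker, optionally
# followed by a run of <image_soft_token> placeholders and an <end_of_image> marker.
IMAGE_TOKEN_REGEX = re.compile(
    r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
)

assert IMAGE_TOKEN_REGEX.fullmatch("<start_of_image>")
assert IMAGE_TOKEN_REGEX.fullmatch(
    "<start_of_image>" + "<image_soft_token>" * 3 + "<end_of_image>"
)
assert not IMAGE_TOKEN_REGEX.fullmatch("<image_soft_token>")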
sglang/srt/multimodal/processors/internvl.py
CHANGED
@@ -5,12 +5,12 @@ import torch
 from decord import VideoReader, cpu
 from PIL import Image

-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.internvl import InternVLChatModel
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.internvl import InternVLChatModel


 class InternVLImageProcessor(BaseMultimodalProcessor):
@@ -172,13 +172,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self, image_data, input_text, request_obj, max_req_input_len, **kwargs
     ):
-        if not image_data:
-            return None
-
-        # Ensure image_data is a list
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
sglang/srt/multimodal/processors/janus_pro.py
CHANGED
@@ -1,11 +1,11 @@
 from typing import List, Union

-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM


 class JanusProImageProcessor(BaseMultimodalProcessor):
@@ -22,12 +22,6 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
         max_req_input_len,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
         processor = self._processor

         base_out = self.load_mm_data(
sglang/srt/multimodal/processors/kimi_vl.py
CHANGED
@@ -3,14 +3,12 @@ from typing import Any, Dict, List, Optional, Union

 import torch

-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor as SGLangBaseProcessor,
-)
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens


 # Compatible with KimiVLForConditionalGeneration
@@ -32,11 +30,6 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -46,10 +39,10 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
             max_req_input_len=max_req_input_len,
         )

-
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)

         return {
             "input_ids": input_ids.tolist(),
-            "mm_items":
+            "mm_items": mm_items,
             "im_token_id": self.IM_TOKEN_ID,
         }
sglang/srt/multimodal/processors/llava.py
CHANGED
@@ -7,11 +7,7 @@ from transformers.models.auto.processing_auto import (
 )

 import sglang.srt.managers.multimodal_processor as sgl_mm_processor_utils
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.mm_utils import expand2square, process_anyres_image
 from sglang.srt.models.llava import (
     LlavaForConditionalGeneration,
     LlavaLlamaForCausalLM,
@@ -20,6 +16,8 @@ from sglang.srt.models.llava import (
 )
 from sglang.srt.models.llavavid import LlavaVidForCausalLM
 from sglang.srt.models.mistral import Mistral3ForConditionalGeneration
+from sglang.srt.multimodal.mm_utils import expand2square, process_anyres_image
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.utils import load_image, logger
 from sglang.utils import get_exception_traceback

@@ -112,9 +110,6 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
         modalities = request_obj.modalities or ["image"]
         aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
         grid_pinpoints = (
@@ -124,9 +119,6 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
             else None
         )

-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         if isinstance(image_data, list) and len(image_data) > 0:
             if "multi-images" in modalities or "video" in modalities:
                 # Multiple images
sglang/srt/multimodal/processors/minicpm.py
CHANGED
@@ -2,13 +2,13 @@ from typing import List, Union

 import torch

-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.minicpmo import MiniCPMO
 from sglang.srt.models.minicpmv import MiniCPMV
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)


 # Compatible with both 'O' and 'V'
@@ -23,19 +23,12 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
+        audio_data: List[Union[str, bytes]],
         input_text,
         request_obj,
         max_req_input_len,
         **kwargs,
     ):
-        audio_data = request_obj.audio_data
-        if not image_data and not audio_data:
-            return None
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-        if not isinstance(audio_data, list):
-            audio_data = [audio_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             max_req_input_len=max_req_input_len,
sglang/srt/multimodal/processors/mlama.py
CHANGED
@@ -1,10 +1,8 @@
 from typing import List, Union

-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.utils import load_image


@@ -17,21 +15,11 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)

-
-            image_data = [image_data]
-
-        if len(image_data) > 0:
-            images = [load_image(image)[0] for image in image_data]
-        else:
-            images = load_image(image_data[0])[0]
-
+        images = [load_image(image)[0] for image in image_data]
         image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
         image_inputs["mm_items"] = [
sglang/srt/multimodal/processors/mllama4.py
CHANGED
@@ -7,12 +7,12 @@ from transformers.models.llama4.image_processing_llama4_fast import (
     get_best_fit,
 )

-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration


 class Mllama4ImageProcessor(BaseMultimodalProcessor):
@@ -37,9 +37,6 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)
sglang/srt/multimodal/processors/phi4mm.py
CHANGED
@@ -1,12 +1,12 @@
 import logging
 from typing import List, Union

-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.phi4mm import Phi4MMForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.phi4mm import Phi4MMForCausalLM

 logger = logging.getLogger(__name__)

@@ -26,22 +26,12 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
+        audio_data,
         input_text,
         request_obj,
         max_req_input_len,
         **kwargs,
     ):
-        audio_data = request_obj.audio_data
-
-        if not image_data and not audio_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        if not isinstance(audio_data, list):
-            audio_data = [audio_data]
-
         if audio_data:
             logger.warning(
                 "Currently SGLang does not support audio data for Phi4MM. We are working on it. You can file an issue to help us prioritize."
sglang/srt/multimodal/processors/pixtral.py
CHANGED
@@ -6,12 +6,12 @@ from transformers.models.pixtral.image_processing_pixtral import (
     _num_image_tokens as _get_pixtral_hf_num_image_tokens,
 )

-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.pixtral import PixtralVisionModel
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.pixtral import PixtralVisionModel


 class PixtralProcessor(BaseMultimodalProcessor):
@@ -78,12 +78,6 @@ class PixtralProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         mm_data = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.multimodal_tokens,
sglang/srt/multimodal/processors/qwen_vl.py
CHANGED
@@ -3,19 +3,15 @@ import math
 import re
 from typing import Dict, List, Union

-import torch
 from PIL import Image

 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor as SGLangBaseProcessor,
-)
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens


 # Compatible with Qwen2VL and Qwen2_5VL
@@ -51,9 +47,6 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -132,12 +125,13 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):

         video_grid_thw = None  # TODO

-
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)

-        if
+        if not mm_items:
             # Note(Xinyuan): This is the case where image loading fails.
             return None

+        combined_mm_item = mm_items[0]  # only image is supported for now
         video_grid_thw = None  # TODO
         second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)

@@ -159,7 +153,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):

         return {
             "input_ids": input_ids.tolist(),
-            "mm_items":
+            "mm_items": mm_items,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
             "im_token_id": self.IM_TOKEN_ID,
sglang/srt/multimodal/processors/vila.py
CHANGED
@@ -10,12 +10,12 @@ from sglang.srt.managers.io_struct import (
     GenerateReqInput,
     ImageDataItem,
 )
-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.vila import VILAForConditionalGeneration
 from sglang.srt.server_args import ServerArgs


@@ -37,6 +37,8 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         _processor: VILAProcessor,
     ) -> None:
         super().__init__(hf_config, server_args, _processor)
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.VIDEO_TOKEN_ID = hf_config.video_token_id

     async def process_mm_data_async(
         self,
@@ -46,13 +48,7 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         max_req_input_len: int,
         **kwargs,
     ) -> Optional[Dict[str, Any]]:
-
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        mm_data = self.load_mm_data(
+        base_output = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=MultimodalSpecialTokens(
                 image_token=self._processor.tokenizer.image_token
@@ -61,25 +57,11 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
         )

-
-            input_text=mm_data.input_text,
-            images=mm_data.images,
-        )
-
-        image_offsets = self.get_mm_items_offset(
-            input_ids=inputs.input_ids[0],
-            mm_token_id=cast(int, self._processor.tokenizer.image_token_id),
-        )
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)

-
-
-
-
-
-
-        ]
-
-        return dict(
-            input_ids=inputs.input_ids[0].tolist(),
-            mm_items=mm_items,
-        )
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.IM_TOKEN_ID,
+            "video_token_id": self.VIDEO_TOKEN_ID,
+        }
sglang/srt/operations_strategy.py
CHANGED
@@ -71,7 +71,9 @@ def _compute_moe_deepseek_layer_operations_strategy_tbo(
     assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
     if forward_mode == ForwardMode.EXTEND:
         return _compute_moe_deepseek_blog_prefill(layer)
-    elif
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
         return _compute_moe_deepseek_blog_decode(layer)
     else:
         raise NotImplementedError(f"Unsupported {forward_mode=}")
@@ -146,7 +148,9 @@ def _compute_moe_qwen3_layer_operations_strategy_tbo(
     assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
     if forward_mode == ForwardMode.EXTEND:
         return _compute_moe_qwen3_prefill(layer)
-    elif
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
         return _compute_moe_qwen3_decode(layer)
     else:
         raise NotImplementedError(f"Unsupported {forward_mode=}")
sglang/srt/reasoning_parser.py
CHANGED
@@ -66,6 +66,13 @@ class BaseReasoningFormatDetector:
         self._buffer += new_text
         current_text = self._buffer

+        # If the current text is a prefix of the think token, keep buffering
+        if any(
+            token.startswith(current_text) and token != current_text
+            for token in [self.think_start_token, self.think_end_token]
+        ):
+            return StreamingParseResult()
+
         # Strip `<think>` token if present
         if not self.stripped_think_start and self.think_start_token in current_text:
             current_text = current_text.replace(self.think_start_token, "")
@@ -150,6 +157,24 @@ class Qwen3Detector(BaseReasoningFormatDetector):
         )


+class KimiDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Kimi Thinking model.
+    Assumes reasoning format:
+      ◁think▷*(.*)◁/think▷
+    Returns all the text before the ◁/think▷ tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        super().__init__(
+            "◁think▷",
+            "◁/think▷",
+            force_reasoning=False,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -164,6 +189,7 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
+        "kimi": KimiDetector,
     }

     def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
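The new "kimi" entry in DetectorMap means the ◁think▷ ... ◁/think▷ format can now be selected by model type. Below is a hedged usage sketch; the parse_non_stream call is an assumption about the surrounding ReasoningParser API, which is not shown in this diff.

from sglang.srt.reasoning_parser import ReasoningParser

# Select the Kimi detector registered above; "kimi" is the new DetectorMap key.
parser = ReasoningParser(model_type="kimi", stream_reasoning=True)

# Assumed API: parse_non_stream splits a complete response into the reasoning
# portion (inside ◁think▷ ... ◁/think▷) and the remaining normal text.
reasoning_text, normal_text = parser.parse_non_stream(
    "◁think▷add the two numbers first◁/think▷The answer is 42."
)
print(reasoning_text)  # expected: the text between the think markers
print(normal_text)     # expected: the trailing answer text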