sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +17 -2
- sglang/bench_serving.py +170 -24
- sglang/srt/configs/internvl.py +4 -2
- sglang/srt/configs/janus_pro.py +1 -1
- sglang/srt/configs/model_config.py +60 -1
- sglang/srt/configs/update_config.py +119 -0
- sglang/srt/conversation.py +69 -1
- sglang/srt/disaggregation/decode.py +21 -5
- sglang/srt/disaggregation/mooncake/conn.py +35 -4
- sglang/srt/disaggregation/nixl/conn.py +6 -6
- sglang/srt/disaggregation/prefill.py +2 -2
- sglang/srt/disaggregation/utils.py +1 -1
- sglang/srt/distributed/parallel_state.py +44 -17
- sglang/srt/entrypoints/EngineBase.py +8 -0
- sglang/srt/entrypoints/engine.py +40 -6
- sglang/srt/entrypoints/http_server.py +111 -24
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +4 -2
- sglang/srt/eplb/__init__.py +0 -0
- sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
- sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
- sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
- sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
- sglang/srt/{managers → eplb}/expert_location.py +1 -1
- sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
- sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/activation.py +2 -2
- sglang/srt/layers/amx_utils.py +86 -0
- sglang/srt/layers/attention/ascend_backend.py +219 -0
- sglang/srt/layers/attention/flashattention_backend.py +32 -9
- sglang/srt/layers/attention/tbo_backend.py +37 -9
- sglang/srt/layers/communicator.py +20 -2
- sglang/srt/layers/dp_attention.py +9 -3
- sglang/srt/layers/elementwise.py +76 -12
- sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
- sglang/srt/layers/layernorm.py +26 -0
- sglang/srt/layers/linear.py +84 -14
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
- sglang/srt/layers/moe/ep_moe/layer.py +176 -15
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/moe/router.py +60 -22
- sglang/srt/layers/moe/topk.py +10 -28
- sglang/srt/layers/parameter.py +67 -7
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
- sglang/srt/layers/quantization/fp8.py +72 -7
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -2
- sglang/srt/layers/quantization/gptq.py +5 -1
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -1
- sglang/srt/layers/quantization/quant_utils.py +166 -0
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/quantization/w8a8_int8.py +52 -1
- sglang/srt/layers/rotary_embedding.py +2 -2
- sglang/srt/layers/vocab_parallel_embedding.py +20 -10
- sglang/srt/lora/lora.py +4 -5
- sglang/srt/lora/lora_manager.py +73 -20
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/configure_logging.py +1 -1
- sglang/srt/managers/io_struct.py +58 -14
- sglang/srt/managers/mm_utils.py +77 -61
- sglang/srt/managers/multimodal_processor.py +2 -6
- sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
- sglang/srt/managers/schedule_batch.py +78 -85
- sglang/srt/managers/scheduler.py +130 -64
- sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
- sglang/srt/managers/session_controller.py +12 -3
- sglang/srt/managers/tokenizer_manager.py +314 -103
- sglang/srt/managers/tp_worker.py +13 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
- sglang/srt/mem_cache/allocator.py +290 -0
- sglang/srt/mem_cache/chunk_cache.py +34 -2
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +402 -66
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/multimodal_cache.py +3 -0
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/model_executor/cuda_graph_runner.py +2 -1
- sglang/srt/model_executor/forward_batch_info.py +17 -4
- sglang/srt/model_executor/model_runner.py +297 -56
- sglang/srt/model_loader/loader.py +41 -0
- sglang/srt/model_loader/weight_utils.py +72 -4
- sglang/srt/models/deepseek_nextn.py +1 -3
- sglang/srt/models/deepseek_v2.py +195 -45
- sglang/srt/models/deepseek_vl2.py +3 -5
- sglang/srt/models/gemma3_causal.py +1 -2
- sglang/srt/models/gemma3n_causal.py +4 -3
- sglang/srt/models/gemma3n_mm.py +4 -20
- sglang/srt/models/hunyuan.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -2
- sglang/srt/models/llama.py +10 -4
- sglang/srt/models/llama4.py +32 -45
- sglang/srt/models/llama_eagle3.py +61 -11
- sglang/srt/models/llava.py +5 -5
- sglang/srt/models/minicpmo.py +2 -2
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mllama4.py +402 -89
- sglang/srt/models/phi4mm.py +1 -3
- sglang/srt/models/pixtral.py +3 -7
- sglang/srt/models/qwen2.py +31 -3
- sglang/srt/models/qwen2_5_vl.py +1 -3
- sglang/srt/models/qwen2_audio.py +200 -0
- sglang/srt/models/qwen2_moe.py +32 -6
- sglang/srt/models/qwen2_vl.py +1 -4
- sglang/srt/models/qwen3.py +94 -25
- sglang/srt/models/qwen3_moe.py +68 -21
- sglang/srt/models/vila.py +3 -8
- sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
- sglang/srt/operations_strategy.py +6 -2
- sglang/srt/reasoning_parser.py +26 -0
- sglang/srt/sampling/sampling_batch_info.py +39 -1
- sglang/srt/server_args.py +84 -22
- sglang/srt/speculative/build_eagle_tree.py +57 -18
- sglang/srt/speculative/eagle_worker.py +6 -4
- sglang/srt/two_batch_overlap.py +203 -27
- sglang/srt/utils.py +343 -163
- sglang/srt/warmup.py +12 -3
- sglang/test/runners.py +10 -1
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/test/test_utils.py +15 -3
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
- sglang/math_utils.py +0 -8
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
- /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py

@@ -2,13 +2,13 @@ from typing import List, Union
 
 import torch
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-    MultimodalSpecialTokens,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.minicpmo import MiniCPMO
 from sglang.srt.models.minicpmv import MiniCPMV
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
 
 
 # Compatible with both 'O' and 'V'
@@ -23,19 +23,12 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
+        audio_data: List[Union[str, bytes]],
         input_text,
         request_obj,
         max_req_input_len,
         **kwargs,
     ):
-        audio_data = request_obj.audio_data
-        if not image_data and not audio_data:
-            return None
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-        if not isinstance(audio_data, list):
-            audio_data = [audio_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             max_req_input_len=max_req_input_len,
sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py

@@ -1,10 +1,8 @@
 from typing import List, Union
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor,
-)
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
 from sglang.srt.utils import load_image
 
 
@@ -17,21 +15,11 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)
 
-
-        image_data = [image_data]
-
-        if len(image_data) > 0:
-            images = [load_image(image)[0] for image in image_data]
-        else:
-            images = load_image(image_data[0])[0]
-
+        images = [load_image(image)[0] for image in image_data]
         image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
         image_inputs["mm_items"] = [
sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py

@@ -7,12 +7,12 @@ from transformers.models.llama4.image_processing_llama4_fast import (
     get_best_fit,
 )
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
 
 
 class Mllama4ImageProcessor(BaseMultimodalProcessor):
@@ -37,9 +37,6 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
         if isinstance(input_text, list):
             assert len(input_text) and isinstance(input_text[0], int)
             input_text = self._processor.tokenizer.decode(input_text)
@@ -63,70 +60,72 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         )
 
         # Handle image resolutions and aspect ratios
-        if "pixel_values" in processor_output:
-
-
+        if "pixel_values" not in processor_output:  # no image processed
+            return None
+
+        image_processor = processor.image_processor
+        tokenizer = self._processor.tokenizer
 
-
-
-
+        # Calculate tile size and find supported resolutions
+        tile_size = self.vision_config.image_size
+        max_num_tiles = getattr(self.vision_config, "max_patches", 1)
 
-
-
-
+        possible_resolutions = find_supported_resolutions(
+            max_num_chunks=max_num_tiles,
+            patch_size=SizeDict(height=tile_size, width=tile_size),
+        )
+
+        # Find best fit for each image
+        best_fit_sizes = [
+            get_best_fit(
+                (image.size[1], image.size[0]),  # (height, width)
+                torch.tensor(possible_resolutions),
+                resize_to_max_canvas=image_processor.resize_to_max_canvas,
             )
+            for image in processed_data.images
+        ]
+
+        # Calculate aspect ratios and patches per image
+        aspect_ratios = [
+            (image_size[0] // tile_size, image_size[1] // tile_size)
+            for image_size in best_fit_sizes
+        ]
+
+        patches_per_image = [
+            1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
+        ]
+
+        # Add to image_inputs
+        processor_output["aspect_ratios"] = aspect_ratios
+        processor_output["patches_per_image"] = torch.tensor(patches_per_image)
+
+        # Process embed_is_patch
+        vocab = tokenizer.get_vocab()
+        patch_id = vocab.get(processor.img_patch_token, -1)
+        image_end_id = vocab.get(processor.end_of_img_token, -1)
+
+        if patch_id != -1 and image_end_id != -1:
+            input_ids = processor_output["input_ids"].view(-1)
+
+            # Remove BOS token if present
+            if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
+                input_ids = input_ids[1:]
+
+            # Find image end indices and split input_ids
+            image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
+
+            if image_end_indices.size(0) > 0:
+                # Split at image boundaries
+                split_indices = (image_end_indices + 1)[:-1]
+                split_input_ids = torch.tensor_split(input_ids, split_indices)
+                split_input_ids = [x for x in split_input_ids if x.numel() > 0]
+
+                # Create embed_is_patch for each image
+                embed_is_patch = []
+                for per_image_input_ids in split_input_ids:
+                    embed_is_patch.append(per_image_input_ids == patch_id)
 
-
-            best_fit_sizes = [
-                get_best_fit(
-                    (image.size[1], image.size[0]),  # (height, width)
-                    torch.tensor(possible_resolutions),
-                    resize_to_max_canvas=image_processor.resize_to_max_canvas,
-                )
-                for image in processed_data.images
-            ]
-
-            # Calculate aspect ratios and patches per image
-            aspect_ratios = [
-                (image_size[0] // tile_size, image_size[1] // tile_size)
-                for image_size in best_fit_sizes
-            ]
-
-            patches_per_image = [
-                1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
-            ]
-
-            # Add to image_inputs
-            processor_output["aspect_ratios"] = aspect_ratios
-            processor_output["patches_per_image"] = torch.tensor(patches_per_image)
-
-            # Process embed_is_patch
-            vocab = tokenizer.get_vocab()
-            patch_id = vocab.get(processor.img_patch_token, -1)
-            image_end_id = vocab.get(processor.end_of_img_token, -1)
-
-            if patch_id != -1 and image_end_id != -1:
-                input_ids = processor_output["input_ids"].view(-1)
-
-                # Remove BOS token if present
-                if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
-                    input_ids = input_ids[1:]
-
-                # Find image end indices and split input_ids
-                image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
-
-                if image_end_indices.size(0) > 0:
-                    # Split at image boundaries
-                    split_indices = (image_end_indices + 1)[:-1]
-                    split_input_ids = torch.tensor_split(input_ids, split_indices)
-                    split_input_ids = [x for x in split_input_ids if x.numel() > 0]
-
-                    # Create embed_is_patch for each image
-                    embed_is_patch = []
-                    for per_image_input_ids in split_input_ids:
-                        embed_is_patch.append(per_image_input_ids == patch_id)
-
-                    processor_output["embed_is_patch"] = embed_is_patch
+                processor_output["embed_is_patch"] = embed_is_patch
 
         # Convert to the format expected by SGLang
         processor_output["input_ids"] = processor_output["input_ids"].tolist()[0]
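The aspect-ratio and patch arithmetic added above can be sanity-checked in isolation. A standalone sketch, assuming an illustrative tile size of 336 and two made-up best-fit resolutions (in the diff these come from `vision_config.image_size` and `get_best_fit`):

```python
import torch

# Illustrative inputs only; the real values are produced by the processor.
tile_size = 336
best_fit_sizes = [(336, 336), (672, 1008)]  # (height, width)

aspect_ratios = [
    (h // tile_size, w // tile_size) for (h, w) in best_fit_sizes
]  # [(1, 1), (2, 3)]

# A single-tile image stays at 1 patch; multi-tile images get one extra global tile.
patches_per_image = [
    1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
]  # [1, 7]

print(aspect_ratios, torch.tensor(patches_per_image))
```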
sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py

@@ -1,12 +1,12 @@
 import logging
 from typing import List, Union
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.phi4mm import Phi4MMForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.phi4mm import Phi4MMForCausalLM
 
 logger = logging.getLogger(__name__)
 
@@ -26,22 +26,12 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
+        audio_data,
         input_text,
         request_obj,
         max_req_input_len,
         **kwargs,
     ):
-        audio_data = request_obj.audio_data
-
-        if not image_data and not audio_data:
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        if not isinstance(audio_data, list):
-            audio_data = [audio_data]
-
         if audio_data:
             logger.warning(
                 "Currently SGLang does not support audio data for Phi4MM. We are working on it. You can file an issue to help us prioritize."
sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py

@@ -6,12 +6,12 @@ from transformers.models.pixtral.image_processing_pixtral import (
     _num_image_tokens as _get_pixtral_hf_num_image_tokens,
 )
 
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.pixtral import PixtralVisionModel
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.pixtral import PixtralVisionModel
 
 
 class PixtralProcessor(BaseMultimodalProcessor):
@@ -78,12 +78,6 @@ class PixtralProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if not image_data:
-            return None
-
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         mm_data = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.multimodal_tokens,
sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py

@@ -3,19 +3,15 @@ import math
 import re
 from typing import Dict, List, Union
 
-import torch
 from PIL import Image
 
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    BaseMultimodalProcessor as SGLangBaseProcessor,
-)
-from sglang.srt.managers.multimodal_processors.base_processor import (
-    MultimodalSpecialTokens,
-)
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 
 # Compatible with Qwen2VL and Qwen2_5VL
@@ -51,9 +47,6 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -132,12 +125,13 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         video_grid_thw = None  # TODO
 
-
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
-        if
+        if not mm_items:
             # Note(Xinyuan): This is the case where image loading fails.
             return None
 
+        combined_mm_item = mm_items[0]  # only image is supported for now
         video_grid_thw = None  # TODO
         second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)
 
@@ -159,7 +153,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         return {
             "input_ids": input_ids.tolist(),
-            "mm_items":
+            "mm_items": mm_items,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
             "im_token_id": self.IM_TOKEN_ID,
sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py

@@ -10,12 +10,12 @@ from sglang.srt.managers.io_struct import (
     GenerateReqInput,
     ImageDataItem,
 )
-from sglang.srt.managers.multimodal_processors.base_processor import (
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.models.vila import VILAForConditionalGeneration
 from sglang.srt.server_args import ServerArgs
 
 
@@ -37,6 +37,8 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         _processor: VILAProcessor,
     ) -> None:
         super().__init__(hf_config, server_args, _processor)
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.VIDEO_TOKEN_ID = hf_config.video_token_id
 
     async def process_mm_data_async(
         self,
@@ -46,13 +48,7 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         max_req_input_len: int,
         **kwargs,
     ) -> Optional[Dict[str, Any]]:
-
-            return None
-
-        if not isinstance(image_data, list):
-            image_data = [image_data]
-
-        mm_data = self.load_mm_data(
+        base_output = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=MultimodalSpecialTokens(
                 image_token=self._processor.tokenizer.image_token
@@ -61,25 +57,11 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
         )
 
-
-            input_text=mm_data.input_text,
-            images=mm_data.images,
-        )
-
-        image_offsets = self.get_mm_items_offset(
-            input_ids=inputs.input_ids[0],
-            mm_token_id=cast(int, self._processor.tokenizer.image_token_id),
-        )
+        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
 
-
-
-
-
-
-
-        ]
-
-        return dict(
-            input_ids=inputs.input_ids[0].tolist(),
-            mm_items=mm_items,
-        )
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.IM_TOKEN_ID,
+            "video_token_id": self.VIDEO_TOKEN_ID,
+        }
sglang/srt/operations_strategy.py

@@ -71,7 +71,9 @@ def _compute_moe_deepseek_layer_operations_strategy_tbo(
     assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
     if forward_mode == ForwardMode.EXTEND:
         return _compute_moe_deepseek_blog_prefill(layer)
-    elif
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
         return _compute_moe_deepseek_blog_decode(layer)
     else:
         raise NotImplementedError(f"Unsupported {forward_mode=}")
@@ -146,7 +148,9 @@ def _compute_moe_qwen3_layer_operations_strategy_tbo(
     assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
     if forward_mode == ForwardMode.EXTEND:
         return _compute_moe_qwen3_prefill(layer)
-    elif
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
        return _compute_moe_qwen3_decode(layer)
     else:
         raise NotImplementedError(f"Unsupported {forward_mode=}")
sglang/srt/reasoning_parser.py
CHANGED
@@ -66,6 +66,13 @@ class BaseReasoningFormatDetector:
         self._buffer += new_text
         current_text = self._buffer
 
+        # If the current text is a prefix of the think token, keep buffering
+        if any(
+            token.startswith(current_text) and token != current_text
+            for token in [self.think_start_token, self.think_end_token]
+        ):
+            return StreamingParseResult()
+
         # Strip `<think>` token if present
         if not self.stripped_think_start and self.think_start_token in current_text:
             current_text = current_text.replace(self.think_start_token, "")
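The new guard above keeps buffering whenever the accumulated text is still a strict prefix of one of the think tokens, so partial tags are never emitted as normal text. A self-contained illustration of that check (the token strings and sample chunks are made up for the example):

```python
THINK_START, THINK_END = "<think>", "</think>"

def is_partial_think_token(current_text: str) -> bool:
    # Mirrors the added condition: true while the buffer could still grow into a full tag.
    return any(
        token.startswith(current_text) and token != current_text
        for token in (THINK_START, THINK_END)
    )

print(is_partial_think_token("<th"))      # True  -> keep buffering, emit nothing yet
print(is_partial_think_token("<think>"))  # False -> the tag is complete, proceed to strip it
print(is_partial_think_token("hello"))    # False -> ordinary text, handled as usual
```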
@@ -150,6 +157,24 @@ class Qwen3Detector(BaseReasoningFormatDetector):
         )
 
 
+class KimiDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Kimi Thinking model.
+    Assumes reasoning format:
+      ◁think▷*(.*)◁/think▷
+    Returns all the text before the ◁/think▷ tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        super().__init__(
+            "◁think▷",
+            "◁/think▷",
+            force_reasoning=False,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -164,6 +189,7 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
+        "kimi": KimiDetector,
     }
 
     def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
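With the new entry in `DetectorMap`, Kimi-style reasoning output can be selected by model type. A minimal construction sketch, assuming `ReasoningParser` resolves `model_type` through `DetectorMap` as the mapping above suggests (only the constructor and the map are shown in this diff, so parsing calls are omitted):

```python
from sglang.srt.reasoning_parser import ReasoningParser

# "kimi" now maps to KimiDetector, which recognizes ◁think▷ ... ◁/think▷ blocks.
print(ReasoningParser.DetectorMap["kimi"].__name__)  # KimiDetector
parser = ReasoningParser(model_type="kimi", stream_reasoning=True)
```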
sglang/srt/sampling/sampling_batch_info.py

@@ -10,7 +10,6 @@ import torch
 import sglang.srt.sampling.penaltylib as penaltylib
 from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
 from sglang.srt.sampling.sampling_params import TOP_K_ALL
-from sglang.srt.utils import merge_bias_tensor
 
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
@@ -345,3 +344,42 @@ class SamplingBatchInfo:
         self.logit_bias = merge_bias_tensor(
             self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
         )
+
+
+def merge_bias_tensor(
+    lhs: Optional[torch.Tensor],
+    rhs: Optional[torch.Tensor],
+    bs1: int,
+    bs2: int,
+    device: str,
+    default: float,
+):
+    """Merge two bias tensors for batch merging.
+
+    Args:
+        lhs: Left-hand side tensor
+        rhs: Right-hand side tensor
+        bs1: Batch size of left-hand side tensor
+        bs2: Batch size of right-hand side tensor
+        device: Device to place the merged tensor on
+        default: Default value for missing tensor elements
+
+    Returns:
+        Merged tensor or None if both inputs are None
+    """
+    if lhs is None and rhs is None:
+        return None
+
+    if lhs is not None and rhs is not None:
+        return torch.cat([lhs, rhs])
+    else:
+        if lhs is not None:
+            shape, dtype = lhs.shape[1:], lhs.dtype
+        else:
+            shape, dtype = rhs.shape[1:], rhs.dtype
+
+        if lhs is None:
+            lhs = torch.empty((bs1, *shape), device=device, dtype=dtype).fill_(default)
+        if rhs is None:
+            rhs = torch.empty((bs2, *shape), device=device, dtype=dtype).fill_(default)
+        return torch.cat([lhs, rhs])
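`merge_bias_tensor` is now defined at module level in `sampling_batch_info.py` (the import from `sglang.srt.utils` was dropped above). A short usage sketch with illustrative shapes and values:

```python
import torch

from sglang.srt.sampling.sampling_batch_info import merge_bias_tensor

# One side carries a per-request logit bias, the other does not: the missing side
# is filled with the default value so the two batches can be concatenated.
lhs = torch.tensor([[0.5, -1.0], [0.0, 2.0]])  # 2 requests, illustrative vocab size of 2
merged = merge_bias_tensor(lhs, None, bs1=2, bs2=3, device="cpu", default=0.0)
print(merged.shape)  # torch.Size([5, 2]) -> 2 real rows plus 3 default-filled rows

# If neither batch has a bias, the merged batch has none either.
assert merge_bias_tensor(None, None, bs1=2, bs2=3, device="cpu", default=0.0) is None
```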