sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +72 -10
- sglang/srt/_custom_ops.py +59 -92
- sglang/srt/configs/deepseekvl2.py +10 -1
- sglang/srt/configs/model_config.py +6 -16
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/custom_op.py +5 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/parallel_state.py +32 -5
- sglang/srt/entrypoints/engine.py +0 -5
- sglang/srt/entrypoints/http_server.py +7 -1
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/attention/flashattention_backend.py +582 -125
- sglang/srt/layers/attention/flashinfer_backend.py +5 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/dp_attention.py +12 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
- sglang/srt/layers/moe/ep_moe/layer.py +79 -80
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
- sglang/srt/layers/moe/topk.py +79 -6
- sglang/srt/layers/quantization/__init__.py +137 -165
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
- sglang/srt/layers/quantization/fp8_kernel.py +2 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -4
- sglang/srt/layers/quantization/gptq.py +30 -40
- sglang/srt/layers/quantization/moe_wna16.py +501 -0
- sglang/srt/layers/quantization/utils.py +1 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +19 -33
- sglang/srt/lora/lora_manager.py +20 -7
- sglang/srt/lora/mem_pool.py +12 -6
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +6 -0
- sglang/srt/managers/cache_controller.py +34 -11
- sglang/srt/managers/io_struct.py +4 -2
- sglang/srt/managers/mm_utils.py +202 -156
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
- sglang/srt/managers/multimodal_processors/clip.py +44 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
- sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
- sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
- sglang/srt/managers/multimodal_processors/llava.py +34 -14
- sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
- sglang/srt/managers/multimodal_processors/mlama.py +10 -23
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
- sglang/srt/managers/schedule_batch.py +185 -127
- sglang/srt/managers/scheduler.py +29 -23
- sglang/srt/managers/tokenizer_manager.py +1 -2
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/utils.py +1 -6
- sglang/srt/mem_cache/hiradix_cache.py +62 -52
- sglang/srt/mem_cache/memory_pool.py +72 -6
- sglang/srt/mem_cache/paged_allocator.py +39 -0
- sglang/srt/metrics/collector.py +23 -53
- sglang/srt/model_executor/cuda_graph_runner.py +16 -13
- sglang/srt/model_executor/forward_batch_info.py +10 -10
- sglang/srt/model_executor/model_runner.py +64 -59
- sglang/srt/model_loader/loader.py +19 -1
- sglang/srt/model_loader/weight_utils.py +6 -3
- sglang/srt/models/clip.py +568 -0
- sglang/srt/models/deepseek_janus_pro.py +12 -17
- sglang/srt/models/deepseek_v2.py +339 -123
- sglang/srt/models/deepseek_vl2.py +105 -104
- sglang/srt/models/gemma3_causal.py +12 -2
- sglang/srt/models/gemma3_mm.py +20 -80
- sglang/srt/models/llama.py +4 -1
- sglang/srt/models/llava.py +31 -19
- sglang/srt/models/llavavid.py +16 -7
- sglang/srt/models/minicpmo.py +63 -147
- sglang/srt/models/minicpmv.py +17 -27
- sglang/srt/models/mllama.py +29 -14
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_5_vl.py +21 -31
- sglang/srt/models/qwen2_vl.py +20 -21
- sglang/srt/openai_api/adapter.py +106 -93
- sglang/srt/openai_api/protocol.py +10 -5
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/platforms/interface.py +371 -0
- sglang/srt/server_args.py +120 -25
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
- sglang/srt/speculative/eagle_utils.py +140 -28
- sglang/srt/speculative/eagle_worker.py +94 -25
- sglang/srt/utils.py +137 -51
- sglang/test/runners.py +27 -2
- sglang/test/test_custom_ops.py +55 -0
- sglang/test/test_utils.py +14 -27
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/minicpm.py

@@ -1,13 +1,13 @@
-import asyncio
 from typing import List, Union
 
 import torch
+from transformers import BaseImageProcessorFast
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.minicpmo import MiniCPMO
 from sglang.srt.models.minicpmv import MiniCPMV
 
@@ -21,19 +21,23 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         self.image_token = "(<image>./</image>)"
         self.audio_token = "(<audio>./</audio>)"
 
-    @staticmethod
-    def _process_data_task(input_text, images=None, audios=None):
+    def process_data_task(self, input_text, images=None, audios=None):
 
         if isinstance(images, list) and len(images) == 0:
             images = None
         if isinstance(audios, list) and len(audios) == 0:
            audios = None
-        result = get_global_processor().__call__(
+        processor = self._processor
+        args = {}
+        if isinstance(processor, BaseImageProcessorFast):
+            args["device"] = "cuda"
+        result = self._processor.__call__(
             text=input_text,
             images=images,
             audios=audios,
             return_tensors="pt",
             chunk_input=True,
+            **args,
         )
         return {
             "input_ids": result.input_ids,
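The hunk above only passes `device="cuda"` when the underlying processor is a `transformers.BaseImageProcessorFast`, since only the fast (torch-backed) image processors accept a device argument. A minimal sketch of that dispatch; the commented usage and model name are illustrative assumptions, not part of the diff:

```python
from transformers import BaseImageProcessorFast


def build_processor_kwargs(processor, device="cuda"):
    """Only pass `device` for fast image processors; slow ones reject it.

    Mirrors the dispatch added in the minicpm.py hunk above.
    """
    kwargs = {}
    if isinstance(processor, BaseImageProcessorFast):
        kwargs["device"] = device
    return kwargs


# Illustrative usage (assumes a processor has been loaded elsewhere):
# out = processor(text=prompt, images=images, return_tensors="pt",
#                 **build_processor_kwargs(processor))
```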
@@ -44,23 +48,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             "audio_bounds": getattr(result, "audio_bounds", None),
         }
 
-    async def _process_data(self, images, input_text, audios=None):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            multimodal_data_inputs = await loop.run_in_executor(
-                self.executor,
-                MiniCPMMultimodalProcessor._process_data_task,
-                input_text,
-                images,
-                audios,
-            )
-        else:
-            multimodal_data_inputs = self._processor(
-                images=images, text=input_text, audios=audios, return_tensors="pt"
-            )
-
-        return multimodal_data_inputs
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
@@ -77,7 +64,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             audio_data = [audio_data]
 
         base_output = self.load_mm_data(
-            input_ids=input_ids,
+            prompt=input_ids,
             max_req_input_len=max_req_input_len,
             audio_data=audio_data,
             image_data=image_data,
@@ -88,9 +75,9 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         if base_output is None:
             return None
 
-        res = await self._process_data(
-            images=base_output.images,
+        res = self.process_mm_data(
             input_text=base_output.input_text,
+            images=base_output.images,
             audios=base_output.audios,
         )
 
@@ -142,23 +129,33 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             tgt_sizes_flat += [tgt_n]
 
         pixel_values = pixel_values_flat
-
-
-
-
-
-
+
+        items = []
+        if len(pixel_values) != 0:
+            item = MultimodalDataItem(
+                pixel_values=pixel_values,
+                tgt_size=tgt_sizes_flat,
+                modality=Modality.IMAGE,
+            )
+            items += [item]
+
+        if (
+            "audio_features" in res
+            and res["audio_features"] is not None
+            and len(res["audio_features"]) != 0
+        ):
+            item = MultimodalDataItem(
+                audio_features=[res["audio_features"]],
+                audio_feature_lens=res["audio_feature_lens"],
+                modality=Modality.AUDIO,
+            )
+            items += [item]
+
         return {
+            "mm_items": items,
             "input_ids": res["input_ids"].flatten().tolist(),
-            "pixel_values": pixel_values,
-            "tgt_sizes": tgt_sizes,
-            "data_hashes": base_output.mm_data_hashes,
-            "modalities": request_obj.modalities or ["image"],
             "audio_start_id": audio_start_id,
             "audio_end_id": audio_end_id,
-            "audio_features": res["audio_features"],
-            "audio_bounds": res["audio_bounds"],
-            "audio_feature_lens": res["audio_feature_lens"],
             "im_token_id": im_token_id,
             "im_start_id": tokenizer.im_start_id,
             "im_end_id": tokenizer.im_end_id,
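The minicpm.py hunks above replace the flat per-modality keys in the processor output ("pixel_values", "tgt_sizes", "audio_features", ...) with a list of `MultimodalDataItem` objects tagged by `Modality`. The sketch below illustrates that grouping pattern with simplified stand-in classes; the field names mirror the diff, but the real `MultimodalDataItem` in `sglang.srt.managers.schedule_batch` carries more fields than shown here:

```python
from dataclasses import dataclass
from enum import Enum, auto
from typing import Any, List, Optional


class Modality(Enum):
    IMAGE = auto()
    AUDIO = auto()


@dataclass
class MultimodalDataItem:
    """Simplified stand-in for sglang.srt.managers.schedule_batch.MultimodalDataItem."""
    modality: Modality
    pixel_values: Optional[Any] = None
    tgt_size: Optional[Any] = None
    audio_features: Optional[Any] = None
    audio_feature_lens: Optional[Any] = None


def collect_mm_items(pixel_values, tgt_sizes_flat, audio_features=None, audio_feature_lens=None):
    """Group processor outputs into per-modality items, mirroring the new return shape."""
    items: List[MultimodalDataItem] = []
    if len(pixel_values) != 0:
        items.append(
            MultimodalDataItem(
                modality=Modality.IMAGE,
                pixel_values=pixel_values,
                tgt_size=tgt_sizes_flat,
            )
        )
    if audio_features is not None and len(audio_features) != 0:
        items.append(
            MultimodalDataItem(
                modality=Modality.AUDIO,
                audio_features=[audio_features],
                audio_feature_lens=audio_feature_lens,
            )
        )
    return items


if __name__ == "__main__":
    print(collect_mm_items(pixel_values=[[0.1, 0.2]], tgt_sizes_flat=[(14, 14)]))
```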
sglang/srt/managers/multimodal_processors/mlama.py

@@ -1,10 +1,9 @@
-import asyncio
 from typing import List, Union
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
 from sglang.srt.utils import load_image
 
@@ -15,25 +14,6 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
 
-    @staticmethod
-    def _process_single_image_task(images, input_text):
-        # input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'
-        return get_global_processor()(images, input_text, return_tensors="pt")
-
-    async def _process_single_image(self, images, input_text):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                MllamaImageProcessor._process_single_image_task,
-                images,
-                input_text,
-            )
-        else:
-            image_inputs = self._processor(images, input_text, return_tensors="pt")
-
-        return image_inputs
-
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
@@ -52,8 +32,15 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
         else:
             images = load_image(image_data[0])[0]
 
-        image_inputs = await self._process_single_image(images, input_text)
-        image_inputs["data_hashes"] = [hash(str(image_data))]
+        image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+        image_inputs["mm_items"] = [
+            MultimodalDataItem(
+                pixel_values=image_inputs["pixel_values"],
+                aspect_ratio_id=image_inputs["aspect_ratio_ids"],
+                aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
+                modality=Modality.IMAGE,
+            )
+        ]
 
         return image_inputs
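Both processors above drop the same boilerplate: a `@staticmethod` task plus an async wrapper that pushed the blocking Hugging Face processor call onto `self.executor` via `loop.run_in_executor`, now replaced by a single `self.process_mm_data(...)` call. For reference, the removed pattern is sketched below in a generic, self-contained form; names such as `blocking_preprocess` are illustrative stand-ins, not sglang APIs:

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor


def blocking_preprocess(images, input_text):
    # Stand-in for a blocking call such as processor(images, input_text, return_tensors="pt").
    return {"input_text": input_text, "num_images": len(images)}


async def preprocess_async(images, input_text, executor=None):
    """Old-style pattern removed by this diff: run the blocking preprocessing
    on a worker thread so the event loop is not blocked."""
    if executor is not None:
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(executor, blocking_preprocess, images, input_text)
    return blocking_preprocess(images, input_text)


if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=1) as pool:
        print(asyncio.run(preprocess_async(["img-0"], "hello <image>", executor=pool)))
```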
sglang/srt/managers/multimodal_processors/qwen_vl.py

@@ -1,18 +1,17 @@
 import asyncio
 import math
-import time
 from typing import List, Union
 
 import torch
 from PIL import Image
 
-from sglang.srt.managers.
+from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
 from sglang.srt.managers.multimodal_processors.base_processor import (
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
 
@@ -34,45 +33,15 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.MAX_PIXELS = 16384 * 28 * 28
         self.MAX_RATIO = 200
 
-    @staticmethod
-    def _process_images_task(images, input_text, _hf_config):
-        if isinstance(images, list) and len(images) == 0:
-            images = None
-        result = get_global_processor().__call__(
-            text=[input_text], images=images, padding=True, return_tensors="pt"
-        )
-
-        return {
-            "input_ids": result.input_ids,
-            "pixel_values": getattr(result, "pixel_values", None),
-            "image_grid_thw": getattr(result, "image_grid_thw", None),
-            "second_per_grid_ts": getattr(result, "second_per_grid_ts", None),
-            "video_grid_thws": getattr(result, "video_grid_thws", None),
-        }
-
-    async def _process_single_image(self, images, input_text) -> dict:
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            return await loop.run_in_executor(
-                self.executor,
-                Qwen2_5VLImageProcessor._process_images_task,
-                images,
-                input_text,
-                self.hf_config,
-            )
-        else:
-            return self._process_images_task(images, input_text, self.hf_config)
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
-        input_ids,
+        prompt,
         request_obj,
         max_req_input_len,
         *args,
         **kwargs,
     ):
-        start = time.time()
         if not image_data:
             return None
         if isinstance(image_data, str):
@@ -80,7 +49,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
-            input_ids=input_ids,
+            prompt=prompt,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,
@@ -144,24 +113,32 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
             return math.floor(number / factor) * factor
 
-
+        async def resize_image_async(image):
+            return resize_image(image)
 
-
-
+        resize_tasks = [resize_image_async(image) for image in base_output.images]
+        resized_images = await asyncio.gather(*resize_tasks)
+
+        ret = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=resized_images,
         )
 
         image_grid_thws = torch.concat([ret["image_grid_thw"]])
-        video_grid_thws = None
         return {
             "input_ids": ret["input_ids"].flatten().tolist(),
-
-
-
-
-
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=ret["pixel_values"],
+                    image_grid_thws=image_grid_thws,
+                    # TODO
+                    video_grid_thws=None,
+                    second_per_grid_ts=ret.get("second_per_grid_ts", None),
+                    modality=Modality.IMAGE,
+                )
+            ],
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
             "im_token_id": self.image_token_id,
             "video_token_id": self.video_token_id,
-            "second_per_grid_ts": ret["second_per_grid_ts"],
         }
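The final qwen_vl.py hunk wraps the synchronous `resize_image` helper in a small coroutine and fans the per-image work out with `asyncio.gather` before calling `process_mm_data`. A minimal, self-contained sketch of that fan-out pattern; the `resize_image` body here is a placeholder, not the processor's real patch-aligned resize logic:

```python
import asyncio


def resize_image(size, factor=28):
    # Placeholder resize: round each dimension down to a multiple of `factor`,
    # standing in for the real resize helper referenced in the diff.
    w, h = size
    return (max(factor, w // factor * factor), max(factor, h // factor * factor))


async def resize_images(sizes):
    """Mirror the diff's pattern: wrap the sync helper in a coroutine and
    gather the per-image tasks."""
    async def resize_image_async(size):
        return resize_image(size)

    resize_tasks = [resize_image_async(size) for size in sizes]
    return await asyncio.gather(*resize_tasks)


if __name__ == "__main__":
    print(asyncio.run(resize_images([(640, 480), (1023, 511)])))
```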