sglang 0.4.2__py3-none-any.whl → 0.4.2.post2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- sglang/srt/constrained/outlines_backend.py +9 -1
- sglang/srt/custom_op.py +40 -0
- sglang/srt/entrypoints/engine.py +2 -2
- sglang/srt/layers/activation.py +10 -5
- sglang/srt/layers/attention/flashinfer_backend.py +284 -39
- sglang/srt/layers/attention/triton_backend.py +71 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +53 -59
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
- sglang/srt/layers/attention/vision.py +243 -40
- sglang/srt/layers/layernorm.py +1 -5
- sglang/srt/layers/moe/ep_moe/layer.py +1 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +178 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +175 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -11
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -3
- sglang/srt/layers/moe/topk.py +4 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +7 -0
- sglang/srt/layers/quantization/fp8_kernel.py +140 -2
- sglang/srt/layers/rotary_embedding.py +29 -15
- sglang/srt/layers/sampler.py +9 -6
- sglang/srt/lora/backend/__init__.py +8 -0
- sglang/srt/lora/backend/base_backend.py +95 -0
- sglang/srt/lora/backend/flashinfer_backend.py +91 -0
- sglang/srt/lora/backend/triton_backend.py +61 -0
- sglang/srt/lora/lora.py +127 -112
- sglang/srt/lora/lora_manager.py +50 -18
- sglang/srt/lora/triton_ops/__init__.py +5 -0
- sglang/srt/lora/triton_ops/qkv_lora_b.py +182 -0
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +143 -0
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +159 -0
- sglang/srt/managers/image_processor.py +77 -38
- sglang/srt/managers/scheduler.py +17 -3
- sglang/srt/mem_cache/base_prefix_cache.py +4 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -0
- sglang/srt/mem_cache/radix_cache.py +30 -1
- sglang/srt/model_executor/cuda_graph_runner.py +77 -80
- sglang/srt/model_executor/forward_batch_info.py +58 -59
- sglang/srt/model_executor/model_runner.py +2 -2
- sglang/srt/models/minicpmv.py +129 -76
- sglang/srt/models/mllama.py +16 -56
- sglang/srt/models/qwen2.py +4 -1
- sglang/srt/models/qwen2_vl.py +19 -9
- sglang/srt/server_args.py +19 -2
- sglang/srt/speculative/build_eagle_tree.py +4 -2
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +213 -0
- sglang/srt/speculative/eagle_utils.py +361 -372
- sglang/srt/speculative/eagle_worker.py +177 -45
- sglang/srt/utils.py +7 -2
- sglang/test/runners.py +2 -0
- sglang/utils.py +42 -0
- sglang/version.py +1 -1
- {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/METADATA +16 -7
- {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/RECORD +84 -45
- sglang/srt/layers/custom_op_util.py +0 -25
- {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/LICENSE +0 -0
- {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/top_level.txt +0 -0
New file: sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
@@ -0,0 +1,213 @@

```python
from __future__ import annotations

import bisect
import time
from typing import TYPE_CHECKING, Callable

import torch

from sglang.srt.model_executor.cuda_graph_runner import (
    CudaGraphRunner,
    get_batch_sizes_to_capture,
    get_global_graph_memory_pool,
    set_global_graph_memory_pool,
    set_torch_compile_config,
)
from sglang.srt.model_executor.forward_batch_info import (
    CaptureHiddenMode,
    ForwardBatch,
    ForwardMode,
)
from sglang.srt.speculative.eagle_utils import EagleDraftInput

if TYPE_CHECKING:
    from sglang.srt.model_executor.model_runner import ModelRunner
    from sglang.srt.speculative.eagle_worker import EAGLEWorker


class EAGLEDraftCudaGraphRunner:
    def __init__(self, eagle_worker: EAGLEWorker):
        # Parse args
        self.eagle_worker = eagle_worker
        self.model_runner = model_runner = eagle_worker.model_runner
        self.graphs = {}
        self.output_buffers = {}
        self.enable_torch_compile = model_runner.server_args.enable_torch_compile
        self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
        self.tp_size = self.model_runner.tp_size
        self.dp_size = model_runner.server_args.dp_size
        self.topk = model_runner.server_args.speculative_eagle_topk
        self.speculative_num_steps = model_runner.server_args.speculative_num_steps
        server_args = model_runner.server_args

        assert self.disable_padding

        # Batch sizes to capture
        self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
        self.num_tokens_per_bs = server_args.speculative_eagle_topk

        # Attention backend
        self.max_bs = max(self.capture_bs)
        self.max_num_token = self.max_bs * self.num_tokens_per_bs
        self.model_runner.draft_attn_backend.init_cuda_graph_state(self.max_num_token)
        self.seq_len_fill_value = self.model_runner.draft_attn_backend.attn_backends[
            0
        ].get_cuda_graph_seq_len_fill_value()

        if self.enable_torch_compile:
            set_torch_compile_config()

        # Graph inputs
        with torch.device("cuda"):
            self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
            self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
            self.seq_lens = torch.full(
                (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
            )
            self.out_cache_loc = torch.zeros(
                (self.max_num_token * self.speculative_num_steps,), dtype=torch.int64
            )
            self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
            self.topk_p = torch.zeros((self.max_bs, self.topk), dtype=torch.float32)
            self.topk_index = torch.zeros((self.max_bs, self.topk), dtype=torch.int64)
            self.hidden_states = torch.zeros(
                (self.max_bs, self.model_runner.model_config.hidden_size),
                dtype=self.model_runner.dtype,
            )

        # Capture
        try:
            self.capture()
        except RuntimeError as e:
            raise Exception(
                f"Capture cuda graph failed: {e}\n"
                "Possible solutions:\n"
                "1. disable cuda graph by --disable-cuda-graph\n"
                "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                "3. disable torch compile by not using --enable-torch-compile\n"
                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
            )

    def can_run(self, forward_batch: ForwardBatch):
        is_bs_supported = (
            forward_batch.batch_size in self.graphs
            if self.disable_padding
            else forward_batch.batch_size <= self.max_bs
        )
        return is_bs_supported

    def capture(self):
        CudaGraphRunner.capture(self)

    def capture_one_batch_size(self, num_seqs: int, forward: Callable):
        graph = torch.cuda.CUDAGraph()
        stream = self.stream
        num_tokens = num_seqs * self.num_tokens_per_bs

        # Graph inputs
        req_pool_indices = self.req_pool_indices[:num_seqs]
        seq_lens = self.seq_lens[:num_seqs]
        out_cache_loc = self.out_cache_loc[: num_tokens * self.speculative_num_steps]
        positions = self.positions[:num_tokens]
        topk_p = self.topk_p[:num_seqs]
        topk_index = self.topk_index[:num_seqs]
        hidden_states = self.hidden_states[:num_seqs]

        spec_info = EagleDraftInput(
            topk_p=topk_p,
            topk_index=topk_index,
            hidden_states=hidden_states,
        )

        # Forward batch
        forward_batch = ForwardBatch(
            forward_mode=ForwardMode.DECODE,
            batch_size=num_seqs,
            input_ids=None,
            req_pool_indices=req_pool_indices,
            seq_lens=seq_lens,
            req_to_token_pool=self.model_runner.req_to_token_pool,
            token_to_kv_pool=self.model_runner.token_to_kv_pool,
            out_cache_loc=out_cache_loc,
            seq_lens_sum=seq_lens.sum(),
            return_logprob=False,
            positions=positions,
            spec_algorithm=self.model_runner.spec_algorithm,
            spec_info=spec_info,
            capture_hidden_mode=(
                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
            ),
        )

        # Attention backend
        self.model_runner.draft_attn_backend.init_forward_metadata_capture_cuda_graph(
            forward_batch
        )

        # Run and capture
        def run_once():
            # Backup two fields, which will be modified in-place in `draft_forward`.
            output_cache_loc_backup = forward_batch.out_cache_loc
            hidden_states_backup = forward_batch.spec_info.hidden_states

            ret = self.eagle_worker.draft_forward(forward_batch)

            forward_batch.out_cache_loc = output_cache_loc_backup
            forward_batch.spec_info.hidden_states = hidden_states_backup
            return ret

        for _ in range(2):
            torch.cuda.synchronize()
            self.model_runner.tp_group.barrier()

            run_once()

            torch.cuda.synchronize()
            self.model_runner.tp_group.barrier()

        torch.cuda.synchronize()
        self.model_runner.tp_group.barrier()

        with torch.cuda.graph(
            graph, pool=get_global_graph_memory_pool(), stream=stream
        ):
            out = run_once()

        torch.cuda.synchronize()
        self.model_runner.tp_group.barrier()

        set_global_graph_memory_pool(graph.pool())
        return graph, out

    def replay(self, forward_batch: ForwardBatch):
        assert forward_batch.out_cache_loc is not None
        raw_bs = forward_batch.batch_size
        raw_num_token = raw_bs * self.num_tokens_per_bs

        # Pad
        index = bisect.bisect_left(self.capture_bs, raw_bs)
        bs = self.capture_bs[index]
        if bs != raw_bs:
            self.seq_lens.fill_(1)
            self.out_cache_loc.zero_()

        # Common inputs
        self.req_pool_indices[:raw_bs].copy_(forward_batch.req_pool_indices)
        self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
        self.out_cache_loc[: raw_num_token * self.speculative_num_steps].copy_(
            forward_batch.out_cache_loc
        )
        self.positions[:raw_num_token].copy_(forward_batch.positions)
        self.topk_p[:raw_bs].copy_(forward_batch.spec_info.topk_p)
        self.topk_index[:raw_bs].copy_(forward_batch.spec_info.topk_index)
        self.hidden_states[:raw_bs].copy_(forward_batch.spec_info.hidden_states)

        # Attention backend
        self.model_runner.draft_attn_backend.init_forward_metadata_replay_cuda_graph(
            forward_batch
        )

        # Replay
        self.graphs[bs].replay()

        return self.output_buffers[bs]
```
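For orientation, the sketch below shows how a draft worker could drive this runner. It relies only on what is visible in the file above (`EAGLEDraftCudaGraphRunner`, its `can_run` and `replay` methods, and `EAGLEWorker.draft_forward`, which the runner calls during capture) plus the `--disable-cuda-graph` flag mentioned in the error message. The wrapper method names and the surrounding class are illustrative assumptions, not the actual wiring in `sglang/srt/speculative/eagle_worker.py`.

```python
# Illustrative sketch only. The runner class and its can_run/replay methods
# come from the new file above; the worker methods below are hypothetical glue.
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
    EAGLEDraftCudaGraphRunner,
)


class DraftWorkerSketch:
    def init_cuda_graphs(self):
        # Capture one CUDA graph per batch size in capture_bs; skip entirely
        # when the server was started with --disable-cuda-graph.
        self.cuda_graph_runner = None
        if not self.server_args.disable_cuda_graph:
            self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)

    def draft(self, forward_batch: ForwardBatch):
        # Replay the pre-captured graph when this batch size was captured
        # (padding is disabled, so can_run checks membership in self.graphs);
        # otherwise fall back to the eager multi-step draft forward.
        runner = self.cuda_graph_runner
        if runner is not None and runner.can_run(forward_batch):
            return runner.replay(forward_batch)
        return self.draft_forward(forward_batch)
```

Note that the runner captures `speculative_eagle_topk` tokens per request per step (`num_tokens_per_bs`), which is why `replay` copies `raw_bs * topk` positions and `raw_num_token * speculative_num_steps` out-of-cache locations into the static graph buffers before replaying.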