sglang 0.4.2.post1__py3-none-any.whl → 0.4.2.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. sglang/srt/constrained/outlines_backend.py +9 -1
  2. sglang/srt/custom_op.py +40 -0
  3. sglang/srt/entrypoints/engine.py +2 -2
  4. sglang/srt/function_call_parser.py +96 -69
  5. sglang/srt/layers/activation.py +10 -5
  6. sglang/srt/layers/attention/double_sparsity_backend.py +1 -3
  7. sglang/srt/layers/attention/flashinfer_backend.py +284 -39
  8. sglang/srt/layers/attention/triton_backend.py +124 -12
  9. sglang/srt/layers/attention/triton_ops/decode_attention.py +53 -59
  10. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +337 -3
  11. sglang/srt/layers/attention/triton_ops/extend_attention.py +70 -42
  12. sglang/srt/layers/layernorm.py +1 -5
  13. sglang/srt/layers/moe/ep_moe/layer.py +1 -3
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +200 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +200 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +200 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +178 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +200 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +175 -0
  22. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -13
  23. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -3
  24. sglang/srt/layers/moe/topk.py +4 -0
  25. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  26. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  28. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  31. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  32. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  33. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  34. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  36. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  38. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  39. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  40. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  42. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  44. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  46. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/fp8_kernel.py +173 -2
  48. sglang/srt/layers/rotary_embedding.py +1 -3
  49. sglang/srt/layers/sampler.py +4 -4
  50. sglang/srt/lora/backend/__init__.py +8 -0
  51. sglang/srt/lora/backend/base_backend.py +95 -0
  52. sglang/srt/lora/backend/flashinfer_backend.py +91 -0
  53. sglang/srt/lora/backend/triton_backend.py +61 -0
  54. sglang/srt/lora/lora.py +127 -112
  55. sglang/srt/lora/lora_manager.py +50 -18
  56. sglang/srt/lora/triton_ops/__init__.py +5 -0
  57. sglang/srt/lora/triton_ops/qkv_lora_b.py +182 -0
  58. sglang/srt/lora/triton_ops/sgemm_lora_a.py +143 -0
  59. sglang/srt/lora/triton_ops/sgemm_lora_b.py +159 -0
  60. sglang/srt/model_executor/cuda_graph_runner.py +77 -80
  61. sglang/srt/model_executor/forward_batch_info.py +58 -59
  62. sglang/srt/model_executor/model_runner.py +2 -2
  63. sglang/srt/models/llama.py +8 -3
  64. sglang/srt/models/qwen2_vl.py +1 -1
  65. sglang/srt/server_args.py +13 -2
  66. sglang/srt/speculative/build_eagle_tree.py +486 -104
  67. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +213 -0
  68. sglang/srt/speculative/eagle_utils.py +420 -401
  69. sglang/srt/speculative/eagle_worker.py +177 -45
  70. sglang/srt/utils.py +7 -0
  71. sglang/test/runners.py +2 -0
  72. sglang/version.py +1 -1
  73. {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post3.dist-info}/METADATA +15 -6
  74. {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post3.dist-info}/RECORD +77 -38
  75. sglang/srt/layers/custom_op_util.py +0 -25
  76. {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post3.dist-info}/LICENSE +0 -0
  77. {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post3.dist-info}/WHEEL +0 -0
  78. {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,213 @@
1
+ from __future__ import annotations
2
+
3
+ import bisect
4
+ import time
5
+ from typing import TYPE_CHECKING, Callable
6
+
7
+ import torch
8
+
9
+ from sglang.srt.model_executor.cuda_graph_runner import (
10
+ CudaGraphRunner,
11
+ get_batch_sizes_to_capture,
12
+ get_global_graph_memory_pool,
13
+ set_global_graph_memory_pool,
14
+ set_torch_compile_config,
15
+ )
16
+ from sglang.srt.model_executor.forward_batch_info import (
17
+ CaptureHiddenMode,
18
+ ForwardBatch,
19
+ ForwardMode,
20
+ )
21
+ from sglang.srt.speculative.eagle_utils import EagleDraftInput
22
+
23
+ if TYPE_CHECKING:
24
+ from sglang.srt.model_executor.model_runner import ModelRunner
25
+ from sglang.srt.speculative.eagle_worker import EAGLEWorker
26
+
27
+
28
class EAGLEDraftCudaGraphRunner:
    """Capture and replay CUDA graphs around ``EAGLEWorker.draft_forward``.

    The constructor allocates fixed-size input buffers sized for the largest
    batch size to capture and then triggers graph capture. ``replay`` copies a
    live batch into those buffers and replays the graph stored under the
    matching batch size.

    NOTE(review): ``self.graphs``, ``self.output_buffers`` and ``self.stream``
    are read here but never assigned non-empty values in this class; they are
    presumably populated by ``CudaGraphRunner.capture`` -- confirm against
    ``cuda_graph_runner.py``.
    """

    def __init__(self, eagle_worker: EAGLEWorker):
        """Allocate the static graph inputs and capture the graphs.

        Args:
            eagle_worker: Worker whose ``model_runner`` and ``draft_forward``
                are used for capture.

        Raises:
            AssertionError: If ``disable_cuda_graph_padding`` is not set.
            Exception: If capture raises ``RuntimeError``; the original error
                is attached as ``__cause__``.
        """
        # Parse args
        self.eagle_worker = eagle_worker
        self.model_runner = model_runner = eagle_worker.model_runner
        server_args = model_runner.server_args
        self.graphs = {}
        self.output_buffers = {}
        self.enable_torch_compile = server_args.enable_torch_compile
        self.disable_padding = server_args.disable_cuda_graph_padding
        self.tp_size = model_runner.tp_size
        self.dp_size = server_args.dp_size
        self.topk = server_args.speculative_eagle_topk
        self.speculative_num_steps = server_args.speculative_num_steps

        # Only exact-batch-size replay is supported by this runner.
        assert self.disable_padding

        # Batch sizes to capture
        self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
        self.num_tokens_per_bs = server_args.speculative_eagle_topk

        # Attention backend
        self.max_bs = max(self.capture_bs)
        self.max_num_token = self.max_bs * self.num_tokens_per_bs
        draft_attn_backend = self.model_runner.draft_attn_backend
        draft_attn_backend.init_cuda_graph_state(self.max_num_token)
        self.seq_len_fill_value = draft_attn_backend.attn_backends[
            0
        ].get_cuda_graph_seq_len_fill_value()

        if self.enable_torch_compile:
            set_torch_compile_config()

        # Graph inputs: allocated once at maximum size; per-batch-size views
        # are sliced from them in `capture_one_batch_size`.
        with torch.device("cuda"):
            self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
            self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
            self.seq_lens = torch.full(
                (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
            )
            self.out_cache_loc = torch.zeros(
                (self.max_num_token * self.speculative_num_steps,), dtype=torch.int64
            )
            self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
            self.topk_p = torch.zeros((self.max_bs, self.topk), dtype=torch.float32)
            self.topk_index = torch.zeros((self.max_bs, self.topk), dtype=torch.int64)
            self.hidden_states = torch.zeros(
                (self.max_bs, self.model_runner.model_config.hidden_size),
                dtype=self.model_runner.dtype,
            )

        # Capture
        try:
            self.capture()
        except RuntimeError as e:
            # Chain explicitly so the underlying CUDA error is reported as the
            # direct cause of the user-facing hint below.
            raise Exception(
                f"Capture cuda graph failed: {e}\n"
                "Possible solutions:\n"
                "1. disable cuda graph by --disable-cuda-graph\n"
                "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                "3. disable torch compile by not using --enable-torch-compile\n"
                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
            ) from e

    def can_run(self, forward_batch: ForwardBatch):
        """Return whether a graph can serve ``forward_batch.batch_size``.

        With padding disabled the batch size must be a key of ``self.graphs``;
        otherwise any batch size up to ``self.max_bs`` is accepted.
        """
        if self.disable_padding:
            return forward_batch.batch_size in self.graphs
        return forward_batch.batch_size <= self.max_bs

    def capture(self):
        """Run ``CudaGraphRunner.capture`` with this object as ``self``."""
        CudaGraphRunner.capture(self)

    def capture_one_batch_size(self, num_seqs: int, forward: Callable):
        """Capture one CUDA graph of ``draft_forward`` for ``num_seqs`` sequences.

        Args:
            num_seqs: Batch size to capture.
            forward: Unused in this method; presumably kept so the signature
                matches what ``CudaGraphRunner.capture`` calls -- TODO confirm.

        Returns:
            A ``(graph, out)`` tuple: the captured ``torch.cuda.CUDAGraph`` and
            the value returned by ``draft_forward`` during capture.
        """
        graph = torch.cuda.CUDAGraph()
        # NOTE(review): `self.stream` is not assigned in this class; it is
        # presumably set by `CudaGraphRunner.capture` before this is called.
        stream = self.stream
        num_tokens = num_seqs * self.num_tokens_per_bs

        # Graph inputs: slices (views) of the static buffers.
        req_pool_indices = self.req_pool_indices[:num_seqs]
        seq_lens = self.seq_lens[:num_seqs]
        out_cache_loc = self.out_cache_loc[: num_tokens * self.speculative_num_steps]
        positions = self.positions[:num_tokens]
        topk_p = self.topk_p[:num_seqs]
        topk_index = self.topk_index[:num_seqs]
        hidden_states = self.hidden_states[:num_seqs]

        spec_info = EagleDraftInput(
            topk_p=topk_p,
            topk_index=topk_index,
            hidden_states=hidden_states,
        )

        # Forward batch
        forward_batch = ForwardBatch(
            forward_mode=ForwardMode.DECODE,
            batch_size=num_seqs,
            input_ids=None,
            req_pool_indices=req_pool_indices,
            seq_lens=seq_lens,
            req_to_token_pool=self.model_runner.req_to_token_pool,
            token_to_kv_pool=self.model_runner.token_to_kv_pool,
            out_cache_loc=out_cache_loc,
            seq_lens_sum=seq_lens.sum(),
            return_logprob=False,
            positions=positions,
            spec_algorithm=self.model_runner.spec_algorithm,
            spec_info=spec_info,
            capture_hidden_mode=(
                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
            ),
        )

        # Attention backend
        self.model_runner.draft_attn_backend.init_forward_metadata_capture_cuda_graph(
            forward_batch
        )

        # Run and capture
        def run_once():
            # Backup two fields, which will be modified in-place in `draft_forward`.
            output_cache_loc_backup = forward_batch.out_cache_loc
            hidden_states_backup = forward_batch.spec_info.hidden_states

            ret = self.eagle_worker.draft_forward(forward_batch)

            forward_batch.out_cache_loc = output_cache_loc_backup
            forward_batch.spec_info.hidden_states = hidden_states_backup
            return ret

        # Two eager runs before capture, each fenced by a device sync and a
        # tensor-parallel group barrier.
        for _ in range(2):
            torch.cuda.synchronize()
            self.model_runner.tp_group.barrier()

            run_once()

            torch.cuda.synchronize()
            self.model_runner.tp_group.barrier()

        torch.cuda.synchronize()
        self.model_runner.tp_group.barrier()

        with torch.cuda.graph(
            graph, pool=get_global_graph_memory_pool(), stream=stream
        ):
            out = run_once()

        torch.cuda.synchronize()
        self.model_runner.tp_group.barrier()

        # Publish this graph's pool as the global pool for later captures.
        set_global_graph_memory_pool(graph.pool())
        return graph, out

    def replay(self, forward_batch: ForwardBatch):
        """Copy ``forward_batch`` into the static buffers and replay its graph.

        Args:
            forward_batch: Live batch; ``out_cache_loc`` must not be ``None``.

        Returns:
            ``self.output_buffers[bs]`` for the selected captured batch size.
        """
        assert forward_batch.out_cache_loc is not None
        raw_bs = forward_batch.batch_size
        raw_num_token = raw_bs * self.num_tokens_per_bs

        # Pad: pick the smallest captured batch size >= raw_bs and reset the
        # buffers whose tail would otherwise hold stale values.
        index = bisect.bisect_left(self.capture_bs, raw_bs)
        bs = self.capture_bs[index]
        if bs != raw_bs:
            self.seq_lens.fill_(1)
            self.out_cache_loc.zero_()

        # Common inputs
        self.req_pool_indices[:raw_bs].copy_(forward_batch.req_pool_indices)
        self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
        self.out_cache_loc[: raw_num_token * self.speculative_num_steps].copy_(
            forward_batch.out_cache_loc
        )
        self.positions[:raw_num_token].copy_(forward_batch.positions)
        self.topk_p[:raw_bs].copy_(forward_batch.spec_info.topk_p)
        self.topk_index[:raw_bs].copy_(forward_batch.spec_info.topk_index)
        self.hidden_states[:raw_bs].copy_(forward_batch.spec_info.hidden_states)

        # Attention backend (receives the original, un-padded forward_batch)
        self.model_runner.draft_attn_backend.init_forward_metadata_replay_cuda_graph(
            forward_batch
        )

        # Replay
        self.graphs[bs].replay()

        return self.output_buffers[bs]