sglang 0.4.2.post3__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/check_env.py +1 -0
- sglang/global_config.py +2 -0
- sglang/srt/constrained/outlines_backend.py +4 -1
- sglang/srt/entrypoints/engine.py +2 -2
- sglang/srt/layers/attention/flashinfer_backend.py +265 -147
- sglang/srt/layers/attention/triton_backend.py +358 -72
- sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- sglang/srt/layers/linear.py +12 -5
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +178 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +175 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -5
- sglang/srt/layers/moe/fused_moe_triton/layer.py +2 -0
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +51 -5
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +29 -29
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +33 -33
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +27 -27
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +24 -24
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +42 -42
- sglang/srt/layers/quantization/fp8_kernel.py +123 -17
- sglang/srt/layers/quantization/fp8_utils.py +33 -4
- sglang/srt/lora/backend/__init__.py +25 -5
- sglang/srt/lora/backend/base_backend.py +31 -9
- sglang/srt/lora/backend/flashinfer_backend.py +41 -4
- sglang/srt/lora/backend/triton_backend.py +34 -4
- sglang/srt/lora/layers.py +293 -0
- sglang/srt/lora/lora.py +101 -326
- sglang/srt/lora/lora_manager.py +101 -269
- sglang/srt/lora/mem_pool.py +174 -0
- sglang/srt/lora/triton_ops/__init__.py +7 -1
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +170 -0
- sglang/srt/lora/triton_ops/qkv_lora_b.py +5 -5
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +2 -2
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +2 -2
- sglang/srt/lora/utils.py +141 -0
- sglang/srt/managers/detokenizer_manager.py +1 -0
- sglang/srt/managers/io_struct.py +4 -0
- sglang/srt/managers/schedule_batch.py +16 -3
- sglang/srt/managers/scheduler.py +29 -0
- sglang/srt/managers/tokenizer_manager.py +6 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +4 -0
- sglang/srt/model_executor/cuda_graph_runner.py +16 -1
- sglang/srt/model_executor/model_runner.py +12 -2
- sglang/srt/models/deepseek_v2.py +17 -7
- sglang/srt/server_args.py +20 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
- sglang/srt/speculative/eagle_utils.py +64 -21
- sglang/srt/speculative/eagle_worker.py +29 -8
- sglang/srt/utils.py +7 -0
- sglang/version.py +1 -1
- {sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/METADATA +6 -5
- {sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/RECORD +88 -55
- {sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/LICENSE +0 -0
- {sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/WHEEL +0 -0
- {sglang-0.4.2.post3.dist-info → sglang-0.4.3.dist-info}/top_level.txt +0 -0
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -255,6 +255,8 @@ class DeepseekV2Attention(nn.Module):
|
|
255
255
|
self.kv_lora_rank + self.qk_rope_head_dim,
|
256
256
|
bias=False,
|
257
257
|
quant_config=quant_config,
|
258
|
+
# FIXME: quick fix for skip quantization
|
259
|
+
prefix=f"self_attn.kv_a_proj_with_mqa",
|
258
260
|
)
|
259
261
|
self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
|
260
262
|
self.kv_b_proj = ColumnParallelLinear(
|
@@ -455,6 +457,8 @@ class DeepseekV2AttentionMLA(nn.Module):
|
|
455
457
|
self.kv_lora_rank + self.qk_rope_head_dim,
|
456
458
|
bias=False,
|
457
459
|
quant_config=quant_config,
|
460
|
+
# FIXME: quick fix for skip quantization
|
461
|
+
prefix=f"self_attn.kv_a_proj_with_mqa",
|
458
462
|
)
|
459
463
|
self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
|
460
464
|
|
@@ -506,14 +510,20 @@ class DeepseekV2AttentionMLA(nn.Module):
|
|
506
510
|
hidden_states: torch.Tensor,
|
507
511
|
forward_batch: ForwardBatch,
|
508
512
|
) -> torch.Tensor:
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
return self.forward_normal(positions, hidden_states, forward_batch)
|
513
|
+
if global_server_args_dict["enable_flashinfer_mla"]:
|
514
|
+
if forward_batch.forward_mode.is_extend():
|
515
|
+
return self.forward_normal(positions, hidden_states, forward_batch)
|
516
|
+
else:
|
517
|
+
return self.forward_absorb(positions, hidden_states, forward_batch)
|
515
518
|
else:
|
516
|
-
|
519
|
+
# Triton: Use normal computation for prefill and use weight absorption for extend/decode
|
520
|
+
if (
|
521
|
+
forward_batch.forward_mode.is_extend()
|
522
|
+
and forward_batch.extend_prefix_lens.sum() == 0
|
523
|
+
):
|
524
|
+
return self.forward_normal(positions, hidden_states, forward_batch)
|
525
|
+
else:
|
526
|
+
return self.forward_absorb(positions, hidden_states, forward_batch)
|
517
527
|
|
518
528
|
def forward_normal(
|
519
529
|
self,
|
sglang/srt/server_args.py
CHANGED
@@ -140,6 +140,7 @@ class ServerArgs:
|
|
140
140
|
disable_jump_forward: bool = False
|
141
141
|
disable_cuda_graph: bool = False
|
142
142
|
disable_cuda_graph_padding: bool = False
|
143
|
+
enable_nccl_nvls: bool = False
|
143
144
|
disable_outlines_disk_cache: bool = False
|
144
145
|
disable_custom_all_reduce: bool = False
|
145
146
|
disable_mla: bool = False
|
@@ -160,12 +161,15 @@ class ServerArgs:
|
|
160
161
|
delete_ckpt_after_loading: bool = False
|
161
162
|
enable_memory_saver: bool = False
|
162
163
|
allow_auto_truncate: bool = False
|
164
|
+
return_hidden_states: bool = False
|
163
165
|
|
164
166
|
# Custom logit processor
|
165
167
|
enable_custom_logit_processor: bool = False
|
166
168
|
tool_call_parser: str = None
|
167
169
|
enable_hierarchical_cache: bool = False
|
168
170
|
|
171
|
+
enable_flashinfer_mla: bool = False
|
172
|
+
|
169
173
|
def __post_init__(self):
|
170
174
|
# Set missing default values
|
171
175
|
if self.tokenizer_path is None:
|
@@ -691,6 +695,11 @@ class ServerArgs:
|
|
691
695
|
default=ServerArgs.grammar_backend,
|
692
696
|
help="Choose the backend for grammar-guided decoding.",
|
693
697
|
)
|
698
|
+
parser.add_argument(
|
699
|
+
"--enable-flashinfer-mla",
|
700
|
+
action="store_true",
|
701
|
+
help="Enable FlashInfer MLA optimization",
|
702
|
+
)
|
694
703
|
|
695
704
|
# Speculative decoding
|
696
705
|
parser.add_argument(
|
@@ -782,6 +791,11 @@ class ServerArgs:
|
|
782
791
|
action="store_true",
|
783
792
|
help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
|
784
793
|
)
|
794
|
+
parser.add_argument(
|
795
|
+
"--enable-nccl-nvls",
|
796
|
+
action="store_true",
|
797
|
+
help="Enable NCCL NVLS for prefill heavy requests when available.",
|
798
|
+
)
|
785
799
|
parser.add_argument(
|
786
800
|
"--disable-outlines-disk-cache",
|
787
801
|
action="store_true",
|
@@ -795,7 +809,7 @@ class ServerArgs:
|
|
795
809
|
parser.add_argument(
|
796
810
|
"--disable-mla",
|
797
811
|
action="store_true",
|
798
|
-
help="Disable Multi-head Latent Attention (MLA) for DeepSeek
|
812
|
+
help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
|
799
813
|
)
|
800
814
|
parser.add_argument(
|
801
815
|
"--disable-overlap-schedule",
|
@@ -896,6 +910,11 @@ class ServerArgs:
|
|
896
910
|
action="store_true",
|
897
911
|
help="Enable users to pass custom logit processors to the server (disabled by default for security)",
|
898
912
|
)
|
913
|
+
parser.add_argument(
|
914
|
+
"--return-hidden-states",
|
915
|
+
action="store_true",
|
916
|
+
help="Return hidden states in the response.",
|
917
|
+
)
|
899
918
|
# Function Calling
|
900
919
|
parser.add_argument(
|
901
920
|
"--tool-call-parser",
|
@@ -85,6 +85,7 @@ class EAGLEDraftCudaGraphRunner:
|
|
85
85
|
"1. disable cuda graph by --disable-cuda-graph\n"
|
86
86
|
"2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
|
87
87
|
"3. disable torch compile by not using --enable-torch-compile\n"
|
88
|
+
"4. specify --dtype to the same dtype (e.g. bfloat16)\n"
|
88
89
|
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
|
89
90
|
)
|
90
91
|
|
@@ -4,6 +4,7 @@ import dataclasses
|
|
4
4
|
from typing import TYPE_CHECKING, List
|
5
5
|
|
6
6
|
import torch
|
7
|
+
import torch.nn.functional as F
|
7
8
|
import triton
|
8
9
|
import triton.language as tl
|
9
10
|
|
@@ -11,7 +12,14 @@ from sglang.srt.layers.attention.flashinfer_backend import (
|
|
11
12
|
create_flashinfer_kv_indices_triton,
|
12
13
|
)
|
13
14
|
from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
|
14
|
-
from sglang.srt.speculative.build_eagle_tree import build_tree_kernel
|
15
|
+
from sglang.srt.speculative.build_eagle_tree import (
|
16
|
+
build_tree_kernel,
|
17
|
+
build_tree_kernel_efficient,
|
18
|
+
)
|
19
|
+
from sglang.srt.utils import is_cuda_available
|
20
|
+
|
21
|
+
if is_cuda_available():
|
22
|
+
from sgl_kernel import tree_speculative_sampling_target_only
|
15
23
|
|
16
24
|
if TYPE_CHECKING:
|
17
25
|
from sglang.srt.managers.schedule_batch import ScheduleBatch
|
@@ -160,8 +168,11 @@ class EagleVerifyInput:
|
|
160
168
|
custom_mask: torch.Tensor
|
161
169
|
positions: torch.Tensor
|
162
170
|
retrive_index: torch.Tensor
|
171
|
+
retrive_next_token: torch.Tensor
|
172
|
+
retrive_next_sibling: torch.Tensor
|
163
173
|
retrive_cum_len: torch.Tensor
|
164
174
|
draft_token_num: int
|
175
|
+
spec_steps: int
|
165
176
|
capture_hidden_mode: CaptureHiddenMode
|
166
177
|
|
167
178
|
@classmethod
|
@@ -175,10 +186,45 @@ class EagleVerifyInput:
|
|
175
186
|
seq_lens_sum: int,
|
176
187
|
topk: int,
|
177
188
|
spec_steps: int,
|
178
|
-
|
189
|
+
num_verify_tokens: int,
|
190
|
+
is_all_greedy: bool,
|
179
191
|
):
|
180
|
-
|
181
|
-
|
192
|
+
if is_all_greedy:
|
193
|
+
tree_mask, position, retrive_index, retrive_cum_len, draft_tokens = (
|
194
|
+
build_tree_kernel(
|
195
|
+
verified_id,
|
196
|
+
score_list, # b, n, topk; n= 1 + (num_steps-1) * self.topk
|
197
|
+
token_list,
|
198
|
+
parents_list,
|
199
|
+
seq_lens,
|
200
|
+
seq_lens_sum,
|
201
|
+
topk,
|
202
|
+
spec_steps,
|
203
|
+
num_verify_tokens,
|
204
|
+
)
|
205
|
+
)
|
206
|
+
|
207
|
+
return cls(
|
208
|
+
draft_tokens,
|
209
|
+
tree_mask,
|
210
|
+
position,
|
211
|
+
retrive_index,
|
212
|
+
None,
|
213
|
+
None,
|
214
|
+
retrive_cum_len,
|
215
|
+
num_verify_tokens,
|
216
|
+
spec_steps,
|
217
|
+
CaptureHiddenMode.FULL,
|
218
|
+
)
|
219
|
+
else:
|
220
|
+
(
|
221
|
+
tree_mask,
|
222
|
+
position,
|
223
|
+
retrive_index,
|
224
|
+
retrive_next_token,
|
225
|
+
retrive_next_sibling,
|
226
|
+
draft_tokens,
|
227
|
+
) = build_tree_kernel_efficient(
|
182
228
|
verified_id,
|
183
229
|
score_list,
|
184
230
|
token_list,
|
@@ -187,18 +233,21 @@ class EagleVerifyInput:
|
|
187
233
|
seq_lens_sum,
|
188
234
|
topk,
|
189
235
|
spec_steps,
|
190
|
-
|
236
|
+
num_verify_tokens,
|
237
|
+
)
|
238
|
+
|
239
|
+
return cls(
|
240
|
+
draft_tokens,
|
241
|
+
tree_mask,
|
242
|
+
position,
|
243
|
+
retrive_index,
|
244
|
+
retrive_next_token,
|
245
|
+
retrive_next_sibling,
|
246
|
+
None,
|
247
|
+
num_verify_tokens,
|
248
|
+
spec_steps,
|
249
|
+
CaptureHiddenMode.FULL,
|
191
250
|
)
|
192
|
-
)
|
193
|
-
return cls(
|
194
|
-
draft_tokens,
|
195
|
-
tree_mask,
|
196
|
-
position,
|
197
|
-
retrive_index,
|
198
|
-
retrive_cum_len,
|
199
|
-
num_verify_token,
|
200
|
-
CaptureHiddenMode.FULL,
|
201
|
-
)
|
202
251
|
|
203
252
|
def prepare_for_verify(self, batch: ScheduleBatch):
|
204
253
|
batch.input_ids = self.draft_token
|
@@ -313,12 +362,6 @@ class EagleVerifyInput:
|
|
313
362
|
uniform_samples=coins,
|
314
363
|
target_probs=target_probs,
|
315
364
|
draft_probs=draft_probs,
|
316
|
-
threshold_single=global_server_args_dict[
|
317
|
-
"speculative_accept_threshold_single"
|
318
|
-
],
|
319
|
-
threshold_acc=global_server_args_dict[
|
320
|
-
"speculative_accept_threshold_acc"
|
321
|
-
],
|
322
365
|
deterministic=True,
|
323
366
|
)
|
324
367
|
|
@@ -65,15 +65,31 @@ class EAGLEWorker(TpModelWorker):
|
|
65
65
|
self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
|
66
66
|
|
67
67
|
# Create multi-step attn backends and cuda graph runners
|
68
|
-
|
69
|
-
|
70
|
-
|
68
|
+
if server_args.attention_backend == "flashinfer":
|
69
|
+
from sglang.srt.layers.attention.flashinfer_backend import (
|
70
|
+
FlashInferMultiStepDraftBackend,
|
71
|
+
)
|
72
|
+
|
73
|
+
self.draft_attn_backend = FlashInferMultiStepDraftBackend(
|
74
|
+
self.model_runner,
|
75
|
+
self.topk,
|
76
|
+
self.speculative_num_steps,
|
77
|
+
)
|
78
|
+
elif server_args.attention_backend == "triton":
|
79
|
+
from sglang.srt.layers.attention.triton_backend import (
|
80
|
+
TritonMultiStepDraftBackend,
|
81
|
+
)
|
82
|
+
|
83
|
+
self.draft_attn_backend = TritonMultiStepDraftBackend(
|
84
|
+
self.model_runner,
|
85
|
+
self.topk,
|
86
|
+
self.speculative_num_steps,
|
87
|
+
)
|
88
|
+
else:
|
89
|
+
raise ValueError(
|
90
|
+
f"EAGLE is not supportted in attention backend {server_args.attention_backend}"
|
91
|
+
)
|
71
92
|
|
72
|
-
self.draft_attn_backend = FlashInferMultiStepDraftBackend(
|
73
|
-
self.model_runner,
|
74
|
-
self.topk,
|
75
|
-
self.speculative_num_steps,
|
76
|
-
)
|
77
93
|
self.model_runner.draft_attn_backend = self.draft_attn_backend
|
78
94
|
self.init_cuda_graphs()
|
79
95
|
|
@@ -185,6 +201,7 @@ class EAGLEWorker(TpModelWorker):
|
|
185
201
|
self.topk,
|
186
202
|
self.speculative_num_steps,
|
187
203
|
self.server_args.speculative_num_draft_tokens,
|
204
|
+
batch.sampling_info.is_all_greedy,
|
188
205
|
)
|
189
206
|
|
190
207
|
# Free cache locations
|
@@ -217,6 +234,10 @@ class EAGLEWorker(TpModelWorker):
|
|
217
234
|
token_list.append(tree_info[1])
|
218
235
|
parents_list.append(tree_info[2])
|
219
236
|
|
237
|
+
# we don't need to run the last forward. we get 1 token from draft prefill and (#spec steps - 1) tokens here
|
238
|
+
if i == self.speculative_num_steps - 1:
|
239
|
+
break
|
240
|
+
|
220
241
|
# Set inputs
|
221
242
|
forward_batch.input_ids = input_ids
|
222
243
|
forward_batch.out_cache_loc = out_cache_loc[
|
sglang/srt/utils.py
CHANGED
@@ -1444,3 +1444,10 @@ def launch_dummy_health_check_server(host, port):
|
|
1444
1444
|
timeout_keep_alive=5,
|
1445
1445
|
loop="uvloop",
|
1446
1446
|
)
|
1447
|
+
|
1448
|
+
|
1449
|
+
def set_cuda_arch():
|
1450
|
+
if is_flashinfer_available():
|
1451
|
+
capability = torch.cuda.get_device_capability()
|
1452
|
+
arch = f"{capability[0]}.{capability[1]}"
|
1453
|
+
os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.4.2.post3"
|
1
|
+
__version__ = "0.4.3"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.4.2.post3
|
3
|
+
Version: 0.4.3
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -236,14 +236,15 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
|
|
236
236
|
Requires-Dist: uvicorn; extra == "runtime-common"
|
237
237
|
Requires-Dist: uvloop; extra == "runtime-common"
|
238
238
|
Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
|
239
|
+
Requires-Dist: ninja; extra == "runtime-common"
|
239
240
|
Provides-Extra: srt
|
240
241
|
Requires-Dist: sglang[runtime_common]; extra == "srt"
|
241
242
|
Requires-Dist: cuda-python; extra == "srt"
|
242
|
-
Requires-Dist: sgl-kernel>=0.0.3.
|
243
|
+
Requires-Dist: sgl-kernel>=0.0.3.post6; extra == "srt"
|
243
244
|
Requires-Dist: torch; extra == "srt"
|
244
|
-
Requires-Dist: vllm
|
245
|
-
Requires-Dist: flashinfer_python>=0.2.
|
246
|
-
Requires-Dist: outlines
|
245
|
+
Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
|
246
|
+
Requires-Dist: flashinfer_python>=0.2.1.post1; extra == "srt"
|
247
|
+
Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
|
247
248
|
Provides-Extra: srt-hip
|
248
249
|
Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
|
249
250
|
Requires-Dist: torch; extra == "srt-hip"
|