sglang 0.4.2.post1__py3-none-any.whl → 0.4.2.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/constrained/outlines_backend.py +9 -1
- sglang/srt/custom_op.py +40 -0
- sglang/srt/entrypoints/engine.py +2 -2
- sglang/srt/function_call_parser.py +96 -69
- sglang/srt/layers/activation.py +10 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -3
- sglang/srt/layers/attention/flashinfer_backend.py +284 -39
- sglang/srt/layers/attention/triton_backend.py +124 -12
- sglang/srt/layers/attention/triton_ops/decode_attention.py +53 -59
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +337 -3
- sglang/srt/layers/attention/triton_ops/extend_attention.py +70 -42
- sglang/srt/layers/layernorm.py +1 -5
- sglang/srt/layers/moe/ep_moe/layer.py +1 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +178 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +175 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -3
- sglang/srt/layers/moe/topk.py +4 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8_kernel.py +173 -2
- sglang/srt/layers/rotary_embedding.py +1 -3
- sglang/srt/layers/sampler.py +4 -4
- sglang/srt/lora/backend/__init__.py +8 -0
- sglang/srt/lora/backend/base_backend.py +95 -0
- sglang/srt/lora/backend/flashinfer_backend.py +91 -0
- sglang/srt/lora/backend/triton_backend.py +61 -0
- sglang/srt/lora/lora.py +127 -112
- sglang/srt/lora/lora_manager.py +50 -18
- sglang/srt/lora/triton_ops/__init__.py +5 -0
- sglang/srt/lora/triton_ops/qkv_lora_b.py +182 -0
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +143 -0
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +159 -0
- sglang/srt/model_executor/cuda_graph_runner.py +77 -80
- sglang/srt/model_executor/forward_batch_info.py +58 -59
- sglang/srt/model_executor/model_runner.py +2 -2
- sglang/srt/models/llama.py +8 -3
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/server_args.py +13 -2
- sglang/srt/speculative/build_eagle_tree.py +486 -104
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +213 -0
- sglang/srt/speculative/eagle_utils.py +420 -401
- sglang/srt/speculative/eagle_worker.py +177 -45
- sglang/srt/utils.py +7 -0
- sglang/test/runners.py +2 -0
- sglang/version.py +1 -1
- {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post3.dist-info}/METADATA +15 -6
- {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post3.dist-info}/RECORD +77 -38
- sglang/srt/layers/custom_op_util.py +0 -25
- {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post3.dist-info}/WHEEL +0 -0
- {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post3.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -1,3 +1,5 @@
+import logging
+import time
 from typing import List, Optional, Union
 
 import torch
@@ -12,8 +14,18 @@ from sglang.srt.model_executor.forward_batch_info import (
 )
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.speculative.
-
+from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
+    EAGLEDraftCudaGraphRunner,
+)
+from sglang.srt.speculative.eagle_utils import (
+    EagleDraftInput,
+    EagleVerifyInput,
+    assign_draft_cache_locs,
+    fast_topk,
+    select_top_k_tokens,
+)
+
+logger = logging.getLogger(__name__)
 
 
 class EAGLEWorker(TpModelWorker):
@@ -40,41 +52,47 @@ class EAGLEWorker(TpModelWorker):
             is_draft_worker=True,
         )
         self.target_worker = target_worker
-        self.server_args = server_args
         self.finish_extend_len = []
 
+        # Parse arguments
+        self.topk = server_args.speculative_eagle_topk
+        self.speculative_num_steps = server_args.speculative_num_steps
+        self.server_args = server_args
+
         # Share the embedding and lm_head
         embed, head = self.target_worker.model_runner.model.get_embed_and_head()
         self.model_runner.model.set_embed_and_head(embed, head)
         self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
-        self.model_runner.init_cuda_graphs()
 
-
-
-
-
-        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
-        logits_output = self.model_runner.forward(forward_batch)
-        self.capture_for_decode(logits_output, forward_batch)
+        # Create multi-step attn backends and cuda graph runners
+        from sglang.srt.layers.attention.flashinfer_backend import (
+            FlashInferMultiStepDraftBackend,
+        )
 
-
-
-
-
-
-
-
-
-
+        self.draft_attn_backend = FlashInferMultiStepDraftBackend(
+            self.model_runner,
+            self.topk,
+            self.speculative_num_steps,
+        )
+        self.model_runner.draft_attn_backend = self.draft_attn_backend
+        self.init_cuda_graphs()
+
+    def init_cuda_graphs(self):
+        """Capture cuda graphs."""
+        self.cuda_graph_runner = None
+
+        if self.server_args.disable_cuda_graph:
+            return
+
+        tic = time.time()
+        logger.info("Capture cuda graph begin. This can take up to several minutes.")
+        self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
+        logger.info(f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f} s")
 
     def forward_batch_speculative_generation(self, batch: ScheduleBatch):
         if batch.forward_mode.is_decode():
             # Draft
-            self.
-            for i in range(self.server_args.speculative_num_steps):
-                self.forward_draft_decode(batch)
-            batch.spec_info.clear_draft_cache(batch)
-            self._set_mem_pool(batch, self.target_worker.model_runner)
+            spec_info: EagleVerifyInput = self.draft(batch)
 
             # Verify
             (
@@ -84,8 +102,7 @@ class EAGLEWorker(TpModelWorker):
                 self.finish_extend_len,
                 accept_length_cpu,
                 model_worker_batch,
-            ) = self.verify(batch)
-            next_draft_input.load_server_args(self.server_args)
+            ) = self.verify(batch, spec_info)
             batch.spec_info = next_draft_input
             # if it is None, means all requsets are finished
             if batch.spec_info.verified_id is not None:
@@ -107,39 +124,156 @@ class EAGLEWorker(TpModelWorker):
             )
 
             # Forward with the draft model.
-            spec_info =
-
-
-
-            batch.spec_info = spec_info
+            batch.spec_info = EagleDraftInput(
+                hidden_states=logits_output.hidden_states,
+                verified_id=next_token_ids,
+            )
             self.forward_draft_extend(batch)
             return logits_output, next_token_ids, model_worker_batch, 0
 
-    def
-
-
+    def draft(self, batch: ScheduleBatch):
+        self._set_mem_pool(batch, self.model_runner)
+
+        # Parse args
+        num_seqs = batch.batch_size()
+        spec_info = batch.spec_info
+
+        # Allocate cache locations
+        out_cache_loc = batch.alloc_token_slots(
+            num_seqs * self.topk * self.speculative_num_steps
+        )
+        assign_draft_cache_locs[(num_seqs,)](
+            batch.req_pool_indices,
+            batch.req_to_token_pool.req_to_token,
+            batch.seq_lens,
+            out_cache_loc,
+            batch.req_to_token_pool.req_to_token.shape[1],
+            self.topk,
+            self.speculative_num_steps,
+        )
+
+        batch.out_cache_loc = out_cache_loc
+        batch.seq_lens_sum = torch.sum(batch.seq_lens).item()
+        spec_info.positions = batch.seq_lens.repeat_interleave(self.topk, dim=0)
+
+        # Get forward batch
+        spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        can_cuda_graph = self.cuda_graph_runner and self.cuda_graph_runner.can_run(
+            forward_batch
+        )
+
+        if can_cuda_graph:
+            score_list, token_list, parents_list = self.cuda_graph_runner.replay(
+                forward_batch
+            )
+        else:
+            # Initialize attention backend
+            self.draft_attn_backend.init_forward_metadata(forward_batch)
+
+            # Run forward steps
+            score_list, token_list, parents_list = self.draft_forward(forward_batch)
+
+        ret = EagleVerifyInput.create(
+            spec_info.verified_id,
+            score_list,
+            token_list,
+            parents_list,
+            batch.seq_lens,
+            batch.seq_lens_sum,
+            self.topk,
+            self.speculative_num_steps,
+            self.server_args.speculative_num_draft_tokens,
+        )
+
+        # Free cache locations
+        batch.token_to_kv_pool.free(out_cache_loc)
+        self._set_mem_pool(batch, self.target_worker.model_runner)
+        return ret
+
+    def draft_forward(self, forward_batch: ForwardBatch):
+        # Parse args
+        spec_info = forward_batch.spec_info
+        out_cache_loc = forward_batch.out_cache_loc
+        topk_p, topk_index, hidden_states = (
+            spec_info.topk_p,
+            spec_info.topk_index,
+            spec_info.hidden_states,
+        )
+
+        # Return values
+        score_list: List[torch.Tensor] = []
+        token_list: List[torch.Tensor] = []
+        parents_list: List[torch.Tensor] = []
+
+        # Forward multiple steps
+        scores = None
+        for i in range(self.speculative_num_steps):
+            input_ids, hidden_states, scores, tree_info = select_top_k_tokens(
+                i, topk_p, topk_index, hidden_states, scores, self.topk
+            )
+            score_list.append(tree_info[0])
+            token_list.append(tree_info[1])
+            parents_list.append(tree_info[2])
+
+            # Set inputs
+            forward_batch.input_ids = input_ids
+            forward_batch.out_cache_loc = out_cache_loc[
+                forward_batch.batch_size
+                * self.topk
+                * i : forward_batch.batch_size
+                * self.topk
+                * (i + 1)
+            ]
+            forward_batch.positions.add_(1)
+            forward_batch.attn_backend = self.draft_attn_backend.attn_backends[i]
+            spec_info.hidden_states = hidden_states
+
+            # Run forward
+            logits_output = self.model_runner.model.forward(
+                forward_batch.input_ids, forward_batch.positions, forward_batch
+            )
+            probs = torch.softmax(logits_output.next_token_logits, dim=-1)
+            topk_p, topk_index = fast_topk(probs, self.topk, dim=-1)
+            hidden_states = logits_output.hidden_states
+
+        return score_list, token_list, parents_list
+
+    def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput):
+        spec_info.prepare_for_verify(batch)
         batch.forward_mode = ForwardMode.TARGET_VERIFY
-        batch.spec_info =
-        batch.spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
+        batch.spec_info = spec_info
         model_worker_batch = batch.get_model_worker_batch()
         logits_output, _ = self.target_worker.forward_batch_generation(
             model_worker_batch, skip_sample=True
         )
-
-        res =
+        spec_info.hidden_states = logits_output.hidden_states
+        res = spec_info.verify(batch, logits_output)
         batch.forward_mode = ForwardMode.DECODE
         return res + (model_worker_batch,)
 
+    def forward_draft_extend(self, batch: ScheduleBatch):
+        self._set_mem_pool(batch, self.model_runner)
+        batch.spec_info.prepare_for_extend(batch)
+        batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        logits_output = self.model_runner.forward(forward_batch)
+        self.capture_for_decode(logits_output, forward_batch)
+        self._set_mem_pool(batch, self.target_worker.model_runner)
+
     def _set_mem_pool(self, batch: ScheduleBatch, runner: ModelRunner):
         batch.token_to_kv_pool = runner.token_to_kv_pool
         batch.req_to_token_pool = runner.req_to_token_pool
 
     def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
         seq_lens_backup = batch.seq_lens
+        req_pool_indices_backup = batch.req_pool_indices
 
         self._set_mem_pool(batch, self.model_runner)
         batch.forward_mode = ForwardMode.DRAFT_EXTEND
-        batch.spec_info.prepare_extend_after_decode(batch)
+        batch.spec_info.prepare_extend_after_decode(batch, self.speculative_num_steps)
         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
         model_worker_batch = batch.get_model_worker_batch()
         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
@@ -151,17 +285,15 @@ class EAGLEWorker(TpModelWorker):
         # This is because `seq_lens` can be modified in `prepare_extend_after_decode`
         batch.forward_mode = ForwardMode.DECODE
         batch.seq_lens = seq_lens_backup
+        batch.req_pool_indices = req_pool_indices_backup
 
     def capture_for_decode(
         self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
     ):
-
-            logits_output.next_token_logits, dim=-1
-        )  # TODO(kavioyu): Support more sampling methods
+        probs = torch.softmax(logits_output.next_token_logits, dim=-1)
         spec_info = forward_batch.spec_info
-        spec_info.
+        spec_info.topk_p, spec_info.topk_index = fast_topk(probs, self.topk, dim=-1)
         spec_info.hidden_states = logits_output.hidden_states
-        spec_info.prev_mode = forward_batch.forward_mode
 
     # Don't support prefix share now.
     def finish_request(self, reqs: Union[Req, List[Req]]):
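
Note: the reworked decode path above is a plain draft-then-verify loop: `draft()` proposes tokens with the small model, and `verify()` scores the whole proposal in a single target-model pass. A minimal, self-contained sketch of that control flow (a toy greedy variant, not sglang's tree-based implementation; `draft_next` and `target_next` are hypothetical stand-ins for the two models):

import torch

def speculative_step(draft_next, target_next, prefix: torch.Tensor, num_steps: int) -> torch.Tensor:
    # `prefix` is a 1-D long tensor of token ids; the callables map a 1-D
    # token tensor to per-position next-token logits of shape [len, vocab].
    # Draft: greedily propose `num_steps` tokens with the cheap model.
    ctx = prefix.clone()
    for _ in range(num_steps):
        logits = draft_next(ctx)                      # [len(ctx), vocab]
        ctx = torch.cat([ctx, logits[-1].argmax().view(1)])
    proposal = ctx[len(prefix):]                      # [num_steps]

    # Verify: one target pass scores every proposed position at once.
    target_logits = target_next(ctx)                  # [len(ctx), vocab]
    target_choice = target_logits[len(prefix) - 1 : -1].argmax(dim=-1)

    # Accept the longest agreeing prefix; when the target disagrees,
    # append its corrected token instead.
    n_accept = int((proposal == target_choice).to(torch.int64).cumprod(dim=0).sum())
    return torch.cat([prefix, proposal[:n_accept], target_choice[n_accept : n_accept + 1]])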
sglang/srt/utils.py
CHANGED
@@ -1046,6 +1046,13 @@ def get_device_name(device_id: int = 0) -> str:
         return torch.hpu.get_device_name(device_id)
 
 
+def get_device_core_count(device_id: int = 0) -> int:
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        return torch.cuda.get_device_properties(device_id).multi_processor_count
+
+    return 0
+
+
 def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
     major, minor = None, None
     if hasattr(torch, "cuda") and torch.cuda.is_available():
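
The new `get_device_core_count` helper is self-contained, so a quick sanity check is easy (this snippet assumes a PyTorch install; on CPU-only builds the helper simply returns 0):

import torch
from sglang.srt.utils import get_device_core_count

# Prints the streaming-multiprocessor count of device 0 on CUDA machines, else 0.
print(get_device_core_count(0))
if torch.cuda.is_available():
    assert (
        get_device_core_count(0)
        == torch.cuda.get_device_properties(0).multi_processor_count
    )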
sglang/test/runners.py
CHANGED
@@ -272,6 +272,7 @@ class SRTRunner:
         port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
         lora_paths: List[str] = None,
         max_loras_per_batch: int = 4,
+        lora_backend: str = "triton",
         disable_cuda_graph: bool = False,
         disable_radix_cache: bool = False,
     ):
@@ -287,6 +288,7 @@ class SRTRunner:
             is_embedding=not self.is_generation,
             lora_paths=lora_paths,
             max_loras_per_batch=max_loras_per_batch,
+            lora_backend=lora_backend,
             disable_cuda_graph=disable_cuda_graph,
             disable_radix_cache=disable_radix_cache,
         )
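
The new `lora_backend` knob defaults to `"triton"`, matching the Triton kernels added under `sglang/srt/lora/triton_ops/`; the file list also adds a flashinfer backend. A hedged usage sketch (the model and adapter paths are placeholders, and the `model_path`/`torch_dtype`/`model_type` arguments are assumed from the existing `SRTRunner` signature, not shown in this diff):

import torch
from sglang.test.runners import SRTRunner

# Sketch only: swap in a real base model and LoRA adapter you have locally.
runner = SRTRunner(
    model_path="meta-llama/Llama-2-7b-hf",  # placeholder base model
    torch_dtype=torch.float16,              # assumed existing argument
    model_type="generation",                # assumed existing argument
    lora_paths=["/path/to/lora_adapter"],   # placeholder adapter path
    max_loras_per_batch=4,
    lora_backend="triton",                  # or "flashinfer"
)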
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.2.post1"
+__version__ = "0.4.2.post3"
{sglang-0.4.2.post1.dist-info → sglang-0.4.2.post3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.2.post1
+Version: 0.4.2.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
          Version 2.0, January 2004
@@ -225,7 +225,6 @@ Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
-Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
@@ -240,21 +239,27 @@ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.3; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.3.post2; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm==0.6.4.post1; extra == "srt"
-Requires-Dist:
+Requires-Dist: flashinfer_python>=0.2.0.post2; extra == "srt"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
-Requires-Dist: vllm==0.6.
+Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
+Requires-Dist: outlines==0.1.11; extra == "srt-hip"
+Requires-Dist: sgl-kernel>=0.0.3.post1; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-hpu"
 Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
 Requires-Dist: torch; extra == "srt-cpu"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-cpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -372,7 +377,11 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS
+The project is supported by (alphabetically): AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS CORP, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+
+## Contact Us
+
+For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
 
 ## Acknowledgment and Citation
 We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
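
To see which of the dependency pins above an installed wheel actually carries, the packaging metadata can be read back with the standard library (assuming `sglang` is installed in the current environment):

from importlib.metadata import requires, version

print(version("sglang"))  # e.g. "0.4.2.post3"
# Requirement strings keep their extra markers, e.g.
# 'sgl-kernel>=0.0.3.post2; extra == "srt"'.
for req in requires("sglang") or []:
    if 'extra == "srt"' in req:
        print(req)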
|