sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +6 -1
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +8 -7
- sglang/srt/disaggregation/decode.py +8 -4
- sglang/srt/disaggregation/mooncake/conn.py +43 -25
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/distributed/parallel_state.py +4 -2
- sglang/srt/entrypoints/context.py +3 -20
- sglang/srt/entrypoints/engine.py +13 -8
- sglang/srt/entrypoints/harmony_utils.py +2 -0
- sglang/srt/entrypoints/http_server.py +68 -5
- sglang/srt/entrypoints/openai/protocol.py +2 -9
- sglang/srt/entrypoints/openai/serving_chat.py +60 -265
- sglang/srt/entrypoints/openai/serving_completions.py +1 -0
- sglang/srt/entrypoints/openai/tool_server.py +4 -3
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/jinja_template_utils.py +6 -0
- sglang/srt/layers/attention/aiter_backend.py +370 -107
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +55 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +24 -27
- sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
- sglang/srt/layers/attention/vision.py +9 -1
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +11 -13
- sglang/srt/layers/dp_attention.py +118 -27
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/logits_processor.py +12 -18
- sglang/srt/layers/moe/cutlass_moe.py +11 -16
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +60 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +4 -1
- sglang/srt/layers/multimodal.py +156 -40
- sglang/srt/layers/quantization/__init__.py +10 -35
- sglang/srt/layers/quantization/awq.py +15 -16
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +22 -10
- sglang/srt/layers/quantization/gptq.py +12 -17
- sglang/srt/layers/quantization/marlin_utils.py +15 -5
- sglang/srt/layers/quantization/modelopt_quant.py +58 -41
- sglang/srt/layers/quantization/mxfp4.py +20 -3
- sglang/srt/layers/quantization/utils.py +52 -2
- sglang/srt/layers/quantization/w4afp8.py +20 -11
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +281 -2
- sglang/srt/layers/sampler.py +5 -2
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +66 -116
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +12 -48
- sglang/srt/lora/lora_registry.py +20 -9
- sglang/srt/lora/mem_pool.py +20 -63
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +24 -29
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -6
- sglang/srt/managers/mm_utils.py +1 -2
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +43 -49
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +18 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/tokenizer_manager.py +53 -44
- sglang/srt/mem_cache/allocator.py +39 -214
- sglang/srt/mem_cache/allocator_ascend.py +158 -0
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +34 -24
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +33 -35
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -23
- sglang/srt/model_executor/forward_batch_info.py +33 -14
- sglang/srt/model_executor/model_runner.py +179 -81
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/models/deepseek_nextn.py +2 -1
- sglang/srt/models/deepseek_v2.py +79 -38
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +8 -9
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +11 -11
- sglang/srt/models/glm4_moe_nextn.py +2 -1
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +142 -20
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +10 -27
- sglang/srt/models/llama4.py +19 -6
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +20 -5
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_classification.py +78 -0
- sglang/srt/models/qwen3_moe.py +18 -5
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +6 -2
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/operations.py +17 -2
- sglang/srt/reasoning_parser.py +316 -0
- sglang/srt/sampling/sampling_batch_info.py +7 -4
- sglang/srt/server_args.py +142 -140
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +16 -12
- sglang/srt/utils.py +3 -3
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_marlin_moe.py +1 -1
- sglang/test/test_marlin_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
- sglang/lang/backend/__init__.py +0 -0
- sglang/srt/function_call/harmony_tool_parser.py +0 -130
- sglang/srt/layers/quantization/scalar_type.py +0 -352
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0
sglang/test/test_fp4_moe.py
CHANGED
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
+from typing import Callable
+
 import pytest
 import torch
+from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
 from sgl_kernel import scaled_fp4_quant
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -111,15 +114,16 @@ def torch_moe(a, w1, w2, score, topk, expert_map):
     ).sum(dim=1)
 
 
-@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
-@pytest.mark.parametrize("e", [40, 64, 256])
-@pytest.mark.parametrize("topk", [1, 6, 8])
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
-@torch.inference_mode()
-def test_cutlass_fp4_moe_no_graph(
-    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+def check_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    moe_impl: Callable,
+    flip_w13: bool,
 ):
-
     torch.manual_seed(7)
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
@@ -167,38 +171,18 @@ def test_cutlass_fp4_moe_no_graph(
 
     a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
     a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
-
-    ab_strides_13 = torch.full(
-        (e,), w1_q.shape[2] * 2, dtype=torch.int64, device=w1_q.device
-    )
-    c_strides_13 = torch.full(
-        (e,), w1_q.shape[1], dtype=torch.int64, device=w1_q.device
-    )
-    ab_strides_2 = torch.full(
-        (e,), w2_q.shape[2] * 2, dtype=torch.int64, device=w2_q.device
-    )
-    c_strides_2 = torch.full((e,), w2_q.shape[1], dtype=torch.int64, device=w2_q.device)
-    params = CutlassMoEParams(
-        CutlassMoEType.BlockscaledFP4,
-        device=a.device,
-        num_experts=e,
-        intermediate_size_per_partition=n,  # n
-        hidden_size=k,
-    )  # k
-    cutlass_output = cutlass_moe_fp4(
+    test_output = moe_impl(
         a=a,
-        a1_gscale=a1_gs,
-        w1_fp4=w1_q,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        w1_q=w1_q,
+        w2_q=w2_q,
+        a1_gs=a1_gs,
         w1_blockscale=w1_blockscale,
         w1_alphas=(1 / w1_gs),
-        a2_gscale=a2_gs,
-        w2_fp4=w2_q,
+        a2_gs=a2_gs,
         w2_blockscale=w2_blockscale,
         w2_alphas=(1 / w2_gs),
-        topk_weights=topk_weights,
-        topk_ids=topk_ids,
-        params=params,
-        apply_router_weight_on_input=False,
     )
 
     # Reference check:
@@ -237,10 +221,108 @@ def test_cutlass_fp4_moe_no_graph(
         block_size=quant_blocksize,
     )
 
+    if flip_w13:
+        dim = -2
+        size = w1_d.size(dim)
+        assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
+        half = size // 2
+        # Reorder weight
+        w1, w3 = w1_d.split(half, dim=dim)
+        w1_d = torch.cat([w3, w1], dim=dim).contiguous()
+
     torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk, None)
 
-    torch.testing.assert_close(torch_output, cutlass_output, atol=1e-1, rtol=1e-1)
+    torch.testing.assert_close(torch_output, test_output, atol=1e-1, rtol=1e-1)
+
+
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [40, 64, 256])
+@pytest.mark.parametrize("topk", [1, 6, 8])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@torch.inference_mode()
+def test_cutlass_fp4_moe_no_graph(
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+):
+    def cutlass_moe_impl(
+        a,
+        topk_weights,
+        topk_ids,
+        w1_q,
+        w2_q,
+        a1_gs,
+        w1_blockscale,
+        w1_alphas,
+        a2_gs,
+        w2_blockscale,
+        w2_alphas,
+    ):
+        params = CutlassMoEParams(
+            CutlassMoEType.BlockscaledFP4,
+            device=a.device,
+            num_experts=e,
+            intermediate_size_per_partition=n,  # n
+            hidden_size=k,
+        )  # k
+        return cutlass_moe_fp4(
+            a=a,
+            a1_gscale=a1_gs,
+            w1_fp4=w1_q,
+            w1_blockscale=w1_blockscale,
+            w1_alphas=w1_alphas,
+            a2_gscale=a2_gs,
+            w2_fp4=w2_q,
+            w2_blockscale=w2_blockscale,
+            w2_alphas=w2_alphas,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            params=params,
+            apply_router_weight_on_input=False,
+        )
+
+    check_moe(m, n, k, e, topk, dtype, cutlass_moe_impl, flip_w13=False)
+
+
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [40, 64, 256])
+@pytest.mark.parametrize("topk", [1, 6, 8])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@torch.inference_mode()
+def test_flashinfer_fp4_moe_no_graph(
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+):
+    def flashinfer_moe_impl(
+        a,
+        topk_weights,
+        topk_ids,
+        w1_q,
+        w2_q,
+        a1_gs,
+        w1_blockscale,
+        w1_alphas,
+        a2_gs,
+        w2_blockscale,
+        w2_alphas,
+    ):
+        return flashinfer_cutlass_fused_moe(
+            a,
+            topk_ids.to(torch.int),
+            topk_weights,
+            w1_q.view(torch.long),
+            w2_q.view(torch.long),
+            a.dtype,
+            quant_scales=[
+                a1_gs,
+                w1_blockscale.view(torch.int32),
+                w1_alphas,
+                a2_gs,
+                w2_blockscale.view(torch.int32),
+                w2_alphas,
+            ],
+        )[0]
+
+    check_moe(m, n, k, e, topk, dtype, flashinfer_moe_impl, flip_w13=True)
 
 
 if __name__ == "__main__":
     test_cutlass_fp4_moe_no_graph(224, 1024, 1024, 256, 8, torch.half)
+    test_flashinfer_fp4_moe_no_graph(224, 1024, 1024, 256, 8, torch.half)
sglang/test/test_marlin_moe.py
CHANGED
@@ -4,9 +4,9 @@ from typing import Optional
 import pytest
 import torch
 from sgl_kernel import fused_marlin_moe
+from sgl_kernel.scalar_type import ScalarType, scalar_types
 
 from sglang.srt.layers.activation import SiluAndMul
-from sglang.srt.layers.quantization.scalar_type import ScalarType, scalar_types
 from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize
 
 
sglang/test/test_marlin_utils.py
CHANGED
@@ -10,13 +10,13 @@ from typing import Optional
 
 import numpy as np
 import torch
+from sgl_kernel.scalar_type import ScalarType
 
 from sglang.srt.layers.quantization.marlin_utils import (
     GPTQ_MARLIN_TILE,
     marlin_permute_scales,
     marlin_zero_points,
 )
-from sglang.srt.layers.quantization.scalar_type import ScalarType
 from sglang.srt.layers.quantization.utils import (
     get_pack_factor,
     gptq_quantize_weights,
sglang/utils.py
CHANGED
@@ -458,7 +458,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
             NOTE: Typically, the server runs in a separate terminal.
             In this notebook, we run the server and notebook code together, so their outputs are combined.
             To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
-            We are running those notebooks in a CI
+            We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
             """
         )
         break
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.5.0rc0"
+__version__ = "0.5.0rc2"
{sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.0rc0
+Version: 0.5.0rc2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -208,7 +208,7 @@ Project-URL: Homepage, https://github.com/sgl-project/sglang
 Project-URL: Bug Tracker, https://github.com/sgl-project/sglang/issues
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: Apache Software License
-Requires-Python: >=3.
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: aiohttp
@@ -222,6 +222,7 @@ Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Requires-Dist: build; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
+Requires-Dist: einops; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
@@ -230,6 +231,7 @@ Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: msgspec; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
+Requires-Dist: openai==1.99.1; extra == "runtime-common"
 Requires-Dist: openai-harmony==0.0.3; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: outlines==0.1.11; extra == "runtime-common"
@@ -246,21 +248,21 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: sentencepiece; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
-Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.55.0; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
+Requires-Dist: tiktoken; extra == "runtime-common"
+Requires-Dist: torchao==0.9.0; extra == "runtime-common"
+Requires-Dist: transformers==4.55.2; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.22; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.
+Requires-Dist: sgl-kernel==0.3.5; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist:
-Requires-Dist: flashinfer_python==0.2.10; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.11.post3; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -268,21 +270,19 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist:
-Requires-Dist: flashinfer_python==0.2.10; extra == "blackwell"
-Requires-Dist: tiktoken; extra == "blackwell"
-Requires-Dist: openai==1.99.1; extra == "blackwell"
+Requires-Dist: flashinfer_python==0.2.11.post3; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: petit_kernel==0.0.2; extra == "srt-hip"
+Requires-Dist: wave-lang==1.0.1; extra == "srt-hip"
+Provides-Extra: srt-cpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
+Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Provides-Extra: srt-cpu
-Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
-Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-npu
 Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
 Provides-Extra: openai
@@ -293,11 +293,12 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver
+Requires-Dist: torch_memory_saver==0.0.8; extra == "torch-memory-saver"
 Provides-Extra: decord
 Requires-Dist: decord; extra == "decord"
 Provides-Extra: test
 Requires-Dist: accelerate; extra == "test"
+Requires-Dist: expecttest; extra == "test"
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
@@ -308,38 +309,32 @@ Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
-Requires-Dist: sglang[litellm]; extra == "all"
 Requires-Dist: sglang[torch_memory_saver]; extra == "all"
 Requires-Dist: sglang[decord]; extra == "all"
 Provides-Extra: all-hip
 Requires-Dist: sglang[srt_hip]; extra == "all-hip"
 Requires-Dist: sglang[openai]; extra == "all-hip"
 Requires-Dist: sglang[anthropic]; extra == "all-hip"
-Requires-Dist: sglang[litellm]; extra == "all-hip"
 Requires-Dist: sglang[decord]; extra == "all-hip"
 Provides-Extra: all-xpu
 Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
 Requires-Dist: sglang[openai]; extra == "all-xpu"
 Requires-Dist: sglang[anthropic]; extra == "all-xpu"
-Requires-Dist: sglang[litellm]; extra == "all-xpu"
 Requires-Dist: sglang[decord]; extra == "all-xpu"
 Provides-Extra: all-hpu
 Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
 Requires-Dist: sglang[openai]; extra == "all-hpu"
 Requires-Dist: sglang[anthropic]; extra == "all-hpu"
-Requires-Dist: sglang[litellm]; extra == "all-hpu"
 Requires-Dist: sglang[decord]; extra == "all-hpu"
 Provides-Extra: all-cpu
 Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
 Requires-Dist: sglang[openai]; extra == "all-cpu"
 Requires-Dist: sglang[anthropic]; extra == "all-cpu"
-Requires-Dist: sglang[litellm]; extra == "all-cpu"
 Requires-Dist: sglang[decord]; extra == "all-cpu"
 Provides-Extra: all-npu
 Requires-Dist: sglang[srt_npu]; extra == "all-npu"
 Requires-Dist: sglang[openai]; extra == "all-npu"
 Requires-Dist: sglang[anthropic]; extra == "all-npu"
-Requires-Dist: sglang[litellm]; extra == "all-npu"
 Requires-Dist: sglang[decord]; extra == "all-npu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
@@ -376,17 +371,17 @@ Dynamic: license-file
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
-| [**Roadmap**](https://github.com/sgl-project/sglang/issues/
+| [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 
 <details>
 <summary>More</summary>
@@ -395,6 +390,7 @@ Dynamic: license-file
 - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -406,17 +402,17 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama,
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.
 
 ## Getting Started
-- [Install SGLang](https://docs.sglang.ai/
-- [Quick Start](https://docs.sglang.ai/
-- [Backend Tutorial](https://docs.sglang.ai/
-- [Frontend Tutorial](https://docs.sglang.ai/frontend/
-- [Contribution Guide](https://docs.sglang.ai/
+- [Install SGLang](https://docs.sglang.ai/get_started/install.html)
+- [Quick Start](https://docs.sglang.ai/basic_usage/send_request.html)
+- [Backend Tutorial](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
+- [Frontend Tutorial](https://docs.sglang.ai/references/frontend/frontend_tutorial.html)
+- [Contribution Guide](https://docs.sglang.ai/developer_guide/contribution_guide.html)
 
 ## Benchmark and Performance
 Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).