sglang-0.5.0rc0-py3-none-any.whl → sglang-0.5.0rc1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +6 -0
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +7 -7
- sglang/srt/disaggregation/decode.py +8 -3
- sglang/srt/disaggregation/mooncake/conn.py +43 -25
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/distributed/parallel_state.py +4 -2
- sglang/srt/entrypoints/context.py +3 -20
- sglang/srt/entrypoints/engine.py +13 -8
- sglang/srt/entrypoints/harmony_utils.py +2 -0
- sglang/srt/entrypoints/http_server.py +4 -5
- sglang/srt/entrypoints/openai/protocol.py +0 -9
- sglang/srt/entrypoints/openai/serving_chat.py +59 -265
- sglang/srt/entrypoints/openai/tool_server.py +4 -3
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/jinja_template_utils.py +6 -0
- sglang/srt/layers/attention/aiter_backend.py +370 -107
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +9 -1
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +8 -10
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/moe/cutlass_moe.py +11 -16
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +60 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +4 -1
- sglang/srt/layers/quantization/__init__.py +5 -3
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +22 -10
- sglang/srt/layers/quantization/modelopt_quant.py +6 -11
- sglang/srt/layers/quantization/mxfp4.py +4 -1
- sglang/srt/layers/quantization/w4afp8.py +20 -11
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +281 -2
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +12 -48
- sglang/srt/lora/lora_registry.py +20 -9
- sglang/srt/lora/mem_pool.py +20 -63
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +21 -29
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +6 -6
- sglang/srt/managers/mm_utils.py +1 -2
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +35 -20
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +15 -7
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/tokenizer_manager.py +25 -26
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +34 -24
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +33 -35
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +22 -3
- sglang/srt/model_executor/forward_batch_info.py +26 -5
- sglang/srt/model_executor/model_runner.py +129 -35
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/models/deepseek_v2.py +74 -35
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +8 -9
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +9 -9
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +136 -19
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/reasoning_parser.py +316 -0
- sglang/srt/server_args.py +115 -139
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +12 -4
- sglang/srt/utils.py +3 -3
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +26 -30
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +127 -115
- sglang/lang/backend/__init__.py +0 -0
- sglang/srt/function_call/harmony_tool_parser.py +0 -130
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.0rc0
+Version: 0.5.0rc1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
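For reviewers who want to confirm which of the two releases is actually installed in an environment, the wheel's `Version:` field can be read back at runtime. This is a minimal sketch using the standard-library `importlib.metadata`; the expected string `0.5.0rc1` is taken from the hunk above, and the assertion is only illustrative.

```python
# Sketch: confirm the installed sglang wheel matches the Version: field shown above.
from importlib.metadata import version, PackageNotFoundError

try:
    installed = version("sglang")  # reads Version: from the installed METADATA
except PackageNotFoundError:
    installed = None

print("installed sglang:", installed)
assert installed == "0.5.0rc1", f"expected 0.5.0rc1, found {installed}"
```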
@@ -208,7 +208,7 @@ Project-URL: Homepage, https://github.com/sgl-project/sglang
 Project-URL: Bug Tracker, https://github.com/sgl-project/sglang/issues
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: Apache Software License
-Requires-Python: >=3.
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: aiohttp
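The `Requires-Python` floor moves to 3.10, so pip will refuse to install 0.5.0rc1 on older interpreters. A small pre-flight check (plain standard library, not part of the package) could look like this:

```python
# Sketch: fail fast if the interpreter no longer satisfies Requires-Python: >=3.10.
import sys

if sys.version_info < (3, 10):
    raise SystemExit(
        f"sglang 0.5.0rc1 requires Python >= 3.10, "
        f"but this interpreter is {sys.version.split()[0]}"
    )
print("Python version OK:", sys.version.split()[0])
```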
@@ -222,6 +222,7 @@ Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Requires-Dist: build; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
+Requires-Dist: einops; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
@@ -230,6 +231,7 @@ Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: msgspec; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
+Requires-Dist: openai==1.99.1; extra == "runtime-common"
 Requires-Dist: openai-harmony==0.0.3; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: outlines==0.1.11; extra == "runtime-common"
@@ -246,21 +248,21 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: sentencepiece; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
+Requires-Dist: timm==1.0.16; extra == "runtime-common"
+Requires-Dist: tiktoken; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.55.0; extra == "runtime-common"
-Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.22; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.
+Requires-Dist: sgl-kernel==0.3.4.post1; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist:
-Requires-Dist: flashinfer_python==0.2.10; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.11.post1; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
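Because the `srt` extra tightens several pins in this release (for example `sgl-kernel==0.3.4.post1` and `flashinfer_python==0.2.11.post1`), stale environments are an easy way to break an upgrade. The sketch below only reads installed distribution versions with `importlib.metadata`; the pin list is a hand-copied subset of the `Requires-Dist` lines above, not an exhaustive check.

```python
# Sketch: compare a few of the pins from the "srt" extra against what is installed.
from importlib.metadata import version, PackageNotFoundError

PINS = {  # subset of the Requires-Dist pins shown in the hunk above
    "sgl-kernel": "0.3.4.post1",
    "flashinfer_python": "0.2.11.post1",
    "torch": "2.8.0",
}

for name, expected in PINS.items():
    try:
        found = version(name)
    except PackageNotFoundError:
        found = "<not installed>"
    status = "ok" if found == expected else "MISMATCH"
    print(f"{name}: expected {expected}, found {found} [{status}]")
```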
@@ -268,21 +270,19 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist:
-Requires-Dist: flashinfer_python==0.2.10; extra == "blackwell"
-Requires-Dist: tiktoken; extra == "blackwell"
-Requires-Dist: openai==1.99.1; extra == "blackwell"
+Requires-Dist: flashinfer_python==0.2.11.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: petit_kernel==0.0.2; extra == "srt-hip"
+Requires-Dist: wave-lang==1.0.1; extra == "srt-hip"
+Provides-Extra: srt-cpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
+Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Provides-Extra: srt-cpu
-Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
-Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-npu
 Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
 Provides-Extra: openai
@@ -293,11 +293,12 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver
+Requires-Dist: torch_memory_saver==0.0.8; extra == "torch-memory-saver"
 Provides-Extra: decord
 Requires-Dist: decord; extra == "decord"
 Provides-Extra: test
 Requires-Dist: accelerate; extra == "test"
+Requires-Dist: expecttest; extra == "test"
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
@@ -308,38 +309,32 @@ Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
-Requires-Dist: sglang[litellm]; extra == "all"
 Requires-Dist: sglang[torch_memory_saver]; extra == "all"
 Requires-Dist: sglang[decord]; extra == "all"
 Provides-Extra: all-hip
 Requires-Dist: sglang[srt_hip]; extra == "all-hip"
 Requires-Dist: sglang[openai]; extra == "all-hip"
 Requires-Dist: sglang[anthropic]; extra == "all-hip"
-Requires-Dist: sglang[litellm]; extra == "all-hip"
 Requires-Dist: sglang[decord]; extra == "all-hip"
 Provides-Extra: all-xpu
 Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
 Requires-Dist: sglang[openai]; extra == "all-xpu"
 Requires-Dist: sglang[anthropic]; extra == "all-xpu"
-Requires-Dist: sglang[litellm]; extra == "all-xpu"
 Requires-Dist: sglang[decord]; extra == "all-xpu"
 Provides-Extra: all-hpu
 Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
 Requires-Dist: sglang[openai]; extra == "all-hpu"
 Requires-Dist: sglang[anthropic]; extra == "all-hpu"
-Requires-Dist: sglang[litellm]; extra == "all-hpu"
 Requires-Dist: sglang[decord]; extra == "all-hpu"
 Provides-Extra: all-cpu
 Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
 Requires-Dist: sglang[openai]; extra == "all-cpu"
 Requires-Dist: sglang[anthropic]; extra == "all-cpu"
-Requires-Dist: sglang[litellm]; extra == "all-cpu"
 Requires-Dist: sglang[decord]; extra == "all-cpu"
 Provides-Extra: all-npu
 Requires-Dist: sglang[srt_npu]; extra == "all-npu"
 Requires-Dist: sglang[openai]; extra == "all-npu"
 Requires-Dist: sglang[anthropic]; extra == "all-npu"
-Requires-Dist: sglang[litellm]; extra == "all-npu"
 Requires-Dist: sglang[decord]; extra == "all-npu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
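Note that `sglang[litellm]` is no longer pulled in by the `all`, `all-hip`, `all-xpu`, `all-hpu`, `all-cpu`, or `all-npu` meta-extras; the `litellm` extra itself still exists, so installs that relied on it transitively now have to request it explicitly (for example `pip install "sglang[all,litellm]"`). A small, hedged check for whether the dependency is still importable in an existing environment:

```python
# Sketch: detect whether litellm is still present after upgrading with the "all" extra,
# since 0.5.0rc1 no longer includes sglang[litellm] in the all* meta-extras.
import importlib.util

if importlib.util.find_spec("litellm") is None:
    print('litellm not found; install it explicitly, e.g. pip install "sglang[all,litellm]"')
else:
    print("litellm is available")
```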
@@ -376,17 +371,17 @@ Dynamic: license-file
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
-| [**Roadmap**](https://github.com/sgl-project/sglang/issues/
+| [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 
 <details>
 <summary>More</summary>
@@ -395,6 +390,7 @@ Dynamic: license-file
 - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -406,17 +402,17 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama,
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.
 
 ## Getting Started
-- [Install SGLang](https://docs.sglang.ai/
-- [Quick Start](https://docs.sglang.ai/
-- [Backend Tutorial](https://docs.sglang.ai/
-- [Frontend Tutorial](https://docs.sglang.ai/frontend/
-- [Contribution Guide](https://docs.sglang.ai/
+- [Install SGLang](https://docs.sglang.ai/get_started/install.html)
+- [Quick Start](https://docs.sglang.ai/basic_usage/send_request.html)
+- [Backend Tutorial](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
+- [Frontend Tutorial](https://docs.sglang.ai/references/frontend/frontend_tutorial.html)
+- [Contribution Guide](https://docs.sglang.ai/developer_guide/contribution_guide.html)
 
 ## Benchmark and Performance
 Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).