sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in a supported public registry. The information is provided for informational purposes only.
Files changed (130)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +6 -0
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +7 -7
  6. sglang/srt/disaggregation/decode.py +8 -3
  7. sglang/srt/disaggregation/mooncake/conn.py +43 -25
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  9. sglang/srt/distributed/parallel_state.py +4 -2
  10. sglang/srt/entrypoints/context.py +3 -20
  11. sglang/srt/entrypoints/engine.py +13 -8
  12. sglang/srt/entrypoints/harmony_utils.py +2 -0
  13. sglang/srt/entrypoints/http_server.py +4 -5
  14. sglang/srt/entrypoints/openai/protocol.py +0 -9
  15. sglang/srt/entrypoints/openai/serving_chat.py +59 -265
  16. sglang/srt/entrypoints/openai/tool_server.py +4 -3
  17. sglang/srt/function_call/ebnf_composer.py +1 -0
  18. sglang/srt/function_call/function_call_parser.py +2 -0
  19. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  20. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  21. sglang/srt/function_call/kimik2_detector.py +3 -3
  22. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  23. sglang/srt/jinja_template_utils.py +6 -0
  24. sglang/srt/layers/attention/aiter_backend.py +370 -107
  25. sglang/srt/layers/attention/ascend_backend.py +3 -0
  26. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  27. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  28. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  29. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  30. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  31. sglang/srt/layers/attention/vision.py +9 -1
  32. sglang/srt/layers/attention/wave_backend.py +627 -0
  33. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  34. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  35. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  36. sglang/srt/layers/communicator.py +8 -10
  37. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  38. sglang/srt/layers/linear.py +1 -0
  39. sglang/srt/layers/moe/cutlass_moe.py +11 -16
  40. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  41. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  42. sglang/srt/layers/moe/ep_moe/layer.py +60 -2
  43. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
  46. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  47. sglang/srt/layers/moe/topk.py +4 -1
  48. sglang/srt/layers/quantization/__init__.py +5 -3
  49. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  50. sglang/srt/layers/quantization/fp8_utils.py +22 -10
  51. sglang/srt/layers/quantization/modelopt_quant.py +6 -11
  52. sglang/srt/layers/quantization/mxfp4.py +4 -1
  53. sglang/srt/layers/quantization/w4afp8.py +20 -11
  54. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  55. sglang/srt/layers/rotary_embedding.py +281 -2
  56. sglang/srt/lora/backend/base_backend.py +3 -23
  57. sglang/srt/lora/layers.py +60 -114
  58. sglang/srt/lora/lora.py +17 -62
  59. sglang/srt/lora/lora_manager.py +12 -48
  60. sglang/srt/lora/lora_registry.py +20 -9
  61. sglang/srt/lora/mem_pool.py +20 -63
  62. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  63. sglang/srt/lora/utils.py +25 -58
  64. sglang/srt/managers/cache_controller.py +21 -29
  65. sglang/srt/managers/detokenizer_manager.py +1 -1
  66. sglang/srt/managers/io_struct.py +6 -6
  67. sglang/srt/managers/mm_utils.py +1 -2
  68. sglang/srt/managers/multimodal_processor.py +1 -1
  69. sglang/srt/managers/schedule_batch.py +35 -20
  70. sglang/srt/managers/schedule_policy.py +6 -6
  71. sglang/srt/managers/scheduler.py +15 -7
  72. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  73. sglang/srt/managers/tokenizer_manager.py +25 -26
  74. sglang/srt/mem_cache/allocator.py +61 -87
  75. sglang/srt/mem_cache/hicache_storage.py +1 -1
  76. sglang/srt/mem_cache/hiradix_cache.py +34 -24
  77. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  78. sglang/srt/mem_cache/memory_pool_host.py +33 -35
  79. sglang/srt/mem_cache/radix_cache.py +2 -5
  80. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  81. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  82. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  83. sglang/srt/model_executor/cuda_graph_runner.py +22 -3
  84. sglang/srt/model_executor/forward_batch_info.py +26 -5
  85. sglang/srt/model_executor/model_runner.py +129 -35
  86. sglang/srt/model_loader/loader.py +18 -6
  87. sglang/srt/models/deepseek_v2.py +74 -35
  88. sglang/srt/models/gemma2.py +0 -34
  89. sglang/srt/models/gemma3n_mm.py +8 -9
  90. sglang/srt/models/glm4.py +6 -0
  91. sglang/srt/models/glm4_moe.py +9 -9
  92. sglang/srt/models/glm4v.py +589 -0
  93. sglang/srt/models/glm4v_moe.py +400 -0
  94. sglang/srt/models/gpt_oss.py +136 -19
  95. sglang/srt/models/granite.py +0 -25
  96. sglang/srt/models/llama.py +0 -25
  97. sglang/srt/models/llama4.py +1 -1
  98. sglang/srt/models/qwen2_5_vl.py +7 -3
  99. sglang/srt/models/qwen2_audio.py +10 -9
  100. sglang/srt/models/qwen3.py +0 -24
  101. sglang/srt/models/registry.py +1 -1
  102. sglang/srt/models/torch_native_llama.py +0 -24
  103. sglang/srt/multimodal/processors/base_processor.py +23 -13
  104. sglang/srt/multimodal/processors/glm4v.py +132 -0
  105. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  106. sglang/srt/reasoning_parser.py +316 -0
  107. sglang/srt/server_args.py +115 -139
  108. sglang/srt/speculative/eagle_worker.py +16 -0
  109. sglang/srt/two_batch_overlap.py +12 -4
  110. sglang/srt/utils.py +3 -3
  111. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  112. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  113. sglang/test/doc_patch.py +59 -0
  114. sglang/test/few_shot_gsm8k.py +1 -1
  115. sglang/test/few_shot_gsm8k_engine.py +1 -1
  116. sglang/test/run_eval.py +4 -1
  117. sglang/test/simple_eval_common.py +6 -0
  118. sglang/test/simple_eval_gpqa.py +2 -0
  119. sglang/test/test_fp4_moe.py +118 -36
  120. sglang/utils.py +1 -1
  121. sglang/version.py +1 -1
  122. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +26 -30
  123. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +127 -115
  124. sglang/lang/backend/__init__.py +0 -0
  125. sglang/srt/function_call/harmony_tool_parser.py +0 -130
  126. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  127. /sglang/{api.py → lang/api.py} +0 -0
  128. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  129. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  130. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sglang
- Version: 0.5.0rc0
+ Version: 0.5.0rc1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -208,7 +208,7 @@ Project-URL: Homepage, https://github.com/sgl-project/sglang
  Project-URL: Bug Tracker, https://github.com/sgl-project/sglang/issues
  Classifier: Programming Language :: Python :: 3
  Classifier: License :: OSI Approved :: Apache Software License
- Requires-Python: >=3.9
+ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: aiohttp
@@ -222,6 +222,7 @@ Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
  Requires-Dist: build; extra == "runtime-common"
  Requires-Dist: compressed-tensors; extra == "runtime-common"
  Requires-Dist: datasets; extra == "runtime-common"
+ Requires-Dist: einops; extra == "runtime-common"
  Requires-Dist: fastapi; extra == "runtime-common"
  Requires-Dist: hf_transfer; extra == "runtime-common"
  Requires-Dist: huggingface_hub; extra == "runtime-common"
@@ -230,6 +231,7 @@ Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
  Requires-Dist: modelscope; extra == "runtime-common"
  Requires-Dist: msgspec; extra == "runtime-common"
  Requires-Dist: ninja; extra == "runtime-common"
+ Requires-Dist: openai==1.99.1; extra == "runtime-common"
  Requires-Dist: openai-harmony==0.0.3; extra == "runtime-common"
  Requires-Dist: orjson; extra == "runtime-common"
  Requires-Dist: outlines==0.1.11; extra == "runtime-common"
@@ -246,21 +248,21 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: sentencepiece; extra == "runtime-common"
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
  Requires-Dist: scipy; extra == "runtime-common"
+ Requires-Dist: timm==1.0.16; extra == "runtime-common"
+ Requires-Dist: tiktoken; extra == "runtime-common"
  Requires-Dist: torchao==0.9.0; extra == "runtime-common"
  Requires-Dist: transformers==4.55.0; extra == "runtime-common"
- Requires-Dist: timm==1.0.16; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: xgrammar==0.1.22; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
- Requires-Dist: sgl-kernel==0.3.2; extra == "srt"
+ Requires-Dist: sgl-kernel==0.3.4.post1; extra == "srt"
  Requires-Dist: torch==2.8.0; extra == "srt"
  Requires-Dist: torchaudio==2.8.0; extra == "srt"
  Requires-Dist: torchvision; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
- Requires-Dist: einops; extra == "srt"
- Requires-Dist: flashinfer_python==0.2.10; extra == "srt"
+ Requires-Dist: flashinfer_python==0.2.11.post1; extra == "srt"
  Provides-Extra: blackwell
  Requires-Dist: sglang[runtime_common]; extra == "blackwell"
  Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -268,21 +270,19 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
  Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
  Requires-Dist: torchvision; extra == "blackwell"
  Requires-Dist: cuda-python; extra == "blackwell"
- Requires-Dist: einops; extra == "blackwell"
- Requires-Dist: flashinfer_python==0.2.10; extra == "blackwell"
- Requires-Dist: tiktoken; extra == "blackwell"
- Requires-Dist: openai==1.99.1; extra == "blackwell"
+ Requires-Dist: flashinfer_python==0.2.11.post1; extra == "blackwell"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
  Requires-Dist: petit_kernel==0.0.2; extra == "srt-hip"
+ Requires-Dist: wave-lang==1.0.1; extra == "srt-hip"
+ Provides-Extra: srt-cpu
+ Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
+ Requires-Dist: einops; extra == "srt-cpu"
  Provides-Extra: srt-xpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
  Provides-Extra: srt-hpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
- Provides-Extra: srt-cpu
- Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
- Requires-Dist: einops; extra == "srt-cpu"
  Provides-Extra: srt-npu
  Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
  Provides-Extra: openai
@@ -293,11 +293,12 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
  Provides-Extra: litellm
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
  Provides-Extra: torch-memory-saver
- Requires-Dist: torch_memory_saver>=0.0.8; extra == "torch-memory-saver"
+ Requires-Dist: torch_memory_saver==0.0.8; extra == "torch-memory-saver"
  Provides-Extra: decord
  Requires-Dist: decord; extra == "decord"
  Provides-Extra: test
  Requires-Dist: accelerate; extra == "test"
+ Requires-Dist: expecttest; extra == "test"
  Requires-Dist: jsonlines; extra == "test"
  Requires-Dist: matplotlib; extra == "test"
  Requires-Dist: pandas; extra == "test"
@@ -308,38 +309,32 @@ Provides-Extra: all
  Requires-Dist: sglang[srt]; extra == "all"
  Requires-Dist: sglang[openai]; extra == "all"
  Requires-Dist: sglang[anthropic]; extra == "all"
- Requires-Dist: sglang[litellm]; extra == "all"
  Requires-Dist: sglang[torch_memory_saver]; extra == "all"
  Requires-Dist: sglang[decord]; extra == "all"
  Provides-Extra: all-hip
  Requires-Dist: sglang[srt_hip]; extra == "all-hip"
  Requires-Dist: sglang[openai]; extra == "all-hip"
  Requires-Dist: sglang[anthropic]; extra == "all-hip"
- Requires-Dist: sglang[litellm]; extra == "all-hip"
  Requires-Dist: sglang[decord]; extra == "all-hip"
  Provides-Extra: all-xpu
  Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
  Requires-Dist: sglang[openai]; extra == "all-xpu"
  Requires-Dist: sglang[anthropic]; extra == "all-xpu"
- Requires-Dist: sglang[litellm]; extra == "all-xpu"
  Requires-Dist: sglang[decord]; extra == "all-xpu"
  Provides-Extra: all-hpu
  Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
  Requires-Dist: sglang[openai]; extra == "all-hpu"
  Requires-Dist: sglang[anthropic]; extra == "all-hpu"
- Requires-Dist: sglang[litellm]; extra == "all-hpu"
  Requires-Dist: sglang[decord]; extra == "all-hpu"
  Provides-Extra: all-cpu
  Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
  Requires-Dist: sglang[openai]; extra == "all-cpu"
  Requires-Dist: sglang[anthropic]; extra == "all-cpu"
- Requires-Dist: sglang[litellm]; extra == "all-cpu"
  Requires-Dist: sglang[decord]; extra == "all-cpu"
  Provides-Extra: all-npu
  Requires-Dist: sglang[srt_npu]; extra == "all-npu"
  Requires-Dist: sglang[openai]; extra == "all-npu"
  Requires-Dist: sglang[anthropic]; extra == "all-npu"
- Requires-Dist: sglang[litellm]; extra == "all-npu"
  Requires-Dist: sglang[decord]; extra == "all-npu"
  Provides-Extra: dev
  Requires-Dist: sglang[all]; extra == "dev"
@@ -376,17 +371,17 @@ Dynamic: license-file
  | [**Documentation**](https://docs.sglang.ai/)
  | [**Join Slack**](https://slack.sglang.ai/)
  | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
- | [**Roadmap**](https://github.com/sgl-project/sglang/issues/4042)
+ | [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

  ## News
+ - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
  - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
  - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
  - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
  - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
  - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
  - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
- - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).

  <details>
  <summary>More</summary>
@@ -395,6 +390,7 @@ Dynamic: license-file
  - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
  - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
  - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+ - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -406,17 +402,17 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
  The core features include:

- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
- - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+ - **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.

  ## Getting Started
- - [Install SGLang](https://docs.sglang.ai/start/install.html)
- - [Quick Start](https://docs.sglang.ai/backend/send_request.html)
- - [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html)
- - [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html)
- - [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)
+ - [Install SGLang](https://docs.sglang.ai/get_started/install.html)
+ - [Quick Start](https://docs.sglang.ai/basic_usage/send_request.html)
+ - [Backend Tutorial](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
+ - [Frontend Tutorial](https://docs.sglang.ai/references/frontend/frontend_tutorial.html)
+ - [Contribution Guide](https://docs.sglang.ai/developer_guide/contribution_guide.html)

  ## Benchmark and Performance
  Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).
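
The METADATA hunks above are what pip's resolver reads: the version string, the Requires-Python floor, and the per-extra Requires-Dist pins. As a quick sanity check after installing the new wheel, the standard-library importlib.metadata module can read the same fields back from the local environment. The snippet below is a minimal sketch; the expected values in the comments are taken from the diff above, not from running the code, and will differ if another version is installed.

```python
# Sketch: read the installed sglang distribution's metadata with the stdlib
# and compare it against the METADATA changes shown in the diff above.
from importlib.metadata import metadata, requires, version

dist = "sglang"
print(version(dist))                      # expected per the diff: 0.5.0rc1
print(metadata(dist)["Requires-Python"])  # expected per the diff: >=3.10

# Print the pinned dependencies of the "srt" extra
# (e.g. sgl-kernel==0.3.4.post1, flashinfer_python==0.2.11.post1).
for req in requires(dist) or []:
    if 'extra == "srt"' in req:
        print(req)
```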