PyPI - sglang - Versions diffs - 0.5.0rc0__tar.gz → 0.5.0rc1__tar.gz - Mend

sglang 0.5.0rc0tar.gz → 0.5.0rc1tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (850) hide show

{sglang-0.5.0rc0/sglang.egg-info → sglang-0.5.0rc1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.0rc0
+Version: 0.5.0rc1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -208,7 +208,7 @@ Project-URL: Homepage, https://github.com/sgl-project/sglang
 Project-URL: Bug Tracker, https://github.com/sgl-project/sglang/issues
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: Apache Software License
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: aiohttp
@@ -222,6 +222,7 @@ Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Requires-Dist: build; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
+Requires-Dist: einops; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
@@ -230,6 +231,7 @@ Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: msgspec; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
+Requires-Dist: openai==1.99.1; extra == "runtime-common"
 Requires-Dist: openai-harmony==0.0.3; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: outlines==0.1.11; extra == "runtime-common"
@@ -246,21 +248,21 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: sentencepiece; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
+Requires-Dist: timm==1.0.16; extra == "runtime-common"
+Requires-Dist: tiktoken; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.55.0; extra == "runtime-common"
-Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.22; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.2; extra == "srt"
+Requires-Dist: sgl-kernel==0.3.4.post1; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: einops; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.10; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.11.post1; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -268,21 +270,19 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: einops; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.10; extra == "blackwell"
-Requires-Dist: tiktoken; extra == "blackwell"
-Requires-Dist: openai==1.99.1; extra == "blackwell"
+Requires-Dist: flashinfer_python==0.2.11.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: petit_kernel==0.0.2; extra == "srt-hip"
+Requires-Dist: wave-lang==1.0.1; extra == "srt-hip"
+Provides-Extra: srt-cpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
+Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Provides-Extra: srt-cpu
-Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
-Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-npu
 Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
 Provides-Extra: openai
@@ -293,11 +293,12 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver>=0.0.8; extra == "torch-memory-saver"
+Requires-Dist: torch_memory_saver==0.0.8; extra == "torch-memory-saver"
 Provides-Extra: decord
 Requires-Dist: decord; extra == "decord"
 Provides-Extra: test
 Requires-Dist: accelerate; extra == "test"
+Requires-Dist: expecttest; extra == "test"
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
@@ -308,38 +309,32 @@ Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
-Requires-Dist: sglang[litellm]; extra == "all"
 Requires-Dist: sglang[torch_memory_saver]; extra == "all"
 Requires-Dist: sglang[decord]; extra == "all"
 Provides-Extra: all-hip
 Requires-Dist: sglang[srt_hip]; extra == "all-hip"
 Requires-Dist: sglang[openai]; extra == "all-hip"
 Requires-Dist: sglang[anthropic]; extra == "all-hip"
-Requires-Dist: sglang[litellm]; extra == "all-hip"
 Requires-Dist: sglang[decord]; extra == "all-hip"
 Provides-Extra: all-xpu
 Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
 Requires-Dist: sglang[openai]; extra == "all-xpu"
 Requires-Dist: sglang[anthropic]; extra == "all-xpu"
-Requires-Dist: sglang[litellm]; extra == "all-xpu"
 Requires-Dist: sglang[decord]; extra == "all-xpu"
 Provides-Extra: all-hpu
 Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
 Requires-Dist: sglang[openai]; extra == "all-hpu"
 Requires-Dist: sglang[anthropic]; extra == "all-hpu"
-Requires-Dist: sglang[litellm]; extra == "all-hpu"
 Requires-Dist: sglang[decord]; extra == "all-hpu"
 Provides-Extra: all-cpu
 Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
 Requires-Dist: sglang[openai]; extra == "all-cpu"
 Requires-Dist: sglang[anthropic]; extra == "all-cpu"
-Requires-Dist: sglang[litellm]; extra == "all-cpu"
 Requires-Dist: sglang[decord]; extra == "all-cpu"
 Provides-Extra: all-npu
 Requires-Dist: sglang[srt_npu]; extra == "all-npu"
 Requires-Dist: sglang[openai]; extra == "all-npu"
 Requires-Dist: sglang[anthropic]; extra == "all-npu"
-Requires-Dist: sglang[litellm]; extra == "all-npu"
 Requires-Dist: sglang[decord]; extra == "all-npu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
@@ -376,17 +371,17 @@ Dynamic: license-file
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
-| [**Roadmap**](https://github.com/sgl-project/sglang/issues/4042)
+| [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 ## News
+- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 <details>
 <summary>More</summary>
@@ -395,6 +390,7 @@ Dynamic: license-file
 - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -406,17 +402,17 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.
 ## Getting Started
-- [Install SGLang](https://docs.sglang.ai/start/install.html)
-- [Quick Start](https://docs.sglang.ai/backend/send_request.html)
-- [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html)
-- [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html)
-- [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)
+- [Install SGLang](https://docs.sglang.ai/get_started/install.html)
+- [Quick Start](https://docs.sglang.ai/basic_usage/send_request.html)
+- [Backend Tutorial](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
+- [Frontend Tutorial](https://docs.sglang.ai/references/frontend/frontend_tutorial.html)
+- [Contribution Guide](https://docs.sglang.ai/developer_guide/contribution_guide.html)
 ## Benchmark and Performance
 Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).

{sglang-0.5.0rc0 → sglang-0.5.0rc1}/README.md RENAMED Viewed

@@ -16,17 +16,17 @@
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
-| [**Roadmap**](https://github.com/sgl-project/sglang/issues/4042)
+| [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 ## News
+- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 <details>
 <summary>More</summary>
@@ -35,6 +35,7 @@
 - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -46,17 +47,17 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.
 ## Getting Started
-- [Install SGLang](https://docs.sglang.ai/start/install.html)
-- [Quick Start](https://docs.sglang.ai/backend/send_request.html)
-- [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html)
-- [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html)
-- [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)
+- [Install SGLang](https://docs.sglang.ai/get_started/install.html)
+- [Quick Start](https://docs.sglang.ai/basic_usage/send_request.html)
+- [Backend Tutorial](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
+- [Frontend Tutorial](https://docs.sglang.ai/references/frontend/frontend_tutorial.html)
+- [Contribution Guide](https://docs.sglang.ai/developer_guide/contribution_guide.html)
 ## Benchmark and Performance
 Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).

{sglang-0.5.0rc0 → sglang-0.5.0rc1}/pyproject.toml RENAMED Viewed

@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.5.0rc0"
+version = "0.5.0rc1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 license = { file = "LICENSE" }
 classifiers = [
     "Programming Language :: Python :: 3",
@@ -21,6 +21,7 @@ runtime_common = [
     "build",
     "compressed-tensors",
     "datasets",
+    "einops",
     "fastapi",
     "hf_transfer",
     "huggingface_hub",
@@ -29,6 +30,7 @@ runtime_common = [
     "modelscope",
     "msgspec",
     "ninja",
+    "openai==1.99.1",
     "openai-harmony==0.0.3",
     "orjson",
     "outlines==0.1.11",
@@ -45,9 +47,10 @@ runtime_common = [
     "sentencepiece",
     "soundfile==0.13.1",
     "scipy",
+    "timm==1.0.16",
+    "tiktoken",
     "torchao==0.9.0",
     "transformers==4.55.0",
-    "timm==1.0.16",
     "uvicorn",
     "uvloop",
     "xgrammar==0.1.22",
@@ -55,13 +58,12 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.3.2",
+    "sgl-kernel==0.3.4.post1",
     "torch==2.8.0",
     "torchaudio==2.8.0",
     "torchvision",
     "cuda-python",
-    "einops",
-    "flashinfer_python==0.2.10",
+    "flashinfer_python==0.2.11.post1",
 ]
 blackwell = [
@@ -71,10 +73,7 @@ blackwell = [
     "torchaudio==2.8.0",
     "torchvision",
     "cuda-python",
-    "einops",
-    "flashinfer_python==0.2.10",
-    "tiktoken",
-    "openai==1.99.1",
+    "flashinfer_python==0.2.11.post1",
 ]
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
@@ -83,8 +82,12 @@ srt_hip = [
     "sglang[runtime_common]",
     "torch",
     "petit_kernel==0.0.2",
+    "wave-lang==1.0.1",
 ]
+# CPU: torch wheel for CPU needs to be installed from https://download.pytorch.org/whl/cpu
+srt_cpu = ["sglang[runtime_common]", "einops"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
 srt_xpu = ["sglang[runtime_common]"]
@@ -93,18 +96,17 @@ srt_xpu = ["sglang[runtime_common]"]
 # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
 srt_hpu = ["sglang[runtime_common]"]
-# CPU: torch wheel for CPU needs to be installed from https://download.pytorch.org/whl/cpu
-srt_cpu = ["sglang[runtime_common]", "einops"]
 # https://vllm-ascend.readthedocs.io/en/latest/installation.html
 srt_npu = ["sglang[runtime_common]"]
 openai = ["openai==1.99.1", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-torch_memory_saver = ["torch_memory_saver>=0.0.8"]
+torch_memory_saver = ["torch_memory_saver==0.0.8"]
 decord = ["decord"]
 test = [
     "accelerate",
+    "expecttest",
     "jsonlines",
     "matplotlib",
     "pandas",
@@ -112,12 +114,12 @@ test = [
     "sentence_transformers",
     "pytest",
 ]
-all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[torch_memory_saver]", "sglang[decord]"]
-all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[decord]"]
-all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[decord]"]
-all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[decord]"]
-all_cpu = ["sglang[srt_cpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[decord]"]
-all_npu = ["sglang[srt_npu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[decord]"]
+all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[torch_memory_saver]", "sglang[decord]"]
+all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
+all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
+all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
+all_cpu = ["sglang[srt_cpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
+all_npu = ["sglang[srt_npu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]

{sglang-0.5.0rc0 → sglang-0.5.0rc1}/sglang/__init__.py RENAMED Viewed

@@ -1,7 +1,8 @@
 # SGLang public APIs
 # Frontend Language APIs
-from sglang.api import (
+from sglang.global_config import global_config
+from sglang.lang.api import (
     Engine,
     Runtime,
     assistant,
@@ -25,22 +26,26 @@ from sglang.api import (
     user_end,
     video,
 )
-from sglang.global_config import global_config
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.choices import (
     greedy_token_selection,
     token_length_normalized,
     unconditional_likelihood_normalized,
 )
+# Lazy import some libraries
 from sglang.utils import LazyImport
 from sglang.version import __version__
-ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
 Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
 LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
 OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
 VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+# Runtime Engine APIs
+ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
+Engine = LazyImport("sglang.srt.entrypoints.engine", "Engine")
 __all__ = [
     "Engine",
     "Runtime",

{sglang-0.5.0rc0 → sglang-0.5.0rc1}/sglang/bench_one_batch.py RENAMED Viewed

@@ -61,6 +61,7 @@ from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.distributed.parallel_state import destroy_distributed_environment
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.managers.scheduler import Scheduler
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -300,6 +301,11 @@ def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
             disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
             spec_algorithm=SpeculativeAlgorithm.NONE,
             speculative_num_draft_tokens=None,
+            enable_two_batch_overlap=model_runner.server_args.enable_two_batch_overlap,
+            enable_deepep_moe=MoeA2ABackend(
+                model_runner.server_args.moe_a2a_backend
+            ).is_deepep(),
+            deepep_mode=DeepEPMode(model_runner.server_args.deepep_mode),
             require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
             disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
         )

{sglang-0.5.0rc0 → sglang-0.5.0rc1}/sglang/lang/chat_template.py RENAMED Viewed

@@ -505,6 +505,22 @@ register_chat_template(
     )
 )
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/glm4_v#usage-example
+register_chat_template(
+    ChatTemplate(
+        name="glm-4v",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("<|system|>\n", "\n"),
+            "user": ("<|user|>\n", "\n"),
+            "assistant": ("<|assistant|>\n", "\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=["<|user|>", "<|endoftext|>", "<|observation|>"],
+        image_token="<|image|>",
+    )
+)
 @register_chat_template_matching_function
 def match_deepseek(model_path: str):
@@ -562,6 +578,8 @@ def match_chat_ml(model_path: str):
         return "chatml"
     if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
         return "qwen2-vl"
+    if re.search(r"glm[-_]?4(\.\d+)?v", model_path, re.IGNORECASE):
+        return "glm-4v"
     if re.search(r"qwen.*(chat|instruct)", model_path, re.IGNORECASE) and not re.search(
         r"llava", model_path, re.IGNORECASE
     ):

sglang-0.5.0rc1/sglang/srt/bench_utils.py ADDED Viewed

@@ -0,0 +1,137 @@
+import os
+import sys
+from contextlib import nullcontext
+import torch
+# NOTE copied and modified from DeepGEMM
+class suppress_stdout_stderr:
+    def __enter__(self):
+        self.outnull_file = open(os.devnull, "w")
+        self.errnull_file = open(os.devnull, "w")
+        self.old_stdout_fileno_undup = sys.stdout.fileno()
+        self.old_stderr_fileno_undup = sys.stderr.fileno()
+        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
+        self.old_stderr_fileno = os.dup(sys.stderr.fileno())
+        self.old_stdout = sys.stdout
+        self.old_stderr = sys.stderr
+        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
+        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+        sys.stdout = self.outnull_file
+        sys.stderr = self.errnull_file
+        return self
+    def __exit__(self, *_):
+        sys.stdout = self.old_stdout
+        sys.stderr = self.old_stderr
+        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+        os.close(self.old_stdout_fileno)
+        os.close(self.old_stderr_fileno)
+        self.outnull_file.close()
+        self.errnull_file.close()
+# NOTE copied and modified from DeepGEMM
+def bench_kineto(
+    fn,
+    kernel_names,
+    num_tests: int = 30,
+    suppress_kineto_output: bool = False,
+    trace_path: str = None,
+    flush_l2: bool = True,
+    with_multiple_kernels: bool = False,
+):
+    # Conflict with Nsight Systems
+    using_nsys = int(os.environ.get("SGLANG_NSYS_PROFILING", 0))
+    # By default, flush L2 with an excessive 8GB memset to give the GPU some (literal) chill time without full idle
+    flush_l2_size = int(8e9 // 4)
+    # For some auto-tuning kernels with prints
+    fn()
+    # Profile
+    suppress = (
+        suppress_stdout_stderr
+        if suppress_kineto_output and not using_nsys
+        else nullcontext
+    )
+    with suppress():
+        schedule = (
+            torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
+            if not using_nsys
+            else None
+        )
+        profiler = (
+            torch.profiler.profile(
+                activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule
+            )
+            if not using_nsys
+            else nullcontext()
+        )
+        with profiler:
+            for i in range(2):
+                for _ in range(num_tests):
+                    if flush_l2:
+                        torch.empty(
+                            flush_l2_size, dtype=torch.int, device="cuda"
+                        ).zero_()
+                    fn()
+                if not using_nsys:
+                    profiler.step()
+    # Return 1 if using Nsight Systems
+    if using_nsys:
+        return 1
+    # Parse the profiling table
+    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
+    is_tuple = isinstance(kernel_names, tuple)
+    prof_lines = (
+        profiler.key_averages()
+        .table(sort_by="cuda_time_total", max_name_column_width=100)
+        .split("\n")
+    )
+    kernel_names = (kernel_names,) if isinstance(kernel_names, str) else kernel_names
+    assert all([isinstance(name, str) for name in kernel_names])
+    if not with_multiple_kernels:
+        for name in kernel_names:
+            assert (
+                sum([name in line for line in prof_lines]) == 1
+            ), f"Errors of the kernel {name} in the profiling table (table: {prof_lines})"
+    # Save chrome traces
+    if trace_path is not None:
+        profiler.export_chrome_trace(trace_path)
+    # Return average kernel times
+    units = {"ms": 1e3, "us": 1e6}
+    kernel_times = []
+    for name in kernel_names:
+        total_time = 0
+        total_num = 0
+        for line in prof_lines:
+            if name in line:
+                time_str = line.split()[-2]
+                num_str = line.split()[-1]
+                for unit, scale in units.items():
+                    if unit in time_str:
+                        total_time += (
+                            float(time_str.replace(unit, "")) / scale * int(num_str)
+                        )
+                        total_num += int(num_str)
+                        break
+        kernel_times.append(total_time / total_num)
+    return tuple(kernel_times) if is_tuple else kernel_times[0]

{sglang-0.5.0rc0 → sglang-0.5.0rc1}/sglang/srt/configs/model_config.py RENAMED Viewed

@@ -64,13 +64,12 @@ class ModelConfig:
         hybrid_kvcache_ratio: Optional[float] = None,
         model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
+        # Parse args
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
         self.model_impl = model_impl
-        # Parse args
         self.maybe_pull_model_tokenizer_from_remote()
         self.model_override_args = json.loads(model_override_args)
         kwargs = {}
@@ -139,6 +138,7 @@ class ModelConfig:
             and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
         ):
             self.hf_config.architectures[0] = "Ernie4_5_MoeForCausalLMMTP"
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -282,12 +282,10 @@ class ModelConfig:
         # Cache attributes
         self.hf_eos_token_id = self.get_hf_eos_token_id()
-        config = self.hf_config
         # multimodal
-        self.image_token_id = getattr(config, "image_token_id", None) or getattr(
-            config, "image_token_index", None
-        )
+        self.image_token_id = getattr(
+            self.hf_config, "image_token_id", None
+        ) or getattr(self.hf_config, "image_token_index", None)
     @staticmethod
     def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
@@ -661,6 +659,8 @@ multimodal_model_archs = [
     "DeepseekVL2ForCausalLM",
     "Gemma3ForConditionalGeneration",
     "Gemma3nForConditionalGeneration",
+    "Glm4vForConditionalGeneration",
+    "Glm4vMoeForConditionalGeneration",
     "Grok1VForCausalLM",
     "Grok1AForCausalLM",
     "LlavaLlamaForCausalLM",

sglang 0.5.0rc0__tar.gz → 0.5.0rc1__tar.gz

sglang 0.5.0rc0tar.gz → 0.5.0rc1tar.gz