sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +6 -1
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +8 -7
- sglang/srt/disaggregation/decode.py +8 -4
- sglang/srt/disaggregation/mooncake/conn.py +43 -25
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/distributed/parallel_state.py +4 -2
- sglang/srt/entrypoints/context.py +3 -20
- sglang/srt/entrypoints/engine.py +13 -8
- sglang/srt/entrypoints/harmony_utils.py +2 -0
- sglang/srt/entrypoints/http_server.py +68 -5
- sglang/srt/entrypoints/openai/protocol.py +2 -9
- sglang/srt/entrypoints/openai/serving_chat.py +60 -265
- sglang/srt/entrypoints/openai/serving_completions.py +1 -0
- sglang/srt/entrypoints/openai/tool_server.py +4 -3
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/jinja_template_utils.py +6 -0
- sglang/srt/layers/attention/aiter_backend.py +370 -107
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +55 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +24 -27
- sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
- sglang/srt/layers/attention/vision.py +9 -1
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +11 -13
- sglang/srt/layers/dp_attention.py +118 -27
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/logits_processor.py +12 -18
- sglang/srt/layers/moe/cutlass_moe.py +11 -16
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +60 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +4 -1
- sglang/srt/layers/multimodal.py +156 -40
- sglang/srt/layers/quantization/__init__.py +10 -35
- sglang/srt/layers/quantization/awq.py +15 -16
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +22 -10
- sglang/srt/layers/quantization/gptq.py +12 -17
- sglang/srt/layers/quantization/marlin_utils.py +15 -5
- sglang/srt/layers/quantization/modelopt_quant.py +58 -41
- sglang/srt/layers/quantization/mxfp4.py +20 -3
- sglang/srt/layers/quantization/utils.py +52 -2
- sglang/srt/layers/quantization/w4afp8.py +20 -11
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +281 -2
- sglang/srt/layers/sampler.py +5 -2
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +66 -116
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +12 -48
- sglang/srt/lora/lora_registry.py +20 -9
- sglang/srt/lora/mem_pool.py +20 -63
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +24 -29
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -6
- sglang/srt/managers/mm_utils.py +1 -2
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +43 -49
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +18 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/tokenizer_manager.py +53 -44
- sglang/srt/mem_cache/allocator.py +39 -214
- sglang/srt/mem_cache/allocator_ascend.py +158 -0
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +34 -24
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +33 -35
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -23
- sglang/srt/model_executor/forward_batch_info.py +33 -14
- sglang/srt/model_executor/model_runner.py +179 -81
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/models/deepseek_nextn.py +2 -1
- sglang/srt/models/deepseek_v2.py +79 -38
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +8 -9
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +11 -11
- sglang/srt/models/glm4_moe_nextn.py +2 -1
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +142 -20
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +10 -27
- sglang/srt/models/llama4.py +19 -6
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +20 -5
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_classification.py +78 -0
- sglang/srt/models/qwen3_moe.py +18 -5
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +6 -2
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/operations.py +17 -2
- sglang/srt/reasoning_parser.py +316 -0
- sglang/srt/sampling/sampling_batch_info.py +7 -4
- sglang/srt/server_args.py +142 -140
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +16 -12
- sglang/srt/utils.py +3 -3
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_marlin_moe.py +1 -1
- sglang/test/test_marlin_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
- sglang/lang/backend/__init__.py +0 -0
- sglang/srt/function_call/harmony_tool_parser.py +0 -130
- sglang/srt/layers/quantization/scalar_type.py +0 -352
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0
sglang/test/test_fp4_moe.py
CHANGED
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
+from typing import Callable
+
 import pytest
 import torch
+from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
 from sgl_kernel import scaled_fp4_quant
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -111,15 +114,16 @@ def torch_moe(a, w1, w2, score, topk, expert_map):
     ).sum(dim=1)
 
 
-@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
-@pytest.mark.parametrize("e", [40, 64, 256])
-@pytest.mark.parametrize("topk", [1, 6, 8])
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
-@torch.inference_mode()
-def test_cutlass_fp4_moe_no_graph(
-    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+def check_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    moe_impl: Callable,
+    flip_w13: bool,
 ):
-
     torch.manual_seed(7)
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
@@ -167,38 +171,18 @@ def test_cutlass_fp4_moe_no_graph(
 
     a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
     a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
-
-    ab_strides_13 = torch.full(
-        (e,), w1_q.shape[2] * 2, dtype=torch.int64, device=w1_q.device
-    )
-    c_strides_13 = torch.full(
-        (e,), w1_q.shape[1], dtype=torch.int64, device=w1_q.device
-    )
-    ab_strides_2 = torch.full(
-        (e,), w2_q.shape[2] * 2, dtype=torch.int64, device=w2_q.device
-    )
-    c_strides_2 = torch.full((e,), w2_q.shape[1], dtype=torch.int64, device=w2_q.device)
-    params = CutlassMoEParams(
-        CutlassMoEType.BlockscaledFP4,
-        device=a.device,
-        num_experts=e,
-        intermediate_size_per_partition=n,  # n
-        hidden_size=k,
-    )  # k
-    cutlass_output = cutlass_moe_fp4(
+    test_output = moe_impl(
         a=a,
-        a1_gscale=a1_gs,
-        w1_fp4=w1_q,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        w1_q=w1_q,
+        w2_q=w2_q,
+        a1_gs=a1_gs,
         w1_blockscale=w1_blockscale,
         w1_alphas=(1 / w1_gs),
-        a2_gscale=a2_gs,
-        w2_fp4=w2_q,
+        a2_gs=a2_gs,
         w2_blockscale=w2_blockscale,
         w2_alphas=(1 / w2_gs),
-        topk_weights=topk_weights,
-        topk_ids=topk_ids,
-        params=params,
-        apply_router_weight_on_input=False,
     )
 
     # Reference check:
@@ -237,10 +221,108 @@ def test_cutlass_fp4_moe_no_graph(
         block_size=quant_blocksize,
     )
 
+    if flip_w13:
+        dim = -2
+        size = w1_d.size(dim)
+        assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
+        half = size // 2
+        # Reorder weight
+        w1, w3 = w1_d.split(half, dim=dim)
+        w1_d = torch.cat([w3, w1], dim=dim).contiguous()
+
     torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk, None)
 
-    torch.testing.assert_close(torch_output, cutlass_output, atol=1e-1, rtol=1e-1)
+    torch.testing.assert_close(torch_output, test_output, atol=1e-1, rtol=1e-1)
+
+
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [40, 64, 256])
+@pytest.mark.parametrize("topk", [1, 6, 8])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@torch.inference_mode()
+def test_cutlass_fp4_moe_no_graph(
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+):
+    def cutlass_moe_impl(
+        a,
+        topk_weights,
+        topk_ids,
+        w1_q,
+        w2_q,
+        a1_gs,
+        w1_blockscale,
+        w1_alphas,
+        a2_gs,
+        w2_blockscale,
+        w2_alphas,
+    ):
+        params = CutlassMoEParams(
+            CutlassMoEType.BlockscaledFP4,
+            device=a.device,
+            num_experts=e,
+            intermediate_size_per_partition=n,  # n
+            hidden_size=k,
+        )  # k
+        return cutlass_moe_fp4(
+            a=a,
+            a1_gscale=a1_gs,
+            w1_fp4=w1_q,
+            w1_blockscale=w1_blockscale,
+            w1_alphas=w1_alphas,
+            a2_gscale=a2_gs,
+            w2_fp4=w2_q,
+            w2_blockscale=w2_blockscale,
+            w2_alphas=w2_alphas,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            params=params,
+            apply_router_weight_on_input=False,
+        )
+
+    check_moe(m, n, k, e, topk, dtype, cutlass_moe_impl, flip_w13=False)
+
+
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [40, 64, 256])
+@pytest.mark.parametrize("topk", [1, 6, 8])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@torch.inference_mode()
+def test_flashinfer_fp4_moe_no_graph(
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+):
+    def flashinfer_moe_impl(
+        a,
+        topk_weights,
+        topk_ids,
+        w1_q,
+        w2_q,
+        a1_gs,
+        w1_blockscale,
+        w1_alphas,
+        a2_gs,
+        w2_blockscale,
+        w2_alphas,
+    ):
+        return flashinfer_cutlass_fused_moe(
+            a,
+            topk_ids.to(torch.int),
+            topk_weights,
+            w1_q.view(torch.long),
+            w2_q.view(torch.long),
+            a.dtype,
+            quant_scales=[
+                a1_gs,
+                w1_blockscale.view(torch.int32),
+                w1_alphas,
+                a2_gs,
+                w2_blockscale.view(torch.int32),
+                w2_alphas,
+            ],
+        )[0]
+
+    check_moe(m, n, k, e, topk, dtype, flashinfer_moe_impl, flip_w13=True)
 
 
 if __name__ == "__main__":
     test_cutlass_fp4_moe_no_graph(224, 1024, 1024, 256, 8, torch.half)
+    test_flashinfer_fp4_moe_no_graph(224, 1024, 1024, 256, 8, torch.half)
sglang/test/test_marlin_moe.py
CHANGED
@@ -4,9 +4,9 @@ from typing import Optional
 import pytest
 import torch
 from sgl_kernel import fused_marlin_moe
+from sgl_kernel.scalar_type import ScalarType, scalar_types
 
 from sglang.srt.layers.activation import SiluAndMul
-from sglang.srt.layers.quantization.scalar_type import ScalarType, scalar_types
 from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize
 
 
sglang/test/test_marlin_utils.py
CHANGED
@@ -10,13 +10,13 @@ from typing import Optional
 
 import numpy as np
 import torch
+from sgl_kernel.scalar_type import ScalarType
 
 from sglang.srt.layers.quantization.marlin_utils import (
     GPTQ_MARLIN_TILE,
     marlin_permute_scales,
     marlin_zero_points,
 )
-from sglang.srt.layers.quantization.scalar_type import ScalarType
 from sglang.srt.layers.quantization.utils import (
     get_pack_factor,
     gptq_quantize_weights,
sglang/utils.py
CHANGED
@@ -458,7 +458,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
             NOTE: Typically, the server runs in a separate terminal.
             In this notebook, we run the server and notebook code together, so their outputs are combined.
             To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
-            We are running those notebooks in a CI
+            We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
             """
         )
         break
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.5.0rc0"
+__version__ = "0.5.0rc2"
{sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.0rc0
+Version: 0.5.0rc2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -208,7 +208,7 @@ Project-URL: Homepage, https://github.com/sgl-project/sglang
 Project-URL: Bug Tracker, https://github.com/sgl-project/sglang/issues
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: Apache Software License
-Requires-Python: >=3.
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: aiohttp
@@ -222,6 +222,7 @@ Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Requires-Dist: build; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
+Requires-Dist: einops; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
@@ -230,6 +231,7 @@ Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: msgspec; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
+Requires-Dist: openai==1.99.1; extra == "runtime-common"
 Requires-Dist: openai-harmony==0.0.3; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: outlines==0.1.11; extra == "runtime-common"
@@ -246,21 +248,21 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: sentencepiece; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
-Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.55.0; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
+Requires-Dist: tiktoken; extra == "runtime-common"
+Requires-Dist: torchao==0.9.0; extra == "runtime-common"
+Requires-Dist: transformers==4.55.2; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.22; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.
+Requires-Dist: sgl-kernel==0.3.5; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist:
-Requires-Dist: flashinfer_python==0.2.10; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.11.post3; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -268,21 +270,19 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist:
-Requires-Dist: flashinfer_python==0.2.10; extra == "blackwell"
-Requires-Dist: tiktoken; extra == "blackwell"
-Requires-Dist: openai==1.99.1; extra == "blackwell"
+Requires-Dist: flashinfer_python==0.2.11.post3; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: petit_kernel==0.0.2; extra == "srt-hip"
+Requires-Dist: wave-lang==1.0.1; extra == "srt-hip"
+Provides-Extra: srt-cpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
+Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Provides-Extra: srt-cpu
-Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
-Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-npu
 Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
 Provides-Extra: openai
@@ -293,11 +293,12 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver
+Requires-Dist: torch_memory_saver==0.0.8; extra == "torch-memory-saver"
 Provides-Extra: decord
 Requires-Dist: decord; extra == "decord"
 Provides-Extra: test
 Requires-Dist: accelerate; extra == "test"
+Requires-Dist: expecttest; extra == "test"
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
@@ -308,38 +309,32 @@ Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
-Requires-Dist: sglang[litellm]; extra == "all"
 Requires-Dist: sglang[torch_memory_saver]; extra == "all"
 Requires-Dist: sglang[decord]; extra == "all"
 Provides-Extra: all-hip
 Requires-Dist: sglang[srt_hip]; extra == "all-hip"
 Requires-Dist: sglang[openai]; extra == "all-hip"
 Requires-Dist: sglang[anthropic]; extra == "all-hip"
-Requires-Dist: sglang[litellm]; extra == "all-hip"
 Requires-Dist: sglang[decord]; extra == "all-hip"
 Provides-Extra: all-xpu
 Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
 Requires-Dist: sglang[openai]; extra == "all-xpu"
 Requires-Dist: sglang[anthropic]; extra == "all-xpu"
-Requires-Dist: sglang[litellm]; extra == "all-xpu"
 Requires-Dist: sglang[decord]; extra == "all-xpu"
 Provides-Extra: all-hpu
 Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
 Requires-Dist: sglang[openai]; extra == "all-hpu"
 Requires-Dist: sglang[anthropic]; extra == "all-hpu"
-Requires-Dist: sglang[litellm]; extra == "all-hpu"
 Requires-Dist: sglang[decord]; extra == "all-hpu"
 Provides-Extra: all-cpu
 Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
 Requires-Dist: sglang[openai]; extra == "all-cpu"
 Requires-Dist: sglang[anthropic]; extra == "all-cpu"
-Requires-Dist: sglang[litellm]; extra == "all-cpu"
 Requires-Dist: sglang[decord]; extra == "all-cpu"
 Provides-Extra: all-npu
 Requires-Dist: sglang[srt_npu]; extra == "all-npu"
 Requires-Dist: sglang[openai]; extra == "all-npu"
 Requires-Dist: sglang[anthropic]; extra == "all-npu"
-Requires-Dist: sglang[litellm]; extra == "all-npu"
 Requires-Dist: sglang[decord]; extra == "all-npu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
@@ -376,17 +371,17 @@ Dynamic: license-file
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
-| [**Roadmap**](https://github.com/sgl-project/sglang/issues/
+| [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 
 <details>
 <summary>More</summary>
@@ -395,6 +390,7 @@ Dynamic: license-file
 - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -406,17 +402,17 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama,
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.
 
 ## Getting Started
-- [Install SGLang](https://docs.sglang.ai/
-- [Quick Start](https://docs.sglang.ai/
-- [Backend Tutorial](https://docs.sglang.ai/
-- [Frontend Tutorial](https://docs.sglang.ai/frontend/
-- [Contribution Guide](https://docs.sglang.ai/
+- [Install SGLang](https://docs.sglang.ai/get_started/install.html)
+- [Quick Start](https://docs.sglang.ai/basic_usage/send_request.html)
+- [Backend Tutorial](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
+- [Frontend Tutorial](https://docs.sglang.ai/references/frontend/frontend_tutorial.html)
+- [Contribution Guide](https://docs.sglang.ai/developer_guide/contribution_guide.html)
 
 ## Benchmark and Performance
 Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).