PyPI - sglang - Versions diffs - 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl - Mend

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (359) hide show

sglang/test/test_cutlass_moe.py CHANGED Viewed

@@ -6,7 +6,7 @@ import triton  # Added import
 import triton.testing  # Added import
 from transformers import AutoConfig
-from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts
+from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
@@ -125,7 +125,7 @@ def run_test(tp_size, batch_size, model_config, check=False):
     problem_sizes2 = torch.empty((E, 3), dtype=torch.int32, device="cuda")
     # --- Lambdas for Benchmarking ---
-    cutlass_lambda = lambda: cutlass_fused_experts(
+    cutlass_lambda = lambda: cutlass_fused_experts_fp8(
         x,
         w1.transpose(1, 2),  # Transposed
         w2.transpose(1, 2),  # Transposed
@@ -193,7 +193,7 @@ def run_test(tp_size, batch_size, model_config, check=False):
         print("Running correctness check...")
         with torch.no_grad():
             # Run CUTLASS version (requires transposed weights)
-            y_cutlass = cutlass_fused_experts(
+            y_cutlass = cutlass_fused_experts_fp8(
                 x,
                 w1.transpose(1, 2),  # Transposed
                 w2.transpose(1, 2),  # Transposed

sglang/test/test_fp4_moe.py ADDED Viewed

@@ -0,0 +1,248 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+from sgl_kernel import scaled_fp4_quant
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
+from sglang.srt.layers.moe.topk import select_experts
+if torch.cuda.get_device_capability() < (10, 0):
+    pytest.skip(
+        reason="Nvfp4 Requires compute capability of 10 or above.",
+        allow_module_level=True,
+    )
+kE2M1ToFloat = torch.tensor(
+    [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32
+)
+FLOAT8_E4M3_MAX = 448.0
+FLOAT4_E2M1_MAX = 6.0
+def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size):
+    m_tiles = (m + 128 - 1) // 128
+    f = block_size * 4
+    k_tiles = (k + f - 1) // f
+    tmp = torch.reshape(a_sf_swizzled, (1, m_tiles, k_tiles, 32, 4, 4))
+    tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5))
+    out = tmp.reshape(m_tiles * 128, k_tiles * f // block_size)
+    return out[0:m, 0:k]
+def dequantize_nvfp4_to_dtype(
+    tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16
+):
+    """Dequantize the fp4 tensor back to high precision."""
+    # Two fp4 values are packed into one uint8.
+    assert tensor_fp4.dtype == torch.uint8
+    m, packed_k = tensor_fp4.shape
+    k = packed_k * 2
+    tensor_f32 = break_fp4_bytes(tensor_fp4, dtype)
+    tensor_f32 = tensor_f32.reshape(m, k // block_size, block_size)
+    tensor_sf = tensor_sf.view(torch.float8_e4m3fn)
+    tensor_sf = convert_swizzled_to_linear(tensor_sf, m, k, block_size)
+    tensor_sf_dtype = tensor_sf.to(torch.float32) / global_scale
+    # scale the tensor
+    out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k)
+    return out.to(dtype=dtype)
+def break_fp4_bytes(a, dtype):
+    assert a.dtype == torch.uint8
+    m, n = a.shape
+    # Vectorized nibble processing
+    a_flat = a.flatten()
+    high = (a_flat & 0xF0) >> 4  # Upper nibbles
+    low = a_flat & 0x0F  # Lower nibbles
+    # Combine nibbles for batch processing
+    combined = torch.stack((low, high), dim=1).flatten()
+    # Vectorized sign and magnitude extraction
+    signs = (combined & 0x08).to(torch.bool)  # Sign bits
+    abs_vals = (combined & 0x07).to(torch.long)  # Magnitude indices
+    # Device-aware lookup and sign application
+    kE2M1 = kE2M1ToFloat.to(device=a.device)
+    values = kE2M1[abs_vals] * torch.where(signs, -1.0, 1.0)
+    # Reshape to final form
+    return values.reshape(m, n * 2).to(dtype=dtype)
+MNK_FACTORS = [
+    (2, 1024, 1024),
+    (2, 1024, 1536),
+    (2, 3072, 1024),
+    (2, 3072, 1536),
+    (64, 1024, 1024),
+    (64, 1024, 1536),
+    (64, 3072, 1024),
+    (64, 2048, 1024),
+    (224, 1024, 1024),
+    (224, 1024, 1536),
+]
+# Reference implementation of torch_moe
+def torch_moe(a, w1, w2, score, topk, expert_map):
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+    if expert_map is not None:
+        topk_ids = expert_map[topk_ids]
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            out[mask] = SiluAndMul()(a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(
+                0, 1
+            )
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [40, 64, 256])
+@pytest.mark.parametrize("topk", [1, 6, 8])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@torch.inference_mode()
+def test_cutlass_fp4_moe_no_graph(
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+):
+    torch.manual_seed(7)
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    quant_blocksize = 16
+    round_up = lambda x, y: (x + y - 1) // y * y
+    sf_w1_2n = round_up(2 * n, 128)
+    sf_w1_k = round_up(k // quant_blocksize, 4)
+    w1_blockscale = torch.empty(
+        (e, sf_w1_2n, sf_w1_k), device="cuda", dtype=torch.float8_e4m3fn
+    )
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+    sf_w2_k = round_up(k, 128)
+    sf_w2_n = round_up(n // quant_blocksize, 4)
+    w2_blockscale = torch.empty(
+        (e, sf_w2_k, sf_w2_n), device="cuda", dtype=torch.float8_e4m3fn
+    )
+    w1_q = torch.empty((e, 2 * n, k // 2), device="cuda", dtype=torch.uint8)
+    w2_q = torch.empty((e, k, n // 2), device="cuda", dtype=torch.uint8)
+    w1_gs = torch.empty((e,), device="cuda", dtype=torch.float32)
+    w2_gs = torch.empty((e,), device="cuda", dtype=torch.float32)
+    for expert in range(e):
+        w1_amax = torch.abs(w1).max().to(torch.float32)
+        w2_amax = torch.abs(w2).max().to(torch.float32)
+        w1_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax
+        w2_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax
+        w1_q[expert], w1_blockscale[expert] = scaled_fp4_quant(
+            w1[expert], w1_gs[expert]
+        )
+        w2_q[expert], w2_blockscale[expert] = scaled_fp4_quant(
+            w2[expert], w2_gs[expert]
+        )
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    topk_weights, topk_ids = select_experts(
+        hidden_states=a,
+        router_logits=score,
+        top_k=topk,
+        use_grouped_topk=False,
+        renormalize=False,
+    )
+    a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
+    a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
+    # strides for the cutlass moe_fp4 kernel
+    ab_strides_13 = torch.full(
+        (e,), w1_q.shape[2] * 2, dtype=torch.int64, device=w1_q.device
+    )
+    c_strides_13 = torch.full(
+        (e,), w1_q.shape[1], dtype=torch.int64, device=w1_q.device
+    )
+    ab_strides_2 = torch.full(
+        (e,), w2_q.shape[2] * 2, dtype=torch.int64, device=w2_q.device
+    )
+    c_strides_2 = torch.full((e,), w2_q.shape[1], dtype=torch.int64, device=w2_q.device)
+    params = CutlassMoEParams(
+        CutlassMoEType.BlockscaledFP4,
+        device=a.device,
+        num_experts=e,
+        intermediate_size_per_partition=n,  # n
+        hidden_size=k,
+    )  # k
+    cutlass_output = cutlass_moe_fp4(
+        a=a,
+        a1_gscale=a1_gs,
+        w1_fp4=w1_q,
+        w1_blockscale=w1_blockscale,
+        w1_alphas=(1 / w1_gs),
+        a2_gscale=a2_gs,
+        w2_fp4=w2_q,
+        w2_blockscale=w2_blockscale,
+        w2_alphas=(1 / w2_gs),
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        params=params,
+        apply_router_weight_on_input=False,
+    )
+    # Reference check:
+    a_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a.flatten(), dim=-1)
+    ).to(torch.float32)
+    a_fp4, a_scale_interleaved = scaled_fp4_quant(a, a_global_scale)
+    _, m_k = a_fp4.shape
+    a_in_dtype = dequantize_nvfp4_to_dtype(
+        a_fp4,
+        a_scale_interleaved,
+        a_global_scale,
+        dtype=a.dtype,
+        device=a.device,
+        block_size=quant_blocksize,
+    )
+    w1_d = torch.empty((e, 2 * n, k), device="cuda", dtype=dtype)
+    w2_d = torch.empty((e, k, n), device="cuda", dtype=dtype)
+    for idx in range(0, e):
+        w1_d[idx] = dequantize_nvfp4_to_dtype(
+            w1_q[idx],
+            w1_blockscale[idx],
+            w1_gs[idx],
+            dtype=w1.dtype,
+            device=w1.device,
+            block_size=quant_blocksize,
+        )
+        w2_d[idx] = dequantize_nvfp4_to_dtype(
+            w2_q[idx],
+            w2_blockscale[idx],
+            w2_gs[idx],
+            dtype=w2.dtype,
+            device=w2.device,
+            block_size=quant_blocksize,
+        )
+    torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk, None)
+    torch.testing.assert_close(torch_output, cutlass_output, atol=1e-1, rtol=1e-1)
+if __name__ == "__main__":
+    test_cutlass_fp4_moe_no_graph(224, 1024, 1024, 256, 8, torch.half)

sglang/test/test_utils.py CHANGED Viewed

@@ -26,6 +26,7 @@ from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import (
     get_bool_env_var,
+    get_device,
     is_port_available,
     kill_process_tree,
     retry,
@@ -40,6 +41,8 @@ DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
 # MLA test models
+DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
+DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST = "cross-encoder/ms-marco-MiniLM-L6-v2"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
@@ -80,12 +83,11 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-In
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
-DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
-DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 def is_in_ci():
@@ -93,6 +95,11 @@ def is_in_ci():
     return get_bool_env_var("SGLANG_IS_IN_CI")
+def is_in_amd_ci():
+    """Return whether it is in an AMD CI runner."""
+    return get_bool_env_var("SGLANG_AMD_CI")
 if is_in_ci():
     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
         5000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
@@ -103,6 +110,9 @@ else:
     )
 DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
+if is_in_amd_ci():
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
     assert url is not None
@@ -300,13 +310,33 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
     return args
+def auto_config_device() -> str:
+    """Auto-config available device platform"""
+    try:
+        device = get_device()
+    except (RuntimeError, ImportError) as e:
+        print(f"Warning: {e} - Falling back to CPU")
+        device = "cpu"
+    return device
 def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
     parser.add_argument("--parallel", type=int, default=64)
     parser.add_argument("--host", type=str, default="http://127.0.0.1")
     parser.add_argument("--port", type=int, default=30000)
     parser.add_argument("--backend", type=str, default="srt")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="auto",
+        choices=["auto", "cuda", "rocm", "cpu"],
+        help="Device type (auto/cuda/rocm/cpu). Auto will detect available platforms",
+    )
     parser.add_argument("--result-file", type=str, default="result.jsonl")
     args = parser.parse_args()
     return args
@@ -392,11 +422,25 @@ def popen_launch_server(
     base_url: str,
     timeout: float,
     api_key: Optional[str] = None,
-    other_args: list[str] = (),
+    other_args: list[str] = [],
     env: Optional[dict] = None,
     return_stdout_stderr: Optional[tuple] = None,
+    device: str = "auto",
     pd_separated: bool = False,
 ):
+    """Launch a server process with automatic device detection.
+    Args:
+        device: Device type ("auto", "cuda", "rocm" or "cpu").
+                If "auto", will detect available platforms automatically.
+    """
+    # Auto-detect device if needed
+    if device == "auto":
+        device = auto_config_device()
+        print(f"Auto-configed device: {device}", flush=True)
+        other_args = list(other_args)
+        other_args += ["--device", str(device)]
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -452,6 +496,15 @@ def popen_launch_server(
     start_time = time.perf_counter()
     with requests.Session() as session:
         while time.perf_counter() - start_time < timeout:
+            return_code = process.poll()
+            if return_code is not None:
+                # Server failed to start (non-zero exit code) or crashed
+                raise Exception(
+                    f"Server process exited with code {return_code}. "
+                    "Check server logs for errors."
+                )
             try:
                 headers = {
                     "Content-Type": "application/json; charset=utf-8",
@@ -622,6 +675,7 @@ def get_benchmark_args(
     disable_stream=False,
     disable_ignore_eos=False,
     seed: int = 0,
+    device="auto",
     pd_separated: bool = False,
 ):
     return SimpleNamespace(
@@ -652,6 +706,7 @@ def get_benchmark_args(
         profile=None,
         lora_name=None,
         prompt_suffix="",
+        device=device,
         pd_separated=pd_separated,
     )
@@ -671,7 +726,10 @@ def run_bench_serving(
     disable_ignore_eos=False,
     need_warmup=False,
     seed: int = 0,
+    device="auto",
 ):
+    if device == "auto":
+        device = auto_config_device()
     # Launch the server
     base_url = DEFAULT_URL_FOR_TEST
     process = popen_launch_server(
@@ -695,6 +753,7 @@ def run_bench_serving(
         disable_stream=disable_stream,
         disable_ignore_eos=disable_ignore_eos,
         seed=seed,
+        device=device,
     )
     try:
@@ -745,6 +804,18 @@ def run_bench_serving_multi(
 def run_bench_one_batch(model, other_args):
+    """Launch a offline process with automatic device detection.
+    Args:
+        device: Device type ("auto", "cuda", "rocm" or "cpu").
+                If "auto", will detect available platforms automatically.
+    """
+    # Auto-detect device if needed
+    device = auto_config_device()
+    print(f"Auto-configed device: {device}", flush=True)
+    other_args += ["--device", str(device)]
     command = [
         "python3",
         "-m",
@@ -876,20 +947,24 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
     return rouge_l_scores
-STDERR_FILENAME = "stderr.txt"
-STDOUT_FILENAME = "stdout.txt"
+STDERR_FILENAME = "/tmp/stderr.txt"
+STDOUT_FILENAME = "/tmp/stdout.txt"
 def read_output(output_lines: List[str], filename: str = STDERR_FILENAME):
     """Print the output in real time with another thread."""
     while not os.path.exists(filename):
-        time.sleep(1)
+        time.sleep(0.01)
     pt = 0
     while pt >= 0:
         if pt > 0 and not os.path.exists(filename):
             break
-        lines = open(filename).readlines()
+        try:
+            lines = open(filename).readlines()
+        except FileNotFoundError:
+            print(f"{pt=}, {os.path.exists(filename)=}")
+            raise
         for line in lines[pt:]:
             print(line, end="", flush=True)
             output_lines.append(line)

sglang/utils.py CHANGED Viewed

@@ -512,3 +512,12 @@ async def async_stream_and_merge(llm, prompt, sampling_params):
         cleaned_chunk = trim_overlap(final_text, chunk_text)
         final_text += cleaned_chunk
         yield cleaned_chunk  # yield the non-overlapping portion
+def resolve_obj_by_qualname(qualname: str) -> Any:
+    """
+    Resolve an object by its fully qualified name.
+    """
+    module_name, obj_name = qualname.rsplit(".", 1)
+    module = importlib.import_module(module_name)
+    return getattr(module, obj_name)

sglang/version.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "0.4.6.post5"
+__version__ = "0.4.7.post1"

{sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6.post5
+Version: 0.4.7.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -218,6 +218,7 @@ Requires-Dist: numpy
 Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
+Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
@@ -239,30 +240,32 @@ Requires-Dist: pynvml; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
+Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.51.1; extra == "runtime-common"
+Requires-Dist: transformers==4.52.3; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
-Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.4; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
-Requires-Dist: torch==2.6.0; extra == "srt"
-Requires-Dist: torchvision==0.21.0; extra == "srt"
+Requires-Dist: sgl-kernel==0.1.9; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
+Requires-Dist: torch==2.7.1; extra == "srt"
+Requires-Dist: torchaudio==2.7.1; extra == "srt"
+Requires-Dist: torchvision==0.22.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Requires-Dist: einops; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
-Requires-Dist: torch; extra == "blackwell"
-Requires-Dist: torchvision; extra == "blackwell"
+Requires-Dist: torch==2.7.1; extra == "blackwell"
+Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
+Requires-Dist: torchvision==0.22.1; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.5; extra == "blackwell"
+Requires-Dist: flashinfer_python==0.2.6.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -277,7 +280,7 @@ Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-hpu"
 Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
-Requires-Dist: torch; extra == "srt-cpu"
+Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-npu
 Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
@@ -368,7 +371,7 @@ Dynamic: license-file
 --------------------------------------------------------------------------------
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
+| [**Blog**](https://lmsys.org/blog/2025-05-05-large-scale-ep/)
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
@@ -400,7 +403,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -419,7 +422,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 ## Adoption and Sponsorship
-SGLang has been deployed at large scale, serving trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
+SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl