PyPI - sglang - Versions diffs - 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl - Mend

sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (358) [hide | show]

sglang/srt/utils.py CHANGED Viewed

@@ -25,6 +25,7 @@ import json
 import logging
 import os
 import pickle
+import platform
 import random
 import re
 import resource
@@ -44,9 +45,22 @@ from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from importlib.util import find_spec
 from io import BytesIO
+from json import JSONDecodeError
 from multiprocessing.reduction import ForkingPickler
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Protocol, Set, Tuple, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    Optional,
+    Protocol,
+    Set,
+    Tuple,
+    TypeVar,
+    Union,
+)
 import numpy as np
 import psutil
@@ -125,10 +139,6 @@ builtins.FP8_E4M3_MAX = FP8_E4M3_MAX
 builtins.FP8_E4M3_MIN = FP8_E4M3_MIN
-def is_rocm() -> bool:
-    return torch.cuda.is_available() and torch.version.hip
 def is_cuda():
     return torch.cuda.is_available() and torch.version.cuda
@@ -149,6 +159,15 @@ def is_npu() -> bool:
     return hasattr(torch, "npu") and torch.npu.is_available()
+def is_cpu() -> bool:
+    machine = platform.machine().lower()
+    return (
+        machine in ("x86_64", "amd64", "i386", "i686")
+        and hasattr(torch, "cpu")
+        and torch.cpu.is_available()
+    )
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
@@ -250,7 +269,7 @@ def mark_start(name, interval=0.1, color=0, indent=0):
     torch.cuda.synchronize()
     if time_infos.get(name, None) is None:
         time_infos[name] = TimeInfo(name, interval, color, indent)
-    time_infos[name].acc_time -= time.time()
+    time_infos[name].acc_time -= time.perf_counter()
 def mark_end(name):
@@ -258,7 +277,7 @@ def mark_end(name):
     if not show_time_cost:
         return
     torch.cuda.synchronize()
-    time_infos[name].acc_time += time.time()
+    time_infos[name].acc_time += time.perf_counter()
     if time_infos[name].check():
         time_infos[name].pretty_print()
@@ -268,11 +287,11 @@ def calculate_time(show=False, min_cost_ms=0.0):
         def inner_func(*args, **kwargs):
             torch.cuda.synchronize()
             if show:
-                start_time = time.time()
+                start_time = time.perf_counter()
             result = func(*args, **kwargs)
             torch.cuda.synchronize()
             if show:
-                cost_time = (time.time() - start_time) * 1000
+                cost_time = (time.perf_counter() - start_time) * 1000
                 if cost_time > min_cost_ms:
                     print(f"Function {func.__name__} took {cost_time} ms to run.")
             return result
@@ -1851,6 +1870,8 @@ def get_cuda_version():
 def launch_dummy_health_check_server(host, port):
+    import asyncio
     import uvicorn
     from fastapi import FastAPI, Response
@@ -1866,13 +1887,27 @@ def launch_dummy_health_check_server(host, port):
         """Check the health of the http server."""
         return Response(status_code=200)
-    uvicorn.run(
+    config = uvicorn.Config(
         app,
         host=host,
         port=port,
         timeout_keep_alive=5,
-        loop="uvloop",
+        loop="auto",
+        log_config=None,
+        log_level="warning",
     )
+    server = uvicorn.Server(config=config)
+    try:
+        loop = asyncio.get_running_loop()
+        logger.info(
+            f"Dummy health check server scheduled on existing loop at {host}:{port}"
+        )
+        loop.create_task(server.serve())
+    except RuntimeError:
+        logger.info(f"Starting dummy health check server at {host}:{port}")
+        server.run()
 def create_checksum(directory: str):
@@ -1893,16 +1928,18 @@ def next_power_of_2(n: int):
 setattr(triton, "next_power_of_2", next_power_of_2)
-@contextmanager
-def empty_context(*args, **kwargs):
-    try:
-        # Setup code goes here
-        yield
-    finally:
-        # Cleanup code goes here
+class EmptyContextManager:
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_value, traceback):
         pass
+def empty_context(*args, **kwargs):
+    return EmptyContextManager()
 def add_prefix(name: str, prefix: str) -> str:
     """Add a weight path prefix to a module name.
@@ -2001,6 +2038,14 @@ class DeepEPMode(Enum):
             return DeepEPMode.normal
+def is_non_idle_and_non_empty(forward_mode, hidden_states):
+    return (
+        (forward_mode is not None)
+        and not forward_mode.is_idle()
+        and hidden_states.shape[0] > 0
+    )
 def fast_topk(values, topk, dim):
     if topk == 1:
         # Use max along the specified dimension to get both value and index
@@ -2022,6 +2067,12 @@ is_ampere_with_cuda_12_3 = lambda: _check(8)
 is_hopper_with_cuda_12_3 = lambda: _check(9)
+def is_blackwell():
+    if not is_cuda():
+        return False
+    return torch.cuda.get_device_capability()[0] == 10
 def get_free_port():
     # try ipv4
     try:
@@ -2044,6 +2095,14 @@ def get_local_ip_by_remote() -> str:
     except Exception:
         pass
+    try:
+        hostname = socket.gethostname()
+        ip = socket.gethostbyname(hostname)
+        if ip and ip != "127.0.0.1" and ip != "0.0.0.0":
+            return ip
+    except Exception:
+        pass
     # try ipv6
     try:
         s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
@@ -2077,7 +2136,6 @@ def is_fa3_default_architecture(hf_config):
         "Qwen2ForCausalLM",
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
-        "MistralForCausalLM",
         "Gemma2ForCausalLM",
         "Gemma3ForConditionalGeneration",
         "Qwen3ForCausalLM",
@@ -2104,3 +2162,123 @@ def log_info_on_rank0(logger, msg):
     if get_tensor_model_parallel_rank() == 0:
         logger.info(msg)
+def load_json_config(data: str):
+    try:
+        return json.loads(data)
+    except JSONDecodeError:
+        return json.loads(Path(data).read_text())
+def dispose_tensor(x: torch.Tensor):
+    x.set_(torch.empty((0,), device=x.device, dtype=x.dtype))
+T = TypeVar("T")
+class Withable(Generic[T]):
+    def __init__(self):
+        self._value: Optional[T] = None
+    @property
+    def value(self) -> T:
+        return self._value
+    @contextmanager
+    def with_value(self, new_value: T):
+        assert self._value is None
+        self._value = new_value
+        try:
+            yield
+        finally:
+            assert self._value is new_value
+            self._value = None
+def find_local_repo_dir(repo_id: str, revision: Optional[str] = None) -> Optional[str]:
+    import huggingface_hub as hf
+    # Build cache path
+    cache_path = os.path.join(
+        hf.constants.HF_HUB_CACHE,
+        hf.constants.REPO_ID_SEPARATOR.join(["models", *repo_id.split("/")]),
+    )
+    # Get revision from main ref if not specified
+    if not revision:
+        ref_path = os.path.join(cache_path, "refs", "main")
+        if os.path.isfile(ref_path):
+            with open(ref_path) as f:
+                revision = f.read().strip()
+    # List files from revision directory
+    if revision:
+        rev_dir = os.path.join(cache_path, "snapshots", revision)
+        if os.path.isdir(rev_dir):
+            return rev_dir
+    return None
+def read_system_prompt_from_file(model_name: str) -> str:
+    """Read system prompt from a file in the HuggingFace cache directory.
+    Args:
+        model_name: The model name to construct the file path
+    Returns:
+        The system prompt content from the file, or empty string if file not found
+    """
+    try:
+        local_repo_dir = find_local_repo_dir(model_name)
+        if local_repo_dir:
+            system_prompt_file = os.path.join(local_repo_dir, "SYSTEM_PROMPT.txt")
+            if os.path.exists(system_prompt_file):
+                with open(system_prompt_file, "r", encoding="utf-8") as f:
+                    return f.read()
+        return ""
+    except Exception:
+        # If anything fails, return empty string
+        return ""
+def bind_or_assign(target, source):
+    if target is not None:
+        target.copy_(source)
+        return target
+    else:
+        return source
+def support_triton(backend: str) -> bool:
+    return backend not in ["torch_native", "intel_amx"]
+try:
+    import sgl_kernel
+    is_intel_amx_backend_available = hasattr(
+        torch.ops.sgl_kernel, "convert_weight_packed"
+    )
+except:
+    is_intel_amx_backend_available = False
+def cpu_has_amx_support():
+    return torch._C._cpu._is_amx_tile_supported() and is_intel_amx_backend_available
+class LazyValue:
+    def __init__(self, creator: Callable):
+        self._creator = creator
+        self._value = None
+    @property
+    def value(self):
+        if self._creator is not None:
+            self._value = self._creator()
+            self._creator = None
+        return self._value

sglang/test/runners.py CHANGED Viewed

@@ -26,6 +26,7 @@ from transformers import (
     AutoModelForCausalLM,
     AutoModelForVision2Seq,
     AutoProcessor,
+    GenerationConfig,
 )
 from sglang.srt.entrypoints.engine import Engine
@@ -382,13 +383,17 @@ class HFRunner:
                 model = base_model
             outputs = model.generate(
-                input_ids,
-                do_sample=False,
-                temperature=None,
-                top_p=None,
-                max_new_tokens=max_new_tokens,
-                return_dict_in_generate=True,
-                output_scores=(not output_str_only),
+                input_ids=input_ids,
+                generation_config=GenerationConfig(
+                    do_sample=False,
+                    temperature=None,
+                    top_p=None,
+                    max_new_tokens=max_new_tokens,
+                    return_dict_in_generate=True,
+                    output_scores=(not output_str_only),
+                    # make sure to disable compile
+                    disable_compile=True,
+                ),
             )
             text = tokenizer.decode(
@@ -450,6 +455,7 @@ class SRTRunner:
         torch_dtype: torch.dtype,
         model_type: str,
         tp_size: int = 1,
+        impl: str = "auto",
         port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
         lora_paths: List[str] = None,
         max_loras_per_batch: int = 4,
@@ -470,6 +476,7 @@ class SRTRunner:
         speculative_num_draft_tokens: Optional[int] = None,
         disable_overlap_schedule: bool = False,
         disable_custom_all_reduce: bool = False,
+        torchao_config: Optional[str] = None,
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
@@ -488,6 +495,8 @@ class SRTRunner:
             tp_size=tp_size,
             dtype=get_dtype_str(torch_dtype),
             port=port,
+            impl=impl,
+            torchao_config=torchao_config,
             mem_fraction_static=mem_fraction_static,
             trust_remote_code=trust_remote_code,
             is_embedding=not self.is_generation,

sglang/test/send_one.py CHANGED Viewed

@@ -127,6 +127,10 @@ def send_one_prompt(args):
     if args.batch_size > 1:
         ret = ret[0]
+    if response.status_code != 200:
+        print(ret)
+        return 0, 0
     latency = ret["meta_info"]["e2e_latency"]
     if "spec_verify_ct" in ret["meta_info"]:

sglang/test/test_cutlass_moe.py ADDED Viewed

@@ -0,0 +1,278 @@
+import argparse
+import time
+import torch
+import triton  # Added import
+import triton.testing  # Added import
+from transformers import AutoConfig
+from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+def get_model_config(tp_size: int):
+    config = AutoConfig.from_pretrained(
+        "deepseek-ai/deepseek-R1", trust_remote_code=True
+    )
+    E = config.n_routed_experts
+    topk = config.num_experts_per_tok
+    intermediate_size = config.moe_intermediate_size
+    shard_intermediate_size = 2 * intermediate_size // tp_size
+    return {
+        "num_experts": E,
+        "topk": topk,
+        "hidden_size": config.hidden_size,
+        "shard_intermediate_size": shard_intermediate_size,
+        "dtype": config.torch_dtype,
+        "block_shape": config.quantization_config["weight_block_size"],
+    }
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+    """Converts tensor to FP8 E4M3, scaling values to fit the range."""
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    # Calculate max absolute value safely
+    max_val = torch.max(torch.abs(tensor))
+    # Avoid division by zero if tensor is all zeros
+    if max_val == 0:
+        scale_factor = 1.0
+    else:
+        # Scale factor to bring the max value to finfo.max
+        scale_factor = finfo.max / max_val
+    # Apply scaling
+    scaled_tensor = tensor * scale_factor
+    # Clamp and convert
+    fp8_tensor = scaled_tensor.clamp(min=finfo.min, max=finfo.max).to(
+        dtype=torch.float8_e4m3fn
+    )
+    return fp8_tensor
+def run_test(tp_size, batch_size, model_config, check=False):
+    print(f"\n--- Batch Size: {batch_size} ---")
+    torch.set_default_device("cuda")
+    torch.cuda.manual_seed_all(42)  # For reproducible random numbers
+    E = model_config["num_experts"]
+    topk = model_config["topk"]
+    H = model_config["hidden_size"]
+    I = model_config["shard_intermediate_size"]
+    block_shape = model_config["block_shape"]  # Tuple (BLOCK_N, BLOCK_K)
+    dtype = model_config["dtype"]  # e.g., torch.bfloat16
+    print(
+        f"Config: E={E}, topk={topk}, H={H}, I_shard={I}, dtype={dtype}, block_shape={block_shape}"
+    )
+    # --- Input Data ---
+    # Use bf16/fp16 for input activation based on model config
+    x = torch.randn((batch_size, H), device="cuda", dtype=dtype) * 0.0001
+    # --- Weights (Generate in higher precision, then convert to FP8) ---
+    # Generate weights suitable for FP8 conversion (e.g., scaled appropriately)
+    w1_hp = (
+        torch.randn((E, I, H), device="cuda", dtype=torch.float32) * 0.00001 + 0.00001
+    )
+    w2_hp = (
+        torch.randn((E, H, I // 2), device="cuda", dtype=torch.float32) * 0.00001
+        + 0.00001
+    )
+    w1 = to_fp8(w1_hp)
+    w2 = to_fp8(w2_hp)
+    # --- Scales for FP8 Weights ---
+    block_n, block_k = block_shape
+    # Calculate number of blocks needed
+    w1_blocks_dim1 = (I + block_n - 1) // block_n
+    w1_blocks_dim2 = (H + block_k - 1) // block_k
+    w2_blocks_dim1 = (H + block_n - 1) // block_n
+    w2_blocks_dim2 = (I // 2 + block_k - 1) // block_k
+    # Scales are typically float32 or float16/bfloat16
+    scale_dtype = torch.float32  # Or dtype if scales match model dtype
+    w1_scale = torch.full(
+        (E, w1_blocks_dim1, w1_blocks_dim2), 1, device="cuda", dtype=scale_dtype
+    )  # Avoid zero scales
+    w2_scale = torch.full(
+        (E, w2_blocks_dim1, w2_blocks_dim2), 1, device="cuda", dtype=scale_dtype
+    )  # Avoid zero scales
+    # --- Routing Information ---
+    topk_weights = torch.softmax(
+        torch.rand(batch_size, topk, device="cuda", dtype=dtype), dim=-1
+    )
+    topk_ids = torch.randint(0, E, (batch_size, topk), dtype=torch.int32, device="cuda")
+    a1_strides = torch.full((E,), H, dtype=torch.int64, device="cuda")
+    c1_strides = torch.full((E,), I, dtype=torch.int64, device="cuda")
+    a2_strides = torch.full((E,), I // 2, dtype=torch.int64, device="cuda")
+    c2_strides = torch.full((E,), H, dtype=torch.int64, device="cuda")
+    workspace = torch.empty(
+        (7182 * 1024), device="cuda", dtype=torch.uint8
+    )  # Allocate sufficient workspace
+    # Pointer arrays (often filled by the kernel or a prep step, but needed as args)
+    a_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    b_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    out_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    a_scales_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    b_scales_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    expert_offsets = torch.empty((E + 1,), dtype=torch.int32, device="cuda")
+    problem_sizes1 = torch.empty((E, 3), dtype=torch.int32, device="cuda")
+    problem_sizes2 = torch.empty((E, 3), dtype=torch.int32, device="cuda")
+    # --- Lambdas for Benchmarking ---
+    cutlass_lambda = lambda: cutlass_fused_experts_fp8(
+        x,
+        w1.transpose(1, 2),  # Transposed
+        w2.transpose(1, 2),  # Transposed
+        w1_scale.transpose(1, 2),
+        w2_scale.transpose(1, 2),
+        topk_weights,
+        topk_ids,
+        a1_strides,
+        c1_strides,
+        a2_strides,
+        c2_strides,
+        workspace,
+        a_ptrs,
+        b_ptrs,
+        out_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+    )
+    # Note: Triton expects non-transposed weights
+    triton_lambda = lambda: fused_experts(
+        x,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        inplace=False,  # Use False for benchmarking to avoid side effects if run multiple times
+        activation="silu",  # Assuming SiLU activation common in MoEs
+        use_fp8_w8a8=True,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        block_shape=block_shape,
+    )
+    # --- Warmup ---
+    print("Warming up...")
+    for _ in range(10):
+        _ = cutlass_lambda()
+        _ = triton_lambda()
+    torch.cuda.synchronize()
+    # --- Benchmarking ---
+    quantiles = [0.5, 0.2, 0.8]
+    print(f"Benchmarking Cutlass fused_experts...")
+    cutlass_ms, cutlass_min, cutlass_max = triton.testing.do_bench_cudagraph(
+        cutlass_lambda, rep=1000, quantiles=quantiles
+    )
+    print(f"Benchmarking Triton fused_experts...")
+    triton_ms, triton_min, triton_max = triton.testing.do_bench_cudagraph(
+        triton_lambda, rep=1000, quantiles=quantiles
+    )
+    print(
+        f"Cutlass fused_experts time: {cutlass_ms:.3f} ms (median) [{cutlass_min:.3f} - {cutlass_max:.3f}]"
+    )
+    print(
+        f"Triton  fused_experts time: {triton_ms:.3f} ms (median) [{triton_min:.3f} - {triton_max:.3f}]"
+    )
+    # --- Correctness Check ---
+    if check:
+        print("Running correctness check...")
+        with torch.no_grad():
+            # Run CUTLASS version (requires transposed weights)
+            y_cutlass = cutlass_fused_experts_fp8(
+                x,
+                w1.transpose(1, 2),  # Transposed
+                w2.transpose(1, 2),  # Transposed
+                w1_scale.transpose(1, 2),
+                w2_scale.transpose(1, 2),
+                topk_weights,
+                topk_ids,
+                a1_strides,
+                c1_strides,
+                a2_strides,
+                c2_strides,
+                workspace,
+                a_ptrs,
+                b_ptrs,
+                out_ptrs,
+                a_scales_ptrs,
+                b_scales_ptrs,
+                expert_offsets,
+                problem_sizes1,
+                problem_sizes2,
+            )
+            # Run Triton version (requires original shape weights, use inplace=False)
+            y_triton = fused_experts(
+                x,
+                w1,  # Original shape
+                w2,  # Original shape
+                topk_weights,
+                topk_ids,
+                inplace=False,  # Important: Use False to get output tensor
+                activation="silu",
+                use_fp8_w8a8=True,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                block_shape=block_shape,
+            )
+        # Ensure outputs are same dtype for comparison
+        y_cutlass = y_cutlass.to(dtype)
+        y_triton = y_triton.to(dtype)
+        abs_error = torch.abs(y_cutlass - y_triton)
+        rel_error = abs_error / torch.clamp(torch.abs(y_triton), min=1e-2)
+        max_abs_err = abs_error.max().item()
+        max_rel_err = rel_error.max().item()
+        print("y_cutlass:", y_cutlass[:, :10])
+        print("y_triton:", y_triton[:, :10])
+        print(f"Max absolute error: {max_abs_err:.6f}")
+        print(f"Max relative error: {max_rel_err:.6f}")
+        # Tolerance might need adjustment based on FP8 specifics and kernel differences
+        # FP8 comparisons often require higher tolerance than FP16/BF16
+        assert max_rel_err < 5e-1, f"Relative error too high! {max_rel_err}"
+        print("Correctness check passed.")
+def main(tp_size=8, batch_sizes=[1, 4, 8, 16, 32, 64, 128, 256, 512], check=False):
+    model_config = get_model_config(tp_size)
+    print("Model Config:", model_config)
+    for batch_size in batch_sizes:
+        run_test(tp_size, batch_size, model_config, check)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--tp-size", type=int, default=8, help="Tensor Parallel size")
+    parser.add_argument(
+        "--batch-sizes",
+        type=int,
+        nargs="+",
+        default=[1, 4, 8, 16, 32, 64, 128, 256, 512],  # Adjusted default
+        help="List of batch sizes to test",
+    )
+    parser.add_argument("--check", action="store_true", help="Enable check mode")
+    args = parser.parse_args()
+    print(f"Running benchmarks with TP size: {args.tp_size}")
+    print(f"Testing batch sizes: {args.batch_sizes}")
+    main(tp_size=args.tp_size, batch_sizes=args.batch_sizes, check=args.check)

sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl