sglang 0.4.0.post1__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +6 -6
- sglang/bench_one_batch.py +1 -0
- sglang/bench_serving.py +9 -1
- sglang/check_env.py +140 -48
- sglang/lang/backend/runtime_endpoint.py +1 -0
- sglang/lang/chat_template.py +32 -0
- sglang/llama3_eval.py +316 -0
- sglang/srt/aio_rwlock.py +100 -0
- sglang/srt/configs/model_config.py +8 -1
- sglang/srt/constrained/xgrammar_backend.py +4 -1
- sglang/srt/layers/attention/flashinfer_backend.py +51 -5
- sglang/srt/layers/attention/triton_backend.py +16 -25
- sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
- sglang/srt/layers/linear.py +20 -2
- sglang/srt/layers/logits_processor.py +133 -95
- sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +18 -39
- sglang/srt/layers/moe/fused_moe_native.py +46 -0
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +174 -119
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +17 -49
- sglang/srt/layers/moe/topk.py +191 -0
- sglang/srt/layers/quantization/__init__.py +5 -50
- sglang/srt/layers/quantization/fp8.py +221 -36
- sglang/srt/layers/quantization/fp8_kernel.py +278 -0
- sglang/srt/layers/quantization/fp8_utils.py +90 -1
- sglang/srt/layers/radix_attention.py +8 -1
- sglang/srt/layers/sampler.py +27 -5
- sglang/srt/layers/torchao_utils.py +31 -0
- sglang/srt/managers/detokenizer_manager.py +37 -17
- sglang/srt/managers/io_struct.py +39 -10
- sglang/srt/managers/schedule_batch.py +54 -34
- sglang/srt/managers/schedule_policy.py +64 -5
- sglang/srt/managers/scheduler.py +171 -136
- sglang/srt/managers/tokenizer_manager.py +184 -133
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +2 -2
- sglang/srt/mem_cache/memory_pool.py +15 -8
- sglang/srt/mem_cache/radix_cache.py +12 -2
- sglang/srt/model_executor/cuda_graph_runner.py +25 -11
- sglang/srt/model_executor/model_runner.py +28 -14
- sglang/srt/model_parallel.py +66 -5
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_v2.py +67 -18
- sglang/srt/models/gemma2.py +34 -0
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/granite.py +517 -0
- sglang/srt/models/grok.py +73 -9
- sglang/srt/models/llama.py +22 -0
- sglang/srt/models/llama_classification.py +11 -23
- sglang/srt/models/llama_reward.py +0 -2
- sglang/srt/models/llava.py +37 -14
- sglang/srt/models/mixtral.py +2 -2
- sglang/srt/models/olmoe.py +1 -1
- sglang/srt/models/qwen2.py +20 -0
- sglang/srt/models/qwen2_moe.py +1 -1
- sglang/srt/models/xverse_moe.py +1 -1
- sglang/srt/openai_api/adapter.py +8 -0
- sglang/srt/openai_api/protocol.py +9 -4
- sglang/srt/server.py +2 -1
- sglang/srt/server_args.py +19 -9
- sglang/srt/utils.py +40 -54
- sglang/test/test_block_fp8.py +341 -0
- sglang/test/test_utils.py +3 -2
- sglang/utils.py +10 -3
- sglang/version.py +1 -1
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/METADATA +12 -7
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/RECORD +73 -67
- sglang/srt/layers/fused_moe_patch.py +0 -133
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/LICENSE +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/WHEEL +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -141,6 +141,7 @@ class ServerArgs:
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
+    triton_attention_num_kv_splits: int = 8
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
 
@@ -220,12 +221,10 @@ class ServerArgs:
         if self.enable_dp_attention:
             self.dp_size = self.tp_size
             self.chunked_prefill_size = self.chunked_prefill_size // 2
-            self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
             self.disable_overlap_schedule = True
             logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
-                f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
                 "Overlap scheduler is disabled."
@@ -282,7 +281,15 @@
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=[
+            choices=[
+                "auto",
+                "pt",
+                "safetensors",
+                "npcache",
+                "dummy",
+                "gguf",
+                "bitsandbytes",
+            ],
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -293,7 +300,9 @@
             "a numpy cache to speed up the loading. "
             '"dummy" will initialize the weights with random values, '
             "which is mainly for profiling."
-            '"gguf" will load the weights in the gguf format. '
+            '"gguf" will load the weights in the gguf format. '
+            '"bitsandbytes" will load the weights using bitsandbytes '
+            "quantization.",
         )
         parser.add_argument(
             "--trust-remote-code",
@@ -689,11 +698,6 @@
             action="store_true",
             help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
         )
-        parser.add_argument(
-            "--disable-nan-detection",
-            action="store_true",
-            help="Disable the NaN detection for better performance.",
-        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
@@ -753,6 +757,12 @@
             help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
             "This only affects Triton attention kernels.",
         )
+        parser.add_argument(
+            "--triton-attention-num-kv-splits",
+            type=int,
+            default=ServerArgs.triton_attention_num_kv_splits,
+            help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
+        )
         parser.add_argument(
            "--num-continuous-decode-steps",
            type=int,
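The new `--triton-attention-num-kv-splits` flag exposes the Triton flash-decoding kernel's KV-split count as a server option (default 8). A minimal, hedged sketch of how the flag reaches `ServerArgs`, assuming sglang's usual `add_cli_args` argparse wiring; the model path and `--attention-backend` value are illustrative:

```python
# Hedged sketch (not part of the diff): parse the new flag with ServerArgs' own
# argparse helper. Model path and backend choice below are illustrative only.
import argparse

from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(
    [
        "--model-path", "meta-llama/Llama-3.1-8B-Instruct",
        "--attention-backend", "triton",
        "--triton-attention-num-kv-splits", "16",  # default is 8; larger favors long contexts
    ]
)
assert args.triton_attention_num_kv_splits == 16
# ServerArgs.from_cli_args(args) would then carry the value into the server config.
```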
sglang/srt/utils.py
CHANGED
@@ -14,6 +14,7 @@
 """Common utilities."""
 
 import base64
+import dataclasses
 import ipaddress
 import itertools
 import json
@@ -92,7 +93,7 @@ def is_flashinfer_available():
     """
     if not get_bool_env_var("SGLANG_IS_FLASHINFER_AVAILABLE", default="true"):
         return False
-    return torch.cuda.is_available() and
+    return torch.cuda.is_available() and torch.version.cuda
 
 
 def is_ipv6(address):
@@ -169,7 +170,7 @@ def calculate_time(show=False, min_cost_ms=0.0):
     return wrapper
 
 
-def get_available_gpu_memory(device, gpu_id, distributed=False):
+def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True):
     """
     Get available memory for cuda:gpu_id device.
     When distributed is True, the available memory is the minimum available memory of all GPUs.
@@ -184,7 +185,8 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
                 "which may cause useless memory allocation for torch CUDA context.",
             )
 
-        torch.cuda.empty_cache()
+        if empty_cache:
+            torch.cuda.empty_cache()
         free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
 
     elif device == "xpu":
@@ -196,7 +198,9 @@
                 f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ",
                 "which may cause useless memory allocation for torch XPU context.",
             )
-        torch.xpu.empty_cache()
+
+        if empty_cache:
+            torch.xpu.empty_cache()
         used_memory = torch.xpu.memory_allocated()
         total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
         free_gpu_memory = total_gpu_memory - used_memory
@@ -1068,9 +1072,6 @@ def get_device_name(device_id: int = 0) -> str:
     if hasattr(torch, "cuda") and torch.cuda.is_available():
         return torch.cuda.get_device_name(device_id)
 
-    if hasattr(torch, "hip") and torch.hip.is_available():
-        return torch.hip.get_device_name(device_id)
-
     if hasattr(torch, "xpu") and torch.xpu.is_available():
         return torch.xpu.get_device_name(device_id)
 
@@ -1083,9 +1084,6 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
     if hasattr(torch, "cuda") and torch.cuda.is_available():
         major, minor = torch.cuda.get_device_capability(device_id)
 
-    if hasattr(torch, "hip") and torch.hip.is_available():
-        major, minor = torch.cuda.get_device_capability(device_id)
-
     if hasattr(torch, "xpu") and torch.xpu.is_available():
         major, minor, *_ = torch.xpu.get_device_capability(device_id)["version"].split(
             "."
@@ -1241,49 +1239,37 @@ def cuda_device_count_stateless() -> int:
     return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES", None))
 
 
-def
-
-
-
-
-    """
-    Determine whether to use tensor cores for attention computation.
-
-    Args:
-        kv_cache_dtype: Data type of the KV cache
-        num_attention_heads: Number of attention heads
-        num_kv_heads: Number of key/value heads
-
-    Returns:
-        bool: Whether to use tensor cores
-    """
-    # Try to use environment variable first
-    env_override = os.environ.get("SGLANG_FLASHINFER_USE_TENSOR_CORE")
-    if env_override is not None:
-        return env_override.lower() == "true"
-
-    # Try to use _grouped_size_compiled_for_decode_kernels if available
-    # This is for flashinfer <=0.1.6. Otherwise, there is an accuracy bug
-    try:
-        from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
-
-        if not _grouped_size_compiled_for_decode_kernels(
-            num_attention_heads,
-            num_kv_heads,
-        ):
-            return True
+def dataclass_to_string_truncated(data, max_length=2048):
+    if isinstance(data, str):
+        if len(data) > max_length:
+            half_length = max_length // 2
+            return f'"{data[:half_length]} ... {data[-half_length:]}"'
         else:
-            return
-
-
-
-
-
-
-
-
-
-
+            return f'"{data}"'
+    elif isinstance(data, (list, tuple)):
+        if len(data) > max_length:
+            half_length = max_length // 2
+            return str(data[:half_length]) + " ... " + str(data[-half_length:])
+        else:
+            return str(data)
+    elif isinstance(data, dict):
+        return (
+            "{"
+            + ", ".join(
+                f"{k}: {dataclass_to_string_truncated(v, max_length)}"
+                for k, v in data.items()
+            )
+            + "}"
+        )
+    elif dataclasses.is_dataclass(data):
+        fields = dataclasses.fields(data)
+        return (
+            f"{data.__class__.__name__}("
+            + ", ".join(
+                f"{f.name}={dataclass_to_string_truncated(getattr(data, f.name), max_length)}"
+                for f in fields
+            )
+            + ")"
+        )
     else:
-        return
+        return str(data)
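Alongside the removals above, `utils.py` gains `dataclass_to_string_truncated`, which renders strings, containers, and dataclasses with long values elided around the middle. A hedged usage sketch; the `DummyReq` dataclass and its values are illustrative, not from sglang:

```python
# Hedged usage sketch of the helper added above; DummyReq is illustrative only.
import dataclasses

from sglang.srt.utils import dataclass_to_string_truncated


@dataclasses.dataclass
class DummyReq:
    rid: str
    text: str


req = DummyReq(rid="abc123", text="x" * 10_000)
# Long fields are shortened so logged request objects stay readable,
# e.g. DummyReq(rid="abc123", text="xxxx ... xxxx")
print(dataclass_to_string_truncated(req, max_length=64))
```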
sglang/test/test_block_fp8.py
ADDED
@@ -0,0 +1,341 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.quantization.fp8_kernel import (
+    per_token_group_quant_fp8,
+    w8a8_block_fp8_matmul,
+)
+
+
+# For test
+def native_per_token_group_quant_fp8(
+    x, group_size, eps=1e-10, dtype=torch.float8_e4m3fn
+):
+    """Function to perform per-token-group quantization on an input tensor `x` using native torch.
+
+    It converts the tensor values into float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+    Note that only `torch.float8_e4m3fn` is supported for now.
+    """
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    finfo = torch.finfo(dtype)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    x_ = x.reshape(x.numel() // group_size, group_size)
+    amax = x_.abs().max(dim=-1, keepdim=True)[0].clamp(min=eps).to(torch.float32)
+    x_s = amax / fp8_max
+    x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype)
+    x_q = x_q.reshape(x.shape)
+    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size,))
+
+    return x_q, x_s
+
+
+class TestPerTokenGroupQuantFP8(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    GROUP_SIZE = [64, 128, 256, 512]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_token_group_quant_fp8(self, num_tokens, d, dtype, group_size, seed):
+        torch.manual_seed(seed)
+
+        x = torch.rand(num_tokens, d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size)
+            out, scale = per_token_group_quant_fp8(x, group_size)
+
+        self.assertTrue(
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.15)
+        )
+        self.assertTrue(torch.allclose(scale, ref_scale))
+
+    def test_per_token_group_quant_fp8(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.GROUP_SIZE,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                group_size=params[3],
+                seed=params[4],
+            ):
+                self._per_token_group_quant_fp8(*params)
+
+
+# For test
+def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+    """This function performs matrix multiplication with block-wise quantization using native torch.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    """
+
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N,)
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+    B_tiles = [
+        [
+            B[
+                j * block_n : min((j + 1) * block_n, N),
+                i * block_k : min((i + 1) * block_k, K),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+    C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+    As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+class TestW8A8BlockFP8Matmul(unittest.TestCase):
+    OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+    M = [1, 7, 83, 512, 2048]
+    N = [128, 512, 1024, 4096, 7748, 13824]
+    K = [256, 4096, 5120, 3884, 13824]
+    # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_matmul(self, M, N, K, block_size, out_dtype, seed):
+        torch.manual_seed(seed)
+        # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+        A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+        B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles = (N + block_n - 1) // block_n
+        k_tiles = (K + block_k - 1) // block_k
+
+        As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
+        Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
+
+        with torch.inference_mode():
+            ref_out = native_w8a8_block_fp8_matmul(
+                A_fp8, B_fp8, As, Bs, block_size, out_dtype
+            )
+            out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.001
+        )
+
+    def test_w8a8_block_fp8_matmul(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.BLOCK_SIZE,
+            self.OUT_DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                block_size=params[3],
+                out_dtype=params[4],
+                seed=params[5],
+            ):
+                self._w8a8_block_fp8_matmul(*params)
+
+
+# For test
+def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
+    """This function performs fused moe with block-wise quantization using native torch."""
+
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+
+    _, block_k = block_shape[0], block_shape[1]
+    a_q, a_s = native_per_token_group_quant_fp8(a, block_k)
+    # NOTE(HandH1998): Since "index_cuda" not implemented for 'Float8_e4m3fn', we need to cast `float8`` to `float32``.
+    a_q = a_q.to(torch.float32)
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            inter_out = native_w8a8_block_fp8_matmul(
+                a_q[mask], w1[i], a_s[mask], w1_s[i], block_shape, output_dtype=a.dtype
+            )
+            act_out = SiluAndMul().forward_native(inter_out)
+            act_out_q, act_out_s = native_per_token_group_quant_fp8(act_out, block_k)
+            act_out = act_out.to(torch.float32)
+            out[mask] = native_w8a8_block_fp8_matmul(
+                act_out_q, w2[i], act_out_s, w2_s[i], block_shape, output_dtype=a.dtype
+            )
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
+
+
+class TestW8A8BlockFP8FusedMoE(unittest.TestCase):
+    DTYPES = [torch.float32, torch.half, torch.bfloat16]
+    M = [1, 33, 64, 222, 1024 * 128]
+    N = [128, 1024, 2048]
+    K = [256, 4096, 5120]
+    E = [8, 24]
+    TOP_KS = [2, 6]
+    BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+    # BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_fused_moe(self, M, N, K, E, topk, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        a = torch.randn((M, K), dtype=dtype) / 10
+
+        w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w1 = w1_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w2 = w2_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles_w1 = (2 * N + block_n - 1) // block_n
+        n_tiles_w2 = (K + block_n - 1) // block_n
+        k_tiles_w1 = (K + block_k - 1) // block_k
+        k_tiles_w2 = (N + block_k - 1) // block_k
+
+        w1_s = (
+            torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+            * factor_for_scale
+        )
+        w2_s = (
+            torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+            * factor_for_scale
+        )
+
+        score = torch.randn((M, E), dtype=dtype)
+
+        with torch.inference_mode():
+            out = fused_moe(
+                a,
+                w1,
+                w2,
+                score,
+                topk,
+                renormalize=False,
+                use_fp8_w8a8=True,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                block_shape=block_size,
+            )
+            ref_out = torch_w8a8_block_fp8_moe(
+                a, w1, w2, w1_s, w2_s, score, topk, block_size
+            )
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.02
+        )
+
+    def test_w8a8_block_fp8_fused_moe(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.E,
+            self.TOP_KS,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                E=params[3],
+                topk=params[4],
+                block_size=params[5],
+                dtype=params[6],
+                seed=params[7],
+            ):
+                self._w8a8_block_fp8_fused_moe(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
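The new test module exercises the block-wise FP8 kernels added in `sglang/srt/layers/quantization/fp8_kernel.py`. A hedged, minimal sketch of the same comparison the first test performs, runnable on a CUDA GPU with `float8_e4m3fn` support (shapes and group size below are illustrative):

```python
# Hedged sketch mirroring TestPerTokenGroupQuantFP8: compare the Triton
# per-token-group FP8 quantizer against the pure-torch reference from the test
# file. Shapes and group size are illustrative; requires a CUDA device.
import torch

from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
from sglang.test.test_block_fp8 import native_per_token_group_quant_fp8

x = torch.rand(16, 512, dtype=torch.bfloat16, device="cuda")
x_q, x_s = per_token_group_quant_fp8(x, 128)              # Triton kernel path
ref_q, ref_s = native_per_token_group_quant_fp8(x, 128)   # reference path
assert torch.allclose(x_s, ref_s)
assert torch.allclose(x_q.to(torch.float32), ref_q.to(torch.float32), rtol=0.15)
```

Since the file ends with `unittest.main(verbosity=2)`, the whole suite can also be run directly with `python -m sglang.test.test_block_fp8`.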
sglang/test/test_utils.py
CHANGED
@@ -568,6 +568,7 @@ def run_bench_serving(
         disable_tqdm=False,
         disable_stream=disable_stream,
         disable_ignore_eos=False,
+        return_logprob=False,
         lora_name=None,
         extra_request_body=None,
         profile=None,
@@ -719,13 +720,13 @@ def run_and_check_memory_leak(
 
     # Clean up everything
     kill_process_tree(process.pid)
-    kill_process_tree(process.pid)
     stdout.close()
     stderr.close()
     if os.path.exists(STDOUT_FILENAME):
         os.remove(STDOUT_FILENAME)
     if os.path.exists(STDERR_FILENAME):
         os.remove(STDERR_FILENAME)
+    kill_process_tree(process.pid)
     t.join()
 
     # Assert success
@@ -733,7 +734,7 @@
     has_leak = False
     has_abort = False
     for line in output_lines:
-        if "
+        if "Uvicorn running" in line:
            has_new_server = True
        if "leak" in line:
            has_leak = True
sglang/utils.py
CHANGED
@@ -1,4 +1,4 @@
-"""Common utilities
+"""Common utilities"""
 
 import base64
 import gc
@@ -79,7 +79,14 @@ class HttpResponse:
         return self.resp.status
 
 
-def http_request(
+def http_request(
+    url,
+    json=None,
+    stream=False,
+    api_key=None,
+    verify=None,
+    method: Optional[str] = None,
+):
     """A faster version of requests.post with low-level urllib API."""
     headers = {"Content-Type": "application/json; charset=utf-8"}
 
@@ -90,7 +97,7 @@ def http_request(url, json=None, stream=False, api_key=None, verify=None):
     if stream:
         return requests.post(url, json=json, stream=True, headers=headers)
     else:
-        req = urllib.request.Request(url, headers=headers)
+        req = urllib.request.Request(url, headers=headers, method=method)
         if json is None:
             data = None
         else:
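With the widened signature, callers can choose the HTTP verb explicitly instead of relying on urllib's default (GET without a body, POST with one). A hedged sketch; the port and endpoint are illustrative and assume a locally running sglang server:

```python
# Hedged sketch: URL and endpoint are illustrative; assumes a local server.
from sglang.utils import http_request

# method=None keeps the old behaviour; an explicit verb is now passed to urllib.
resp = http_request("http://localhost:30000/get_model_info", method="GET")
print(resp.status)  # non-stream responses are wrapped by the HttpResponse class above
```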
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.
+__version__ = "0.4.1"
{sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.
+Version: 0.4.1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -215,6 +215,7 @@ Requires-Dist: requests
 Requires-Dist: tqdm
 Requires-Dist: numpy
 Requires-Dist: IPython
+Requires-Dist: setproctitle
 Provides-Extra: runtime-common
 Requires-Dist: aiohttp; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
@@ -232,16 +233,17 @@ Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
-Requires-Dist: torchao; extra == "runtime-common"
+Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar>=0.1.
+Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer
+Requires-Dist: flashinfer==0.1.6; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.2.post8; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -311,8 +313,11 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
-
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
+| [**Documentation**](https://sgl-project.github.io/)
+| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
+| [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
+| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
 - [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
@@ -353,7 +358,7 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).