sglang 0.5.1__py3-none-any.whl → 0.5.1.post2__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- sglang/srt/disaggregation/decode.py +4 -0
- sglang/srt/disaggregation/prefill.py +4 -0
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/tool.py +7 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +71 -89
- sglang/srt/layers/attention/utils.py +15 -94
- sglang/srt/layers/moe/cutlass_moe.py +0 -7
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +6 -2
- sglang/srt/layers/quantization/modelopt_quant.py +2 -2
- sglang/srt/lora/lora_manager.py +29 -12
- sglang/srt/managers/scheduler_metrics_mixin.py +15 -0
- sglang/srt/metrics/collector.py +5 -5
- sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- sglang/srt/models/grok.py +0 -4
- sglang/srt/offloader.py +115 -0
- sglang/srt/server_args.py +0 -4
- sglang/srt/utils.py +0 -7
- sglang/test/test_cutlass_moe.py +33 -28
- sglang/version.py +1 -1
- {sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/METADATA +4 -4
- {sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/RECORD +25 -24
- {sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
ADDED

```diff
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
```
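The new file is a tuned fused-MoE Triton kernel table for NVIDIA B200 (fp8_w8a8, block shape [128, 128]): keys are batch sizes M and values are Triton launch parameters. Below is a minimal sketch of how such a table can be consumed; this is not the sglang loader itself, and the `load_tuned_moe_config` helper and nearest-key policy are assumptions for illustration.

```python
import json

def load_tuned_moe_config(path: str, m: int) -> dict:
    """Pick the tuning entry whose batch-size key is closest to m."""
    with open(path) as f:
        configs = {int(k): v for k, v in json.load(f).items()}
    nearest = min(configs, key=lambda k: abs(k - m))
    return configs[nearest]

# e.g. m=100 would select the "96" entry above (BLOCK_SIZE_M=16, num_stages=3).
```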
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py
CHANGED

```diff
@@ -1,5 +1,7 @@
 import logging
 
+import torch
+
 from sglang.srt.utils import get_bool_env_var, get_device_sm
 
 logger = logging.getLogger(__name__)
@@ -7,8 +9,10 @@ logger = logging.getLogger(__name__)
 
 def _compute_enable_deep_gemm():
     sm_version = get_device_sm()
-
-
+    if sm_version < 90:
+        return False
+    # TODO fix deepgemm cu129 fp8 issue
+    if torch.version.cuda == "12.9":
         return False
 
     try:
```
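The hunk gates DeepGEMM off on pre-SM90 devices and on CUDA 12.9 builds (the fp8 issue is left as a TODO in the code). A hedged, standalone restatement of that gate follows; `deep_gemm_supported` is a hypothetical helper with the SM version passed in rather than read from `get_device_sm()`.

```python
import torch

def deep_gemm_supported(sm_version: int) -> bool:
    if sm_version < 90:  # DeepGEMM requires Hopper (SM90) or newer
        return False
    if torch.version.cuda == "12.9":  # mirrors the diff's TODO: deepgemm fp8 issue on cu129
        return False
    return True
```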
sglang/srt/layers/quantization/modelopt_quant.py
CHANGED

```diff
@@ -876,7 +876,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
                 data=torch.empty(
                     layer.num_local_experts,
                     2 * intermediate_size_per_partition,
-                    # 2 fp4 items are packed in the input dimension
                     hidden_size // self.quant_config.group_size,
                     dtype=weight_scale_dtype,
                 ),
@@ -895,7 +894,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
                 data=torch.empty(
                     layer.num_local_experts,
                     hidden_size,
-                    # 2 fp4 items are packed in the input dimension
                     intermediate_size_per_partition // self.quant_config.group_size,
                     dtype=weight_scale_dtype,
                 ),
@@ -1212,11 +1210,13 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
 
         # Process w13 weights
         w13_blockscale_swizzled = self.swizzle_blockscale(layer.w13_weight_scale)
+        del layer.w13_weight_scale
         layer.w13_blockscale_swizzled.data.copy_(w13_blockscale_swizzled)
         layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
 
         # Process w2 weights
         w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+        del layer.w2_weight_scale
         layer.w2_blockscale_swizzled.data.copy_(w2_blockscale_swizzled)
         layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
 
```
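The third hunk frees the unswizzled block scales as soon as they have been copied into their swizzled buffers, so the pre-swizzle copy does not linger on the GPU. A toy illustration of that pattern is below; the transpose stands in for `swizzle_blockscale`, and the layer and attribute names are made up.

```python
import torch
from torch import nn
from torch.nn import Parameter

class ToyLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.w_scale = Parameter(torch.randn(8, 8), requires_grad=False)
        self.w_scale_swizzled = Parameter(torch.empty(8, 8), requires_grad=False)

layer = ToyLayer()
layer.w_scale_swizzled.data.copy_(layer.w_scale.T.contiguous())  # stand-in for the swizzle
del layer.w_scale  # drop the pre-swizzle copy so its memory can be reclaimed
```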
sglang/srt/lora/lora_manager.py
CHANGED

```diff
@@ -420,20 +420,37 @@ class LoRAManager:
     ):
         """Infer LoRA target modules and max_lora_rank from loaded adapters if not provided."""
 
-
-
-
-
-
-
+        self.target_modules = (
+            get_normalized_target_modules(target_modules) if target_modules else set()
+        )
+
+        for lora_id, config in self.configs.items():
+            if not isinstance(config.target_modules, list):
+                raise ValueError(
+                    f"SGLang currently only supports inferring LoRA target modules when a list of "
+                    "suffixes is provided in `target_modules` field of PEFT config. Please explicitly "
+                    "specify `--lora-target-modules` during server startup. You can specify `all` to "
+                    "enable all support modules types. "
+                )
+
+            adapter_target_modules = get_normalized_target_modules(
+                config.target_modules
+            )
+
+            if target_modules is not None:
+                # When `--lora-target-modules` is provided, validate adapter target modules is a subset of the specified target modules.
+                if not adapter_target_modules.issubset(self.target_modules):
+                    unsupported_modules = adapter_target_modules - self.target_modules
+                    lora_name = self.lora_refs[lora_id].lora_name
                     raise ValueError(
-                        f"
-                        "
-                        "
-                        "enable all
+                        f"LoRA adapter '{lora_name}' contains target modules {sorted(unsupported_modules)} "
+                        f"that are not included in the specified --lora-target-modules {sorted(self.target_modules)}. "
+                        f"Please update --lora-target-modules to include all required modules: "
+                        f"{sorted(self.target_modules | adapter_target_modules)}, or use 'all' to enable all supported modules."
                     )
-
-
+            else:
+                # Otherwise, infer target_modules from adapter configs.
+                self.target_modules.update(adapter_target_modules)
 
         if max_lora_rank is not None:
             self.max_lora_rank = max_lora_rank
```
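The rewritten block normalizes each adapter's `target_modules`, then either validates them against an explicit `--lora-target-modules` set or unions them into the inferred set. Below is a condensed, hypothetical helper that captures just the validate-or-merge step; normalization via `get_normalized_target_modules` is assumed to have happened already.

```python
def validate_and_merge_target_modules(
    server_modules: set, adapter_modules: set, lora_name: str, explicit: bool
) -> set:
    """Reject adapters not covered by an explicit --lora-target-modules, else merge."""
    if explicit:
        if not adapter_modules.issubset(server_modules):
            missing = sorted(adapter_modules - server_modules)
            needed = sorted(server_modules | adapter_modules)
            raise ValueError(
                f"LoRA adapter '{lora_name}' contains target modules {missing} not covered "
                f"by --lora-target-modules; extend the flag to {needed} or pass 'all'."
            )
        return server_modules
    return server_modules | adapter_modules
```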
sglang/srt/managers/scheduler_metrics_mixin.py
CHANGED

```diff
@@ -125,6 +125,14 @@ class SchedulerMetricsMixin:
             total_queue_latency += req.queue_time_end - req.queue_time_start
             self.stats.avg_request_queue_latency = total_queue_latency / num_new_seq
 
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self.stats.num_prefill_prealloc_queue_reqs = len(
+                self.disagg_prefill_bootstrap_queue.queue
+            )
+            self.stats.num_prefill_inflight_queue_reqs = len(
+                self.disagg_prefill_inflight_queue
+            )
+
         self.metrics_collector.log_stats(self.stats)
         self._emit_kv_metrics()
         self._publish_kv_events()
@@ -202,6 +210,13 @@ class SchedulerMetricsMixin:
             self.stats.spec_accept_length = spec_accept_length
         self.stats.total_retracted_reqs = self.total_retracted_reqs
         self.metrics_collector.log_stats(self.stats)
+        if self.disaggregation_mode == DisaggregationMode.DECODE:
+            self.stats.num_decode_prealloc_queue_reqs = len(
+                self.disagg_decode_prealloc_queue.queue
+            )
+            self.stats.num_decode_transfer_queue_reqs = len(
+                self.disagg_decode_transfer_queue.queue
+            )
         self._emit_kv_metrics()
         self._publish_kv_events()
 
```
sglang/srt/metrics/collector.py
CHANGED

```diff
@@ -142,7 +142,7 @@ class SchedulerStats:
     spec_accept_length: float = 0.0
     avg_request_queue_latency: float = 0.0
     num_prefill_prealloc_queue_reqs: int = 0
-
+    num_prefill_inflight_queue_reqs: int = 0
     num_decode_prealloc_queue_reqs: int = 0
     num_decode_transfer_queue_reqs: int = 0
     total_retracted_reqs: int = 0
@@ -235,9 +235,9 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )
 
-        self.
-            name="sglang:
-            documentation="The number of requests in the prefill
+        self.num_prefill_inflight_queue_reqs = Gauge(
+            name="sglang:num_prefill_inflight_queue_reqs",
+            documentation="The number of requests in the prefill inflight queue.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
@@ -294,7 +294,7 @@ class SchedulerMetricsCollector:
             self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
         )
         self._log_gauge(
-            self.
+            self.num_prefill_inflight_queue_reqs, stats.num_prefill_inflight_queue_reqs
         )
         self._log_gauge(
             self.num_decode_prealloc_queue_reqs, stats.num_decode_prealloc_queue_reqs
```
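The gauge tracking the prefill in-flight queue is renamed to `num_prefill_inflight_queue_reqs` in both the `SchedulerStats` dataclass and the collector. A hedged sketch of how such a gauge is declared with `prometheus_client` is shown below; the `model_name` label is an assumption for illustration, since in sglang the label names come from the collector's `labels` dict.

```python
from prometheus_client import Gauge

num_prefill_inflight_queue_reqs = Gauge(
    name="sglang:num_prefill_inflight_queue_reqs",
    documentation="The number of requests in the prefill inflight queue.",
    labelnames=["model_name"],
    multiprocess_mode="mostrecent",
)
num_prefill_inflight_queue_reqs.labels(model_name="demo-model").set(3)
```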
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED

```diff
@@ -54,7 +54,7 @@ from sglang.srt.utils import (
     empty_context,
     get_available_gpu_memory,
     get_device_memory_capacity,
-
+    log_info_on_rank0,
     require_attn_tp_gather,
     require_gathered_buffer,
     require_mlp_sync,
@@ -267,7 +267,7 @@ class CudaGraphRunner:
 
         # Batch sizes to capture
         self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
-
+        log_info_on_rank0(logger, f"Capture cuda graph bs {self.capture_bs}")
         self.capture_forward_mode = ForwardMode.DECODE
         self.capture_hidden_mode = CaptureHiddenMode.NULL
         self.num_tokens_per_bs = 1
```
sglang/srt/models/grok.py
CHANGED

```diff
@@ -842,10 +842,6 @@ class Grok1ForCausalLM(nn.Module):
         if self.is_weights_presharded:
             setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights)
 
-        default_replicate_lm_head = False
-        self.replicate_lm_head = getattr(
-            config, "replicate_lm_head", default_replicate_lm_head
-        )
         self.replicate_embedding = getattr(config, "replicate_embedding", False)
 
         self.model = Grok1Model(
```
sglang/srt/offloader.py
CHANGED

```diff
@@ -321,6 +321,7 @@ class _BaseParamOffloader(ABC):
     @staticmethod
     def create(mode: str, **kwargs) -> "_BaseParamOffloader":
         return {
+            "meta": _MetaParamOffloader,
             "cpu": _CpuParamOffloader,
             "shm_cpu": _ShmCpuParamOffloader,
             "sharded_gpu": _ShardedGpuParamOffloader,
@@ -341,6 +342,17 @@ class _BaseParamOffloader(ABC):
         raise NotImplementedError
 
 
+class _MetaParamOffloader(_BaseParamOffloader):
+    """Usually used for debugging."""
+
+    def __init__(self, module, param_name):
+        super().__init__(module, param_name)
+        _move_param_to_meta(module, param_name)
+
+    def create_device_tensor(self):
+        return torch.empty_like(self._param.data, device="cuda")
+
+
 class _CpuParamOffloader(_BaseParamOffloader):
     def __init__(self, module, param_name):
         super().__init__(module, param_name)
@@ -431,3 +443,106 @@ def _empty_strided_like(x: torch.Tensor, device, pin_memory=False):
         device=device,
         pin_memory=pin_memory,
     )
+
+
+# ----------------------------------------- ShardedGpu ------------------------------------------------------
+
+
+# TODO unify with ShmCpu mode
+class _ShardedGpuParamOffloader(_BaseParamOffloader):
+    def __init__(self, module, param_name):
+        super().__init__(module, param_name)
+        self._rank = get_naive_distributed().get_rank()
+        self._world_size = get_naive_distributed().get_world_size()
+
+        from sglang.srt.distributed import get_tensor_model_parallel_world_size
+
+        assert get_tensor_model_parallel_world_size() == 1, "not yet support tp_size!=1"
+        assert (
+            self._param.data.is_contiguous()
+        ), f"not yet support non-contiguous tensor {self._param.shape=} {self._param.stride()=}"
+
+        if self._rank == 0:
+            _move_param_to_cpu(self._param, pin_memory=True)
+        else:
+            _move_param_to_meta(self._module, self._param_name)
+
+        self.sharded_param_handles = None
+
+    def post_init(self):
+        # check again since it may be changed
+        assert (
+            self._param.data.is_contiguous()
+        ), f"not yet support non-contiguous tensor {self._param.shape=} {self._param.stride()=}"
+
+        scatter_src = self._param.data
+
+        logger.info(
+            f"[offloader] post_init {scatter_src.nbytes=} {scatter_src.dtype=} {scatter_src.shape=} {torch.cuda.memory_allocated()=}"
+        )
+
+        if self._rank == 0:
+            scatter_src = scatter_src.to("cuda")
+        scatter_list = _even_chunk(scatter_src, self._world_size)
+
+        sharded_param = torch.empty(
+            scatter_list[0].shape, dtype=scatter_list[0].dtype, device="cuda"
+        )
+        self.sharded_param_handles = _create_shared_buffer_tensors(
+            local_tensor=sharded_param
+        )
+
+        get_naive_distributed().scatter(
+            sharded_param, scatter_list if self._rank == 0 else None
+        )
+
+        _move_param_to_meta(self._module, self._param_name)
+
+    def create_device_tensor(self):
+        output = _empty_strided_like(self._param, device="cuda")
+        output_chunks = output.chunk(self._world_size)
+
+        for index in range(self._world_size):
+            src_rank = (self._rank + index) % self._world_size
+            src_buf = self.sharded_param_handles[src_rank]
+            output_chunks[src_rank].copy_(src_buf)
+
+        return output
+
+
+def _even_chunk(x: torch.Tensor, chunks: int):
+    assert x.shape[0] % chunks == 0, f"{x.shape=} {chunks=}"
+    return list(x.chunk(chunks))
+
+
+def _create_shared_buffer_tensors(local_tensor: torch.Tensor) -> List[torch.Tensor]:
+    self_rank = get_naive_distributed().get_rank()
+    world_size = get_naive_distributed().get_world_size()
+
+    object_list = get_naive_distributed().all_gather_object(
+        dict(
+            dup_serialized_local_tensor=[
+                (
+                    None
+                    if interesting_rank == self_rank
+                    else MultiprocessingSerializer.serialize(local_tensor)
+                )
+                for interesting_rank in range(world_size)
+            ]
+        )
+    )
+
+    output_tensors = []
+    for output_rank in range(world_size):
+        remote_serialized_tensor = object_list[output_rank][
+            "dup_serialized_local_tensor"
+        ][self_rank]
+        if output_rank == self_rank:
+            assert remote_serialized_tensor is None
+            output_tensors.append(local_tensor)
+        else:
+            output_tensors.append(
+                MultiprocessingSerializer.deserialize(remote_serialized_tensor)
+            )
+
+    return output_tensors
```
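`_ShardedGpuParamOffloader.create_device_tensor` rebuilds the full parameter by copying each rank's shard into the matching chunk of a freshly allocated tensor. Below is a self-contained, single-process sketch of that reassembly step (no distributed setup; shapes and names are illustrative only).

```python
import torch

def reassemble(chunks: list) -> torch.Tensor:
    """Rebuild a tensor from equally sized row chunks, one per (pretend) rank."""
    full = torch.empty(
        sum(c.shape[0] for c in chunks), *chunks[0].shape[1:],
        dtype=chunks[0].dtype, device=chunks[0].device,
    )
    for rank, dst in enumerate(full.chunk(len(chunks))):
        dst.copy_(chunks[rank])  # copy each shard into its slot of the full tensor
    return full

parts = list(torch.arange(12.0).chunk(4))  # pretend four ranks each hold one shard
assert torch.equal(reassemble(parts), torch.arange(12.0))
```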
sglang/srt/server_args.py
CHANGED

```diff
@@ -639,10 +639,6 @@ class ServerArgs:
             logger.warning(
                 "DeepSeek MTP does not require setting speculative_draft_model_path."
             )
-        if self.page_size != 1 and self.attention_backend == "flashinfer":
-            raise ValueError(
-                "Speculative decoding with page_size != 1 is not supported. Please set page_size to 1."
-            )
 
         # Auto choose parameters
         if self.speculative_num_steps is None:
```
sglang/srt/utils.py
CHANGED

```diff
@@ -2002,13 +2002,6 @@ def configure_ipv6(dist_init_addr):
     return port, host
 
 
-def rank0_log(msg: str):
-    from sglang.srt.distributed import get_tensor_model_parallel_rank
-
-    if get_tensor_model_parallel_rank() == 0:
-        logger.info(msg)
-
-
 def launch_dummy_health_check_server(host, port, enable_metrics):
     import asyncio
 
```
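The removed `rank0_log` helper is superseded by `log_info_on_rank0(logger, msg)`, which the cuda_graph_runner.py hunk earlier in this diff imports from sglang.srt.utils. A minimal stand-in is sketched below; the real implementation may differ in detail.

```python
import logging

def log_info_on_rank0(logger: logging.Logger, msg: str) -> None:
    # Assumed behaviour: only tensor-parallel rank 0 emits the message.
    from sglang.srt.distributed import get_tensor_model_parallel_rank

    if get_tensor_model_parallel_rank() == 0:
        logger.info(msg)
```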
sglang/test/test_cutlass_moe.py
CHANGED

```diff
@@ -8,6 +8,15 @@ from transformers import AutoConfig
 
 from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+from sglang.srt.layers.moe.moe_runner.base import MoeRunnerConfig
+
+
+# Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
+def calc_diff(x, y):
+    x, y = x.double(), y.double()
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return 1 - sim
 
 
 def get_model_config(tp_size: int):
@@ -69,16 +78,11 @@ def run_test(tp_size, batch_size, model_config, check=False):
 
     # --- Input Data ---
     # Use bf16/fp16 for input activation based on model config
-    x = torch.randn((batch_size, H), device="cuda", dtype=dtype)
+    x = torch.randn((batch_size, H), device="cuda", dtype=dtype)
     # --- Weights (Generate in higher precision, then convert to FP8) ---
    # Generate weights suitable for FP8 conversion (e.g., scaled appropriately)
-    w1_hp = (
-
-    )
-    w2_hp = (
-        torch.randn((E, H, I // 2), device="cuda", dtype=torch.float32) * 0.00001
-        + 0.00001
-    )
+    w1_hp = torch.randn((E, I, H), device="cuda", dtype=torch.float32)
+    w2_hp = torch.randn((E, H, I // 2), device="cuda", dtype=torch.float32)
 
     w1 = to_fp8(w1_hp)
     w2 = to_fp8(w2_hp)
@@ -149,13 +153,13 @@ def run_test(tp_size, batch_size, model_config, check=False):
     )
 
     # Note: Triton expects non-transposed weights
+    moe_config = MoeRunnerConfig(inplace=False)
     triton_lambda = lambda: fused_experts(
         x,
         w1,
         w2,
         (topk_weights, topk_ids, "dummy"),
-
-        activation="silu",  # Assuming SiLU activation common in MoEs
+        moe_config,
         use_fp8_w8a8=True,
         w1_scale=w1_scale,
         w2_scale=w2_scale,
@@ -221,32 +225,19 @@ def run_test(tp_size, batch_size, model_config, check=False):
         w1,  # Original shape
         w2,  # Original shape
         (topk_weights, topk_ids, "dummy"),
-
-        activation="silu",
+        moe_config,
         use_fp8_w8a8=True,
         w1_scale=w1_scale,
         w2_scale=w2_scale,
         block_shape=block_shape,
     )
 
-
-
-    y_triton = y_triton.to(dtype)
-
-    abs_error = torch.abs(y_cutlass - y_triton)
-    rel_error = abs_error / torch.clamp(torch.abs(y_triton), min=1e-2)
-
-    max_abs_err = abs_error.max().item()
-    max_rel_err = rel_error.max().item()
-
-    print("y_cutlass:", y_cutlass[:, :10])
-    print("y_triton:", y_triton[:, :10])
-    print(f"Max absolute error: {max_abs_err:.6f}")
-    print(f"Max relative error: {max_rel_err:.6f}")
+    diff = calc_diff(y_cutlass, y_triton)
+    print(f"Diff: {diff:.6f}")
 
     # Tolerance might need adjustment based on FP8 specifics and kernel differences
     # FP8 comparisons often require higher tolerance than FP16/BF16
-    assert
+    assert diff < 1e-4, f"Diff too high! {diff}"
     print("Correctness check passed.")
 
 
@@ -264,7 +255,21 @@ if __name__ == "__main__":
         "--batch-sizes",
         type=int,
         nargs="+",
-        default=[
+        default=[
+            1,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            4096,
+            8192,
+        ],  # Adjusted default
         help="List of batch sizes to test",
     )
     parser.add_argument("--check", action="store_true", help="Enable check mode")
```
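The test now scores agreement with `calc_diff`, which returns 1 - 2*sum(x*y) / (sum(x^2) + sum(y^2)): 0.0 for identical tensors, growing as they diverge, and compared against a 1e-4 threshold. A quick CPU-only illustration of the metric's scale:

```python
import torch

def calc_diff(x, y):  # same definition as added to the test above
    x, y = x.double(), y.double()
    denominator = (x * x + y * y).sum()
    sim = 2 * (x * y).sum() / denominator
    return 1 - sim

a = torch.randn(64, 64)
print(float(calc_diff(a, a)))          # 0.0 for identical tensors
print(float(calc_diff(a, a * 1.001)))  # roughly 5e-7; a 0.1% scale error stays well under 1e-4
```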
sglang/version.py
CHANGED

```diff
@@ -1 +1 @@
-__version__ = "0.5.1"
+__version__ = "0.5.1.post2"
```
{sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.1
+Version: 0.5.1.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -262,7 +262,7 @@ Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: flashinfer_python==0.2.14.post1; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -270,7 +270,7 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: flashinfer_python==0.2.14.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -374,7 +374,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking. [
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf)).
 - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
```