PyPI - sglang - Versions diffs - 0.4.2.post4__py3-none-any.whl → 0.4.3__py3-none-any.whl - Mend

sglang 0.4.2.post4__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json CHANGED Viewed

@@ -1,61 +1,61 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "4": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "8": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "16": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "24": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "32": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 16,
         "num_warps": 4,
@@ -64,52 +64,52 @@
     },
     "48": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "64": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "96": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "128": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "256": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "512": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 32,
         "num_warps": 4,
@@ -117,28 +117,28 @@
         "waves_per_eu": 0
     },
     "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "1536": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2048": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -156,7 +156,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0

sglang/srt/layers/quantization/fp8_kernel.py CHANGED Viewed

@@ -27,6 +27,10 @@ from sglang.srt.utils import get_device_core_count, get_device_name, is_hip
 is_hip_ = is_hip()
 fp8_type_ = torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
+_is_cuda = torch.cuda.is_available() and torch.version.cuda
+if _is_cuda:
+    from sgl_kernel import sgl_per_token_group_quant_fp8
 logger = logging.getLogger(__name__)
@@ -72,11 +76,60 @@ def _per_token_group_quant_fp8(
     tl.store(y_s_ptr, y_s)
+@triton.jit
+def _per_token_group_quant_fp8_colmajor(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    group_size,
+    # Num columns of y
+    y_num_columns,
+    # Stride from one column to the next of y_s
+    y_s_col_stride,
+    # Avoid to divide zero
+    eps,
+    # Information for float8
+    fp8_min,
+    fp8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+):
+    """A Triton-accelerated function to perform per-token-group
+    quantization on a tensor.
+    This function converts the tensor values into float8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id * group_size
+    y_q_ptr += g_id * group_size
+    # Convert g_id the flattened block coordinate to 2D so we can index
+    # into the output y_scales matrix
+    blocks_per_row = y_num_columns // group_size
+    scale_col = g_id % blocks_per_row
+    scale_row = g_id // blocks_per_row
+    y_s_ptr += scale_col * y_s_col_stride + scale_row
+    cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
+    mask = cols < group_size
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    # Quant
+    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+    y_s = _absmax / fp8_max
+    y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    tl.store(y_s_ptr, y_s)
 def per_token_group_quant_fp8(
     x: torch.Tensor,
     group_size: int,
     eps: float = 1e-10,
     dtype: torch.dtype = fp8_type_,
+    column_major_scales: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Function to perform per-token-group quantization on an input tensor `x`.
@@ -108,30 +161,83 @@ def per_token_group_quant_fp8(
     x_q = torch.empty_like(x, device=x.device, dtype=dtype)
     M = x.numel() // group_size
     N = group_size
-    x_s = torch.empty(
-        x.shape[:-1] + (x.shape[-1] // group_size,),
-        device=x.device,
-        dtype=torch.float32,
-    )
+    if column_major_scales:
+        x_s = torch.empty(
+            (x.shape[-1] // group_size,) + x.shape[:-1],
+            device=x.device,
+            dtype=torch.float32,
+        ).permute(-1, -2)
+    else:
+        x_s = torch.empty(
+            x.shape[:-1] + (x.shape[-1] // group_size,),
+            device=x.device,
+            dtype=torch.float32,
+        )
     BLOCK = triton.next_power_of_2(N)
     # heuristics for number of warps
     num_warps = min(max(BLOCK // 256, 1), 8)
     num_stages = 1
-    _per_token_group_quant_fp8[(M,)](
-        x,
-        x_q,
-        x_s,
-        group_size,
-        N,
-        eps,
-        fp8_min=fp8_min,
-        fp8_max=fp8_max,
-        BLOCK=BLOCK,
-        num_warps=num_warps,
-        num_stages=num_stages,
+    if column_major_scales:
+        _per_token_group_quant_fp8_colmajor[(M,)](
+            x,
+            x_q,
+            x_s,
+            group_size,
+            x.shape[1],
+            x_s.stride(1),
+            eps,
+            fp8_min=fp8_min,
+            fp8_max=fp8_max,
+            BLOCK=BLOCK,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+    else:
+        _per_token_group_quant_fp8[(M,)](
+            x,
+            x_q,
+            x_s,
+            group_size,
+            N,
+            eps,
+            fp8_min=fp8_min,
+            fp8_max=fp8_max,
+            BLOCK=BLOCK,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+    return x_q, x_s
+def sglang_per_token_group_quant_fp8(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    dtype: torch.dtype = fp8_type_,
+):
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+    finfo = torch.finfo(dtype)
+    fp8_max = finfo.max
+    fp8_min = -fp8_max
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    M = x.numel() // group_size
+    N = group_size
+    x_s = torch.empty(
+        x.shape[:-1] + (x.shape[-1] // group_size,),
+        device=x.device,
+        dtype=torch.float32,
     )
+    sgl_per_token_group_quant_fp8(x, x_q, x_s, group_size, eps, fp8_min, fp8_max)
     return x_q, x_s

sglang/srt/layers/quantization/fp8_utils.py CHANGED Viewed

@@ -10,6 +10,9 @@ from sglang.srt.layers.quantization.fp8_kernel import (
 from sglang.srt.utils import is_hip
 is_hip_ = is_hip()
+_is_cuda = torch.cuda.is_available() and torch.version.cuda
+if _is_cuda:
+    from sgl_kernel import fp8_blockwise_scaled_mm
 def normalize_e4m3fn_to_e4m3fnuz(
@@ -36,6 +39,19 @@ def normalize_e4m3fn_to_e4m3fnuz(
     return weight, weight_scale, input_scale
+def cutlass_block_fp8_supported() -> bool:
+    if _is_cuda:
+        major, minor = torch.cuda.get_device_capability()
+        sm_version = major * 10 + minor
+        cuda_version = tuple(map(int, torch.version.cuda.split(".")))
+        if cuda_version >= (12, 0) and sm_version >= 90:
+            return True
+    return False
+CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported()
 def apply_w8a8_block_fp8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
@@ -48,11 +64,24 @@ def apply_w8a8_block_fp8_linear(
     # View input as 2D matrix for fp8 methods
     input_2d = input.view(-1, input.shape[-1])
     output_shape = [*input.shape[:-1], weight.shape[0]]
-    q_input, x_scale = per_token_group_quant_fp8(input_2d, block_size[1])
-    output = w8a8_block_fp8_matmul(
-        q_input, weight, x_scale, weight_scale, block_size, output_dtype=input.dtype
+    # TODO: add more robust shape check here
+    shape_supported_by_cutlass = (
+        weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
     )
+    if CUTLASS_BLOCK_FP8_SUPPORTED and shape_supported_by_cutlass:
+        q_input, x_scale = per_token_group_quant_fp8(
+            input_2d, block_size[1], column_major_scales=True
+        )
+        output = fp8_blockwise_scaled_mm(
+            q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype
+        )
+    else:
+        q_input, x_scale = per_token_group_quant_fp8(
+            input_2d, block_size[1], column_major_scales=False
+        )
+        output = w8a8_block_fp8_matmul(
+            q_input, weight, x_scale, weight_scale, block_size, output_dtype=input.dtype
+        )
     if bias is not None:
         output = output + bias

sglang/srt/managers/detokenizer_manager.py CHANGED Viewed

@@ -210,6 +210,7 @@ class DetokenizerManager:
                     input_top_logprobs_idx=recv_obj.input_top_logprobs_idx,
                     output_top_logprobs_val=recv_obj.output_top_logprobs_val,
                     output_top_logprobs_idx=recv_obj.output_top_logprobs_idx,
+                    output_hidden_states=recv_obj.output_hidden_states,
                 )
             )

sglang/srt/managers/io_struct.py CHANGED Viewed

@@ -371,6 +371,8 @@ class BatchTokenIDOut:
     output_top_logprobs_val: List[List]
     output_top_logprobs_idx: List[List]
+    output_hidden_states: List[List[float]]
 @dataclass
 class BatchStrOut:
@@ -397,6 +399,8 @@ class BatchStrOut:
     output_top_logprobs_val: List[List]
     output_top_logprobs_idx: List[List]
+    output_hidden_states: List[List[float]]
 @dataclass
 class BatchEmbeddingOut:

sglang/srt/managers/schedule_batch.py CHANGED Viewed

@@ -65,6 +65,7 @@ global_server_args_dict = {
     "enable_dp_attention": ServerArgs.enable_dp_attention,
     "enable_ep_moe": ServerArgs.enable_ep_moe,
     "device": ServerArgs.device,
+    "enable_flashinfer_mla": ServerArgs.enable_flashinfer_mla,
 }
 logger = logging.getLogger(__name__)
@@ -315,6 +316,7 @@ class Req:
             self.output_token_logprobs_val = self.output_token_logprobs_idx = (
                 self.output_top_logprobs_val
             ) = self.output_top_logprobs_idx = None
+        self.hidden_states = []
         # Logprobs (internal values)
         # The tokens is prefilled but need to be considered as decode tokens
@@ -604,6 +606,9 @@ class ScheduleBatch:
     # Enable custom logit processor
     enable_custom_logit_processor: bool = False
+    # Return hidden states
+    return_hidden_states: bool = False
     @classmethod
     def init_new(
         cls,
@@ -615,6 +620,7 @@ class ScheduleBatch:
         enable_overlap: bool,
         spec_algorithm: SpeculativeAlgorithm,
         enable_custom_logit_processor: bool,
+        return_hidden_states: bool = False,
     ):
         return cls(
             reqs=reqs,
@@ -629,6 +635,7 @@ class ScheduleBatch:
             device=req_to_token_pool.device,
             spec_algorithm=spec_algorithm,
             enable_custom_logit_processor=enable_custom_logit_processor,
+            return_hidden_states=return_hidden_states,
         )
     def batch_size(self):
@@ -1196,9 +1203,15 @@ class ScheduleBatch:
             spec_algorithm=self.spec_algorithm,
             spec_info=self.spec_info,
             capture_hidden_mode=(
-                getattr(self.spec_info, "capture_hidden_mode", CaptureHiddenMode.NULL)
-                if self.spec_info
-                else CaptureHiddenMode.NULL
+                CaptureHiddenMode.FULL
+                if self.return_hidden_states
+                else (
+                    getattr(
+                        self.spec_info, "capture_hidden_mode", CaptureHiddenMode.NULL
+                    )
+                    if self.spec_info
+                    else CaptureHiddenMode.NULL
+                )
             ),
         )

sglang/srt/managers/scheduler.py CHANGED Viewed

@@ -997,6 +997,7 @@ class Scheduler:
             self.enable_overlap,
             self.spec_algorithm,
             self.server_args.enable_custom_logit_processor,
+            self.server_args.return_hidden_states,
         )
         new_batch.prepare_for_extend()
@@ -1156,6 +1157,8 @@ class Scheduler:
                         logits_output.input_token_logprobs.tolist()
                     )
+            hidden_state_offset = 0
             # Check finish conditions
             logprob_pt = 0
             for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
@@ -1182,6 +1185,21 @@ class Scheduler:
                             i, req, logprob_pt, next_token_ids, logits_output
                         )
+                    if (
+                        self.server_args.return_hidden_states
+                        and logits_output.hidden_states is not None
+                    ):
+                        req.hidden_states.append(
+                            logits_output.hidden_states[
+                                hidden_state_offset : (
+                                    hidden_state_offset := hidden_state_offset
+                                    + len(req.origin_input_ids)
+                                )
+                            ]
+                            .cpu()
+                            .clone()
+                        )
                     if req.grammar is not None:
                         req.grammar.accept_token(next_token_id)
                         req.grammar.finished = req.finished()
@@ -1275,6 +1293,12 @@ class Scheduler:
                         logits_output.next_token_top_logprobs_idx[i]
                     )
+            if (
+                self.server_args.return_hidden_states
+                and logits_output.hidden_states is not None
+            ):
+                req.hidden_states.append(logits_output.hidden_states[i].cpu().clone())
             if req.grammar is not None:
                 req.grammar.accept_token(next_token_id)
                 req.grammar.finished = req.finished()
@@ -1398,6 +1422,7 @@ class Scheduler:
             completion_tokens = []
             cached_tokens = []
             spec_verify_ct = []
+            hidden_states = []
             if return_logprob:
                 input_token_logprobs_val = []
@@ -1464,6 +1489,8 @@ class Scheduler:
                         output_top_logprobs_val.append(req.output_top_logprobs_val)
                         output_top_logprobs_idx.append(req.output_top_logprobs_idx)
+                    hidden_states.append(req.hidden_states)
             # Send to detokenizer
             if rids:
                 self.send_to_detokenizer.send_pyobj(
@@ -1490,6 +1517,7 @@ class Scheduler:
                         input_top_logprobs_idx,
                         output_top_logprobs_val,
                         output_top_logprobs_idx,
+                        hidden_states,
                     )
                 )
         else:  # embedding or reward model
@@ -1553,6 +1581,7 @@ class Scheduler:
             self.enable_overlap,
             self.spec_algorithm,
             self.server_args.enable_custom_logit_processor,
+            self.server_args.return_hidden_states,
         )
         idle_batch.prepare_for_idle()
         return idle_batch

sglang/srt/managers/tokenizer_manager.py CHANGED Viewed

@@ -796,6 +796,12 @@ class TokenizerManager:
                     }
                 )
+            if (
+                hasattr(recv_obj, "output_hidden_states")
+                and len(recv_obj.output_hidden_states[i]) > 0
+            ):
+                meta_info["hidden_states"] = recv_obj.output_hidden_states[i]
             if isinstance(recv_obj, BatchStrOut):
                 out_dict = {
                     "text": recv_obj.output_strs[i],

sglang/srt/managers/tp_worker_overlap_thread.py CHANGED Viewed

@@ -156,6 +156,10 @@ class TpModelWorkerClient:
                     logits_output.input_token_logprobs = (
                         logits_output.input_token_logprobs.to("cpu", non_blocking=True)
                     )
+            if logits_output.hidden_states is not None:
+                logits_output.hidden_states = logits_output.hidden_states.to(
+                    "cpu", non_blocking=True
+                )
             next_token_ids = next_token_ids.to("cpu", non_blocking=True)
             copy_done.record()

sglang/srt/model_executor/cuda_graph_runner.py CHANGED Viewed

@@ -33,6 +33,9 @@ from sglang.srt.model_executor.forward_batch_info import (
     ForwardBatch,
     ForwardMode,
 )
+from sglang.srt.utils import is_hip
+is_hip_ = is_hip()
 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
@@ -129,6 +132,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
         if bs <= model_runner.req_to_token_pool.size
         and bs <= server_args.cuda_graph_max_bs
     ]
+    if is_hip_:
+        capture_bs += [i * 8 for i in range(21, 33)]
     compile_bs = (
         [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
         if server_args.enable_torch_compile
@@ -349,7 +354,13 @@ class CudaGraphRunner:
             spec_algorithm=self.model_runner.spec_algorithm,
             spec_info=spec_info,
             capture_hidden_mode=(
-                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
+                CaptureHiddenMode.FULL
+                if self.model_runner.server_args.return_hidden_states
+                else (
+                    spec_info.capture_hidden_mode
+                    if spec_info
+                    else CaptureHiddenMode.NULL
+                )
             ),
         )

sglang 0.4.2.post4__py3-none-any.whl → 0.4.3__py3-none-any.whl

sglang 0.4.2.post4__py3-none-any.whl → 0.4.3__py3-none-any.whl