sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
Files changed (99)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_serving.py +1 -1
  4. sglang/lang/interpreter.py +40 -1
  5. sglang/lang/ir.py +27 -0
  6. sglang/math_utils.py +8 -0
  7. sglang/srt/configs/model_config.py +6 -0
  8. sglang/srt/conversation.py +6 -0
  9. sglang/srt/disaggregation/base/__init__.py +1 -1
  10. sglang/srt/disaggregation/base/conn.py +25 -11
  11. sglang/srt/disaggregation/common/__init__.py +5 -1
  12. sglang/srt/disaggregation/common/utils.py +42 -0
  13. sglang/srt/disaggregation/decode.py +196 -51
  14. sglang/srt/disaggregation/fake/__init__.py +1 -1
  15. sglang/srt/disaggregation/fake/conn.py +15 -9
  16. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  17. sglang/srt/disaggregation/mooncake/conn.py +18 -13
  18. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  19. sglang/srt/disaggregation/nixl/conn.py +17 -12
  20. sglang/srt/disaggregation/prefill.py +128 -43
  21. sglang/srt/disaggregation/utils.py +127 -123
  22. sglang/srt/entrypoints/engine.py +15 -1
  23. sglang/srt/entrypoints/http_server.py +13 -2
  24. sglang/srt/eplb_simulator/__init__.py +1 -0
  25. sglang/srt/eplb_simulator/reader.py +51 -0
  26. sglang/srt/layers/activation.py +19 -0
  27. sglang/srt/layers/attention/aiter_backend.py +15 -2
  28. sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
  29. sglang/srt/layers/attention/flashattention_backend.py +53 -64
  30. sglang/srt/layers/attention/flashinfer_backend.py +1 -2
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
  32. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  33. sglang/srt/layers/attention/triton_backend.py +119 -119
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  35. sglang/srt/layers/attention/vision.py +51 -24
  36. sglang/srt/layers/communicator.py +23 -5
  37. sglang/srt/layers/linear.py +0 -4
  38. sglang/srt/layers/logits_processor.py +0 -12
  39. sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
  40. sglang/srt/layers/moe/ep_moe/layer.py +42 -32
  41. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
  42. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
  43. sglang/srt/layers/moe/topk.py +16 -8
  44. sglang/srt/layers/pooler.py +56 -0
  45. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  46. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
  47. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  49. sglang/srt/layers/quantization/fp8_kernel.py +44 -15
  50. sglang/srt/layers/quantization/fp8_utils.py +87 -22
  51. sglang/srt/layers/radix_attention.py +2 -3
  52. sglang/srt/lora/lora_manager.py +79 -34
  53. sglang/srt/lora/mem_pool.py +4 -5
  54. sglang/srt/managers/cache_controller.py +2 -1
  55. sglang/srt/managers/io_struct.py +28 -4
  56. sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
  57. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  58. sglang/srt/managers/schedule_batch.py +39 -6
  59. sglang/srt/managers/scheduler.py +73 -17
  60. sglang/srt/managers/tokenizer_manager.py +29 -2
  61. sglang/srt/mem_cache/chunk_cache.py +1 -0
  62. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  63. sglang/srt/mem_cache/memory_pool.py +111 -407
  64. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  65. sglang/srt/mem_cache/radix_cache.py +36 -12
  66. sglang/srt/model_executor/cuda_graph_runner.py +122 -55
  67. sglang/srt/model_executor/forward_batch_info.py +14 -5
  68. sglang/srt/model_executor/model_runner.py +6 -6
  69. sglang/srt/model_loader/loader.py +8 -1
  70. sglang/srt/models/bert.py +113 -13
  71. sglang/srt/models/deepseek_v2.py +113 -155
  72. sglang/srt/models/internvl.py +46 -102
  73. sglang/srt/models/roberta.py +117 -9
  74. sglang/srt/models/vila.py +305 -0
  75. sglang/srt/openai_api/adapter.py +162 -4
  76. sglang/srt/openai_api/protocol.py +37 -1
  77. sglang/srt/sampling/sampling_batch_info.py +24 -0
  78. sglang/srt/sampling/sampling_params.py +2 -0
  79. sglang/srt/server_args.py +318 -233
  80. sglang/srt/speculative/build_eagle_tree.py +1 -1
  81. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
  82. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
  83. sglang/srt/speculative/eagle_utils.py +389 -109
  84. sglang/srt/speculative/eagle_worker.py +134 -43
  85. sglang/srt/two_batch_overlap.py +4 -2
  86. sglang/srt/utils.py +58 -0
  87. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  88. sglang/test/runners.py +38 -3
  89. sglang/test/test_block_fp8.py +1 -0
  90. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  91. sglang/test/test_block_fp8_ep.py +1 -0
  92. sglang/test/test_utils.py +3 -1
  93. sglang/utils.py +9 -0
  94. sglang/version.py +1 -1
  95. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
  96. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
  97. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0

sglang/srt/layers/pooler.py
@@ -3,10 +3,13 @@
 
 from dataclasses import dataclass
 from enum import IntEnum
+from typing import Optional
 
 import torch
 import torch.nn as nn
+from transformers import PretrainedConfig
 
+from sglang.srt.layers.activation import get_cross_encoder_activation_function
 from sglang.srt.model_executor.model_runner import ForwardBatch
 
 
@@ -54,3 +57,56 @@ class Pooler(nn.Module):
             pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1)
 
         return EmbeddingPoolerOutput(embeddings=pooled_data)
+
+
+class CrossEncodingPooler(nn.Module):
+    """A layer that pools specific information from hidden states.
+
+    This layer does the following:
+    1. Extracts specific tokens or aggregates data based on pooling method.
+    2. Normalizes output if specified.
+    3. Returns structured results as `EmbeddingPoolerOutput`.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        classifier: nn.Module,
+        pooler: Optional[nn.Module] = None,
+    ):
+        super().__init__()
+        self.classifier = classifier
+        self.pooler = pooler
+        self.default_activation_function = get_cross_encoder_activation_function(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> EmbeddingPoolerOutput:
+        """Pools sentence pair scores from the hidden_states."""
+
+        prompt_lens = forward_batch.extend_seq_lens
+
+        offset = 0
+        pooled_data_lst = []
+        for prompt_len in prompt_lens:
+            pooled_data_i = hidden_states[offset : offset + prompt_len]
+
+            if self.pooler is not None:
+                final_shape_tensor = self.pooler(pooled_data_i, forward_batch)
+            else:
+                final_shape_tensor = self.classifier(pooled_data_i)
+
+            pooled_data_lst.append(final_shape_tensor)
+            offset += prompt_len
+
+        pooled_output = torch.stack(pooled_data_lst)
+
+        if self.pooler is not None:
+            # apply classifier once on the full batch if possible
+            pooled_output = self.classifier(pooled_output)
+
+        scores = self.default_activation_function(pooled_output).squeeze(-1)
+
+        return EmbeddingPoolerOutput(embeddings=scores)
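
For context, the new CrossEncodingPooler slices the packed hidden states back into per-request segments, pools or classifies each segment, stacks the results, and applies the model's cross-encoder activation to produce one relevance score per sentence pair. The snippet below is a minimal, self-contained sketch of that slicing-and-scoring pattern using plain torch on CPU; the toy CLS-token pooler, the linear classifier, and the sigmoid activation are illustrative stand-ins, not the sglang API.

# Illustrative sketch only -- toy stand-ins, not the sglang classes.
import torch
import torch.nn as nn

hidden_dim = 8
classifier = nn.Linear(hidden_dim, 1)   # stands in for the model's classification head
activation = nn.Sigmoid()               # stands in for get_cross_encoder_activation_function(config)


def cls_pooler(segment: torch.Tensor) -> torch.Tensor:
    """Toy pooler: keep the first ([CLS]) token of each request."""
    return segment[0]


# Packed hidden states of a batch whose requests have lengths 4, 2 and 5.
prompt_lens = [4, 2, 5]
hidden_states = torch.randn(sum(prompt_lens), hidden_dim)

offset = 0
pooled_per_request = []
for prompt_len in prompt_lens:
    segment = hidden_states[offset : offset + prompt_len]
    pooled_per_request.append(cls_pooler(segment))
    offset += prompt_len

pooled = torch.stack(pooled_per_request)              # (num_requests, hidden_dim)
scores = activation(classifier(pooled)).squeeze(-1)   # one score per sentence pair
print(scores.shape)                                   # torch.Size([3])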

sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py
@@ -0,0 +1 @@
+from .entrypoint import *

sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py}
@@ -5,34 +5,23 @@ from dataclasses import dataclass
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple
 
-import torch
 from tqdm.contrib.concurrent import thread_map
 
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+    DEEPGEMM_BLACKWELL,
+    ENABLE_JIT_DEEPGEMM,
+)
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_bool_env_var, get_device_sm, get_int_env_var, is_cuda
+from sglang.srt.utils import get_bool_env_var, get_int_env_var
 
 logger = logging.getLogger(__name__)
-_ENABLE_JIT_DEEPGEMM = False
 
-try:
-    import deep_gemm
+if ENABLE_JIT_DEEPGEMM and not DEEPGEMM_BLACKWELL:
     from deep_gemm import get_num_sms
     from deep_gemm.jit import build
-    from deep_gemm.jit.compiler import get_nvcc_compiler
     from deep_gemm.jit_kernels.gemm import get_best_configs
     from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType
 
-    sm_version = get_device_sm()
-    if sm_version == 90:
-        if get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true"):
-            _ENABLE_JIT_DEEPGEMM = True
-except ImportError:
-    logger.warning("Failed to import deepgemm, disable _ENABLE_JIT_DEEPGEMM.")
-
-
-def get_enable_jit_deepgemm():
-    return _ENABLE_JIT_DEEPGEMM
-
 
 _BUILTIN_M_LIST = list(range(1, 1024 * 16 + 1))
 _ENABLE_JIT_DEEPGEMM_PRECOMPILE = get_bool_env_var(
@@ -52,8 +41,10 @@ os.environ["DG_JIT_CACHE_DIR"] = os.getenv(
 # NVRTC may have performance loss with some cases.
 # And NVCC JIT speed is also 9x faster in the ref commit
 _USE_NVRTC_DEFAULT = "0"
-if _ENABLE_JIT_DEEPGEMM:
+if ENABLE_JIT_DEEPGEMM:
     try:
+        from deep_gemm.jit.compiler import get_nvcc_compiler
+
         get_nvcc_compiler()
     except:
         logger.warning(
@@ -114,11 +105,12 @@ class DeepGemmKernelHelper:
 _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dict()
 
 
+# TODO improve naming
 def _compile_warning_1():
     if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
         logger.warning(
             "Entering DeepGEMM JIT Pre-Compile session. "
-            "And it may takes a long time(Typically 10-20 mins) "
+            "It may takes a long time (typically 10-20 mins) "
            "if you have not run `sglang.compile_deep_gemm`. "
            "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
            " for pre-compilation to reduce the overhead if you have not run it before. "
@@ -127,6 +119,7 @@ def _compile_warning_1():
         )
 
 
+# TODO improve naming
 def _compile_warning_2():
     logger.warning(
         "Entering DeepGEMM JIT Single Kernel Compile session. "
@@ -238,6 +231,7 @@ def _compile_gemm_nt_f8f8bf16_one(
     _ = build("gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
+# TODO further refactor warmup-related
 _KERNEL_HELPER_DICT: Dict[DeepGemmKernelType, DeepGemmKernelHelper] = {
     DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED: DeepGemmKernelHelper(
         name="m_grouped_gemm_fp8_fp8_bf16_nt_masked",
@@ -270,7 +264,6 @@ def _maybe_compile_deep_gemm_one_type_all(
     num_groups: int,
     m_list: Optional[List[int]] = None,
 ) -> None:
-
     global _INITIALIZATION_DICT
     global _BUILTIN_M_LIST
 
@@ -304,56 +297,6 @@ def _maybe_compile_deep_gemm_one_type_all(
     thread_map(compile_func, collected_configs, max_workers=_COMPILE_WORKERS)
 
 
-def grouped_gemm_nt_f8f8bf16_masked(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-    masked_m: torch.Tensor,
-    expected_m: int,
-):
-    num_groups, _, k = lhs[0].shape
-    _, n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
-
-    with _log_jit_build(expected_m, n, k, kernel_type):
-        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
-            lhs, rhs, out, masked_m, expected_m
-        )
-
-
-def grouped_gemm_nt_f8f8bf16_contig(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-    m_indices: torch.Tensor,
-):
-    m, k = lhs[0].shape
-    num_groups, n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
-
-    with _log_jit_build(m, n, k, kernel_type):
-        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs, rhs, out, m_indices)
-
-
-def gemm_nt_f8f8bf16(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-):
-    m, k = lhs[0].shape
-    n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GEMM_NT_F8F8BF16
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, 1)
-
-    with _log_jit_build(m, n, k, kernel_type):
-        deep_gemm.gemm_fp8_fp8_bf16_nt(lhs, rhs, out)
-
-
 @contextmanager
 def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
     if _IN_PRECOMPILE_STAGE:
@@ -368,7 +311,8 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
         ret = origin_func(self, *args, **kwargs)
         if ret is None:
             kernel_helper = _KERNEL_HELPER_DICT[kernel_type]
-            _compile_warning_2()
+            if not DEEPGEMM_BLACKWELL:
+                _compile_warning_2()
             logger.warning(
                 f"DeepGEMM JIT Compiling for <{kernel_helper.name}> M={M}, N={N}, K={K}. Please wait."
             )
@@ -380,13 +324,12 @@
 
 
 @contextmanager
-def configure_deep_gemm_num_sms(num_sms):
-    if num_sms is None:
+def deep_gemm_execution_hook(
+    m: int, n: int, k: int, num_groups: int, kernel_type: DeepGemmKernelType
+):
+    # not supported yet
+    if not DEEPGEMM_BLACKWELL:
+        _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
+
+    with _log_jit_build(m, n, k, kernel_type):
         yield
-    else:
-        original_num_sms = deep_gemm.get_num_sms()
-        deep_gemm.set_num_sms(num_sms)
-        try:
-            yield
-        finally:
-            deep_gemm.set_num_sms(original_num_sms)
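
In this refactor the kernel wrapper functions and configure_deep_gemm_num_sms move out to the new entrypoint module, while compile_utils keeps the compilation bookkeeping behind a single deep_gemm_execution_hook context manager: pre-compile the requested kernel shape if needed, then warn if the wrapped call still falls into a slow JIT build. The sketch below only mirrors that "prepare once, then warn if slow" shape of the hook; the names and the timing heuristic are made up, and the real hook detects JIT builds by patching DeepGEMM's runtime cache rather than by timing the call.

# Illustrative sketch only -- made-up names; not the sglang implementation.
import logging
import time
from contextlib import contextmanager

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

_PREPARED_SHAPES = set()  # plays the role of _INITIALIZATION_DICT


def _maybe_prepare(shape):
    # Expensive one-time preparation (pre-compilation) keyed by kernel shape.
    if shape not in _PREPARED_SHAPES:
        _PREPARED_SHAPES.add(shape)


@contextmanager
def execution_hook(shape, slow_threshold_s=0.5):
    _maybe_prepare(shape)
    start = time.perf_counter()
    yield
    elapsed = time.perf_counter() - start
    if elapsed > slow_threshold_s:
        logger.warning("Kernel for shape %s took %.2fs (likely JIT compiled)", shape, elapsed)


with execution_hook((4096, 7168, 1)):
    pass  # the real wrappers launch the DeepGEMM kernel here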

sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py
@@ -0,0 +1,32 @@
+import logging
+
+from sglang.srt.utils import get_bool_env_var, get_device_sm
+
+logger = logging.getLogger(__name__)
+
+
+def _compute_enable_deep_gemm():
+    sm_version = get_device_sm()
+    if sm_version < 90:
+        return False
+
+    try:
+        import deep_gemm
+    except ImportError:
+        logger.warning("Failed to import deep_gemm, disable ENABLE_JIT_DEEPGEMM.")
+        return False
+
+    return get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true")
+
+
+ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
+
+try:
+    from deep_gemm import fp8_gemm_nt
+
+    # They have not given a name to this breaking change
+    DEEPGEMM_BLACKWELL = True
+except ImportError:
+    DEEPGEMM_BLACKWELL = False
+
+DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL
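
configurer.py centralizes the feature flags the rest of the wrapper relies on: ENABLE_JIT_DEEPGEMM (env var plus SM >= 90 plus a successful deep_gemm import) and DEEPGEMM_BLACKWELL, which is inferred from whether deep_gemm exposes the renamed fp8_gemm_nt entry point. The snippet below is a generic sketch of that probe pattern that runs even without deep_gemm installed; the helper names are illustrative, and it omits the GPU SM check the real configurer performs via get_device_sm().

# Illustrative sketch only -- helper names are made up; not the sglang implementation.
import importlib
import logging
import os

logger = logging.getLogger(__name__)


def _env_flag(name: str, default: str = "true") -> bool:
    return os.getenv(name, default).lower() in ("1", "true", "yes")


def _module_available(name: str) -> bool:
    try:
        importlib.import_module(name)
        return True
    except ImportError:
        logger.warning("Failed to import %s", name)
        return False


def _symbol_available(module_name: str, symbol: str) -> bool:
    return _module_available(module_name) and hasattr(
        importlib.import_module(module_name), symbol
    )


ENABLE_JIT_DEEPGEMM = _env_flag("SGL_ENABLE_JIT_DEEPGEMM") and _module_available("deep_gemm")
# The Blackwell-era deep_gemm API is detected by the presence of the renamed fp8_gemm_nt.
DEEPGEMM_BLACKWELL = _symbol_available("deep_gemm", "fp8_gemm_nt")
print(ENABLE_JIT_DEEPGEMM, DEEPGEMM_BLACKWELL)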

sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py
@@ -0,0 +1,110 @@
+import logging
+from contextlib import contextmanager
+from typing import Tuple
+
+import torch
+
+from sglang.srt.layers.quantization.deep_gemm_wrapper import compile_utils
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+    DEEPGEMM_BLACKWELL,
+    DEEPGEMM_SCALE_UE8M0,
+    ENABLE_JIT_DEEPGEMM,
+)
+from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+if ENABLE_JIT_DEEPGEMM:
+    import deep_gemm
+
+    if DEEPGEMM_BLACKWELL:
+        from deep_gemm import fp8_gemm_nt as _gemm_nt_f8f8bf16_raw
+        from deep_gemm import (
+            fp8_m_grouped_gemm_nt_masked as _grouped_gemm_nt_f8f8bf16_masked_raw,
+        )
+        from deep_gemm import (
+            m_grouped_fp8_gemm_nt_contiguous as _grouped_gemm_nt_f8f8bf16_contig_raw,
+        )
+    else:
+        from deep_gemm import gemm_fp8_fp8_bf16_nt as _gemm_nt_f8f8bf16_raw
+        from deep_gemm import get_col_major_tma_aligned_tensor
+        from deep_gemm import (
+            m_grouped_gemm_fp8_fp8_bf16_nt_contiguous as _grouped_gemm_nt_f8f8bf16_contig_raw,
+        )
+        from deep_gemm import (
+            m_grouped_gemm_fp8_fp8_bf16_nt_masked as _grouped_gemm_nt_f8f8bf16_masked_raw,
+        )
+
+
+def grouped_gemm_nt_f8f8bf16_masked(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    masked_m: torch.Tensor,
+    expected_m: int,
+    recipe=None,
+):
+    num_groups, _, k = lhs[0].shape
+    _, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
+
+    with compile_utils.deep_gemm_execution_hook(
+        expected_m, n, k, num_groups, kernel_type
+    ):
+        _grouped_gemm_nt_f8f8bf16_masked_raw(
+            lhs,
+            rhs,
+            out,
+            masked_m,
+            expected_m,
+            **({"recipe": recipe} if DEEPGEMM_BLACKWELL else {})
+        )
+
+
+def grouped_gemm_nt_f8f8bf16_contig(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    m_indices: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    num_groups, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        _grouped_gemm_nt_f8f8bf16_contig_raw(lhs, rhs, out, m_indices)
+
+
+def gemm_nt_f8f8bf16(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    n, _ = rhs[0].shape
+    num_groups = 1
+    kernel_type = compile_utils.DeepGemmKernelType.GEMM_NT_F8F8BF16
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        _gemm_nt_f8f8bf16_raw(
+            lhs,
+            rhs,
+            out,
+        )
+
+
+def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
+    compile_utils.update_deep_gemm_config(gpu_id, server_args)
+
+
+@contextmanager
+def configure_deep_gemm_num_sms(num_sms):
+    if num_sms is None:
+        yield
+    else:
+        original_num_sms = deep_gemm.get_num_sms()
+        deep_gemm.set_num_sms(num_sms)
+        try:
+            yield
+        finally:
+            deep_gemm.set_num_sms(original_num_sms)
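
entrypoint.py is the surface the rest of the codebase imports (via `from sglang.srt.layers.quantization import deep_gemm_wrapper`); it hides the Hopper-versus-Blackwell naming differences behind stable wrapper functions and carries configure_deep_gemm_num_sms. The sketch below demonstrates the save/set/restore pattern that context manager uses, with a dummy object standing in for the deep_gemm module so it runs without a GPU or deep_gemm installed.

# Illustrative sketch only -- _DummyLibrary stands in for deep_gemm's
# get_num_sms()/set_num_sms() pair.
from contextlib import contextmanager


class _DummyLibrary:
    def __init__(self, num_sms: int):
        self._num_sms = num_sms

    def get_num_sms(self) -> int:
        return self._num_sms

    def set_num_sms(self, value: int) -> None:
        self._num_sms = value


lib = _DummyLibrary(num_sms=132)


@contextmanager
def configure_num_sms(num_sms):
    if num_sms is None:
        yield  # nothing to change, nothing to restore
    else:
        original = lib.get_num_sms()
        lib.set_num_sms(num_sms)
        try:
            yield
        finally:
            lib.set_num_sms(original)  # always restore, even if the body raises


with configure_num_sms(64):
    assert lib.get_num_sms() == 64
assert lib.get_num_sms() == 132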

sglang/srt/layers/quantization/fp8_kernel.py
@@ -23,7 +23,8 @@ import torch
 import triton
 import triton.language as tl
 
-from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
+from sglang.math_utils import align
+from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.utils import (
     direct_register_custom_op,
     get_device_core_count,
@@ -44,10 +45,6 @@ if _is_cuda:
         sgl_per_token_quant_fp8,
     )
 
-    from sglang.srt.layers.quantization.deep_gemm import (
-        gemm_nt_f8f8bf16 as deep_gemm_gemm_nt_f8f8bf16,
-    )
-
 logger = logging.getLogger(__name__)
 
 
@@ -67,7 +64,6 @@ else:
     fp8_max = torch.finfo(fp8_dtype).max
     fp8_min = -fp8_max
 
-
 if supports_custom_op():
 
     def deep_gemm_fp8_fp8_bf16_nt(
@@ -77,7 +73,7 @@ if supports_custom_op():
         Bs: torch.Tensor,
         C: torch.Tensor,
     ) -> None:
-        deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+        deep_gemm_wrapper.gemm_nt_f8f8bf16((A, As), (B, Bs), C)
 
     def deep_gemm_fp8_fp8_bf16_nt_fake(
         A: torch.Tensor,
@@ -280,6 +276,7 @@ def sglang_per_token_group_quant_fp8(
     eps: float = 1e-10,
     column_major_scales: bool = False,
     scale_tma_aligned: bool = False,
+    scale_ue8m0: bool = False,
 ):
     assert (
         x.shape[-1] % group_size == 0
@@ -287,8 +284,21 @@
     assert x.is_contiguous(), "`x` is not contiguous"
 
     x_q = torch.empty_like(x, device=x.device, dtype=fp8_dtype)
-    if column_major_scales:
+    if scale_ue8m0:
+        assert column_major_scales and scale_tma_aligned
+        x_q_mn, x_q_k = x.shape
+        x_s_mn, x_s_k = x_q_mn, x_q_k // 128
+        aligned_mn = align(x_s_mn, 4)
+        aligned_k = align(x_s_k, 4)
+        # TODO(FIXME): Fix cuda kernel and recover here to empty.
+        x_s = torch.zeros(
+            (aligned_k // 4, aligned_mn),
+            device=x.device,
+            dtype=torch.int,
+        ).transpose(0, 1)[:x_s_mn, :]
+    elif column_major_scales:
         if scale_tma_aligned:
+            # TODO extract "align" function
             # aligned to 4 * sizeof(float)
             aligned_size = (x.shape[-2] + 3) // 4 * 4
             x_s = torch.empty(
@@ -309,7 +319,9 @@
             dtype=torch.float32,
         )
     if x.shape[0] > 0:
-        sgl_per_token_group_quant_fp8(x, x_q, x_s, group_size, eps, fp8_min, fp8_max)
+        sgl_per_token_group_quant_fp8(
+            x, x_q, x_s, group_size, eps, fp8_min, fp8_max, scale_ue8m0
+        )
 
     return x_q, x_s
 
@@ -754,7 +766,15 @@
     assert A.shape[-1] == B.shape[-1]
     assert A.shape[:-1] == As.shape[:-1]
     assert A.is_contiguous()
-    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+
+    if As.dtype == torch.float:
+        assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+    elif As.dtype == torch.int:
+        assert (
+            triton.cdiv(triton.cdiv(A.shape[-1], block_k), 4) == As.shape[-1]
+        ), f"{A.shape=} {As.shape=} {block_size=}"
+    else:
+        raise NotImplementedError
 
     M = A.numel() // A.shape[-1]
 
@@ -762,8 +782,17 @@
     assert B.is_contiguous()
     assert Bs.ndim == 2
     N, K = B.shape
-    assert triton.cdiv(N, block_n) == Bs.shape[0]
-    assert triton.cdiv(K, block_k) == Bs.shape[1]
+
+    if Bs.dtype == torch.float:
+        assert triton.cdiv(N, block_n) == Bs.shape[0]
+        assert triton.cdiv(K, block_k) == Bs.shape[1]
+    elif Bs.dtype == torch.int:
+        assert N == Bs.shape[0], f"{B.shape=} {Bs.shape=} {block_size=}"
+        assert (
+            triton.cdiv(triton.cdiv(K, block_k), 4) == Bs.shape[1]
+        ), f"{B.shape=} {Bs.shape=} {block_size=}"
+    else:
+        raise NotImplementedError
 
     C_shape = A.shape[:-1] + (N,)
     C = A.new_empty(C_shape, dtype=output_dtype)
@@ -782,12 +811,12 @@ def w8a8_block_fp8_matmul_deepgemm(
     M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
 
     # Deepgemm only supports output tensor type as bfloat16
-    assert C.dtype == torch.bfloat16 and _ENABLE_JIT_DEEPGEMM
+    assert C.dtype == torch.bfloat16 and deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
 
     if supports_custom_op():
         torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
     else:
-        deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+        deep_gemm_wrapper.gemm_nt_f8f8bf16((A, As), (B, Bs), C)
 
     return C
 
@@ -881,7 +910,7 @@ def w8a8_block_fp8_matmul(
     block_size: List[int],
     output_dtype: torch.dtype = torch.float16,
 ) -> torch.Tensor:
-    if output_dtype == torch.bfloat16 and _ENABLE_JIT_DEEPGEMM:
+    if output_dtype == torch.bfloat16 and deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
         return w8a8_block_fp8_matmul_deepgemm(
             A, B, As, Bs, block_size, output_dtype=output_dtype
         )
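
The relaxed assertions in prepare_block_fp8_matmul_inputs accept either float32 scales (one per block_k-wide block) or int32 scales in which four UE8M0 exponents are packed per element, shrinking the scale tensor's K dimension by another factor of four. A quick arithmetic check of the two expected shapes; the K and block_k values below are examples only.

# Illustrative arithmetic only -- K and block_k values are examples.
def cdiv(a: int, b: int) -> int:
    return -(-a // b)


K, block_k = 7168, 128

float32_scale_cols = cdiv(K, block_k)                 # one float32 scale per block
packed_int32_scale_cols = cdiv(cdiv(K, block_k), 4)   # four UE8M0 exponents per int32

print(float32_scale_cols, packed_int32_scale_cols)    # 56 14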

sglang/srt/layers/quantization/fp8_utils.py
@@ -1,9 +1,10 @@
-import os
-from curses import flash
 from typing import Callable, List, Optional, Tuple
 
+import einops
 import torch
 
+from sglang.math_utils import align
+from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
 from sglang.srt.layers.utils import is_sm100_supported
 
@@ -14,7 +15,6 @@ try:
 except ImportError:
     VLLM_AVAILABLE = False
 
-from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
 from sglang.srt.layers.quantization.fp8_kernel import (
     fp8_dtype,
     fp8_max,
@@ -137,7 +137,7 @@ def dispatch_w8a8_block_fp8_linear() -> Callable:
         return cutlass_w8a8_block_fp8_linear_with_fallback
     elif _use_aiter:
         return aiter_w8a8_block_fp8_linear
-    elif _ENABLE_JIT_DEEPGEMM:
+    elif deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
         return deepgemm_w8a8_block_fp8_linear_with_fallback
     else:
         return triton_w8a8_block_fp8_linear
@@ -238,7 +238,14 @@ def deepgemm_w8a8_block_fp8_linear_with_fallback(
         block_size[1],
         column_major_scales=True,
         scale_tma_aligned=True,
+        scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
     )
+
+    # NOTE(alcanderian): Useless when scale is packed to int32
+    # if get_bool_env_var("SGLANG_W8A8_DEEPGEMM_SANITY_CHECK_UE8M0"):
+    #     _check_ue8m0("x_scale", x_scale)
+    #     _check_ue8m0("weight_scale", ws)
+
     output = w8a8_block_fp8_matmul_deepgemm(
         q_input, weight, x_scale, weight_scale, block_size, output_dtype=output_dtype
     )
@@ -247,6 +254,11 @@
     return output.to(dtype=output_dtype).view(*output_shape)
 
 
+def _check_ue8m0(name, x):
+    x_ceil = ceil_to_ue8m0(x)
+    assert torch.all(x == x_ceil), f"{name=} {x=} {x_ceil=}"
+
+
 def aiter_w8a8_block_fp8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
@@ -369,27 +381,80 @@ def block_quant_dequant(
         The output is an unquantized tensor with dtype.
     """
     block_n, block_k = block_size[0], block_size[1]
-    n, k = x_q_block.shape
-    n_tiles = (n + block_n - 1) // block_n
-    k_tiles = (k + block_k - 1) // block_k
-    assert n_tiles == x_s.shape[0]
-    assert k_tiles == x_s.shape[1]
+    *_, n, k = x_q_block.shape
 
-    x_dq_block = torch.empty_like(x_q_block, dtype=dtype)
+    # ... n_scale k_scale -> ... (n_scale block_n) (k_scale block_k)
+    x_scale_repeat = x_s.repeat_interleave(block_n, dim=-2).repeat_interleave(
+        block_k, dim=-1
+    )
+    x_scale_repeat = x_scale_repeat[..., :n, :k]
+
+    return (x_q_block.to(torch.float32) * x_scale_repeat).to(dtype)
+
+
+def requant_weight_ue8m0_inplace(weight, weight_scale_inv, weight_block_size):
+    assert isinstance(weight, torch.nn.Parameter)
+    assert isinstance(weight_scale_inv, torch.nn.Parameter)
+    weight.data, weight_scale_inv.data = _requant_weight_ue8m0(
+        weight, weight_scale_inv, weight_block_size
+    )
+
+
+def _requant_weight_ue8m0(
+    weight: torch.Tensor,
+    weight_scale_inv: torch.Tensor,
+    weight_block_size: List[int],
+):
+    assert weight_block_size == [128, 128]
+
+    *_, n, k = weight.shape
+
+    weight_dequant = block_quant_dequant(
+        weight,
+        weight_scale_inv,
+        weight_block_size,
+        torch.bfloat16,
+    )
+
+    weight_dequant_flat = weight_dequant.view((-1, k))
+    out_w_flat, out_s_flat = per_block_cast_to_fp8(weight_dequant_flat)
+
+    out_w = out_w_flat.view(weight.shape)
+    out_s = out_s_flat.view(weight_scale_inv.shape)
+
+    # NOTE copy and modified from DeepGEMM
+    def _transform_scale(sf, mn: int):
+        import deep_gemm.utils.layout
+
+        sf = sf.index_select(-2, torch.arange(mn, device=sf.device) // 128)
+        sf = deep_gemm.utils.layout.get_col_major_tma_aligned_packed_tensor(sf)
+        return sf
+
+    out_s = _transform_scale(out_s, mn=out_w.shape[-2])
+
+    return out_w, out_s
+
+
+# COPIED FROM DeepGEMM
+def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
 
-    for j in range(n_tiles):
-        for i in range(k_tiles):
-            x_q_block_tile = x_q_block[
-                j * block_n : min((j + 1) * block_n, n),
-                i * block_k : min((i + 1) * block_k, k),
-            ]
-            x_dq_block_tile = x_dq_block[
-                j * block_n : min((j + 1) * block_n, n),
-                i * block_k : min((i + 1) * block_k, k),
-            ]
-            x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
 
-    return x_dq_block
+# COPIED FROM DeepGEMM
+def ceil_to_ue8m0(x: torch.Tensor):
+    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
 
 
 def channel_quant_to_tensor_quant(
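
ceil_to_ue8m0, copied from DeepGEMM, rounds each scale up to the nearest power of two so it can be stored as a bare 8-bit exponent (UE8M0: unsigned, 8 exponent bits, no mantissa), which is what the int32-packed scale layout above carries. A small, CPU-only demonstration of that rounding; the input values are examples only.

# Illustrative demo of the ceil_to_ue8m0 helper added above.
import torch


def ceil_to_ue8m0(x: torch.Tensor):
    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))


scales = torch.tensor([0.3, 1.0, 1.5, 100.0])
rounded = ceil_to_ue8m0(scales)
print(rounded)  # 0.5, 1.0, 2.0, 128.0 -- every value is an exact power of two
assert torch.allclose(torch.log2(rounded), torch.log2(rounded).round())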