sglang 0.4.5.post2__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. sglang/bench_serving.py +3 -2
  2. sglang/compile_deep_gemm.py +136 -0
  3. sglang/lang/backend/openai.py +5 -1
  4. sglang/lang/backend/runtime_endpoint.py +5 -1
  5. sglang/srt/configs/model_config.py +4 -1
  6. sglang/srt/constrained/xgrammar_backend.py +1 -0
  7. sglang/srt/disaggregation/decode.py +43 -0
  8. sglang/srt/disaggregation/mini_lb.py +69 -8
  9. sglang/srt/disaggregation/mooncake/conn.py +1 -1
  10. sglang/srt/disaggregation/nixl/__init__.py +1 -0
  11. sglang/srt/disaggregation/nixl/conn.py +622 -0
  12. sglang/srt/disaggregation/prefill.py +100 -16
  13. sglang/srt/disaggregation/utils.py +17 -0
  14. sglang/srt/entrypoints/engine.py +4 -0
  15. sglang/srt/entrypoints/http_server.py +3 -7
  16. sglang/srt/function_call_parser.py +60 -0
  17. sglang/srt/layers/activation.py +2 -2
  18. sglang/srt/layers/attention/flashattention_backend.py +781 -150
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -5
  21. sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
  22. sglang/srt/layers/dp_attention.py +1 -1
  23. sglang/srt/layers/layernorm.py +19 -4
  24. sglang/srt/layers/moe/ep_moe/layer.py +2 -0
  25. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  26. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  27. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
  28. sglang/srt/layers/quantization/deep_gemm.py +378 -0
  29. sglang/srt/layers/quantization/fp8_kernel.py +7 -38
  30. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  31. sglang/srt/layers/quantization/gptq.py +13 -7
  32. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  33. sglang/srt/layers/quantization/w8a8_int8.py +3 -3
  34. sglang/srt/layers/rotary_embedding.py +6 -6
  35. sglang/srt/layers/sampler.py +2 -2
  36. sglang/srt/managers/data_parallel_controller.py +7 -1
  37. sglang/srt/managers/io_struct.py +14 -3
  38. sglang/srt/managers/schedule_batch.py +13 -0
  39. sglang/srt/managers/scheduler.py +16 -6
  40. sglang/srt/managers/tokenizer_manager.py +115 -29
  41. sglang/srt/managers/tp_worker.py +1 -0
  42. sglang/srt/mem_cache/hiradix_cache.py +40 -32
  43. sglang/srt/mem_cache/memory_pool.py +31 -13
  44. sglang/srt/model_executor/cuda_graph_runner.py +13 -8
  45. sglang/srt/model_executor/model_runner.py +19 -4
  46. sglang/srt/models/deepseek_v2.py +9 -6
  47. sglang/srt/models/minicpm3.py +2 -2
  48. sglang/srt/models/minicpmo.py +17 -6
  49. sglang/srt/openai_api/adapter.py +71 -4
  50. sglang/srt/openai_api/protocol.py +6 -1
  51. sglang/srt/server_args.py +52 -40
  52. sglang/srt/speculative/build_eagle_tree.py +2 -2
  53. sglang/srt/speculative/eagle_utils.py +2 -2
  54. sglang/srt/speculative/eagle_worker.py +2 -7
  55. sglang/srt/utils.py +46 -5
  56. sglang/test/test_utils.py +3 -1
  57. sglang/version.py +1 -1
  58. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +3 -3
  59. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +62 -57
  60. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +0 -0
  61. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
  62. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
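A notable addition in this release is a DeepGEMM JIT pre-compilation path: a new `sglang.compile_deep_gemm` entry point (`sglang/compile_deep_gemm.py`, file 2 above) plus the new `sglang/srt/layers/quantization/deep_gemm.py` module (file 28), whose full contents appear in the first hunk below. Going by the command quoted in that module's own warning messages, pre-compilation is run with the same arguments as `sglang.launch_server`, for example:

    python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code

The remaining hunks shown here are mostly supporting work: `fp8_kernel.py` and `fp8_utils.py` switch from a local `_enable_jit_deepgemm` flag to the `_ENABLE_JIT_DEEPGEMM` flag exported by the new module, several files pick up the `is_cuda_available` to `is_cuda` rename, `gptq.py` factors Marlin-format detection into a shared `check_marlin_format` helper, and the data parallel controller launches scheduler subprocesses under the torch memory saver adapter.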
sglang/srt/layers/quantization/deep_gemm.py (new file)
@@ -0,0 +1,378 @@
+ import logging
+ import os
+ from contextlib import contextmanager
+ from dataclasses import dataclass
+ from enum import IntEnum, auto
+ from typing import Callable, Dict, List, Optional, Tuple
+
+ import torch
+ from tqdm.contrib.concurrent import thread_map
+
+ from sglang.srt.server_args import ServerArgs
+ from sglang.srt.utils import get_bool_env_var, get_device_sm, get_int_env_var, is_cuda
+
+ _ENABLE_JIT_DEEPGEMM = False
+ if is_cuda():
+     import deep_gemm
+     from deep_gemm import get_num_sms
+     from deep_gemm.jit_kernels.gemm import get_best_configs
+     from deep_gemm.jit_kernels.gemm import includes as deep_gemm_includes
+     from deep_gemm.jit_kernels.gemm import template as deep_gemm_gemm_template
+     from deep_gemm.jit_kernels.m_grouped_gemm import (
+         template as deep_gemm_grouped_gemm_template,
+     )
+     from deep_gemm.jit_kernels.tuner import jit_tuner
+
+     sm_version = get_device_sm()
+     if sm_version == 90:
+         if get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="false"):
+             _ENABLE_JIT_DEEPGEMM = True
+
+ logger = logging.getLogger(__name__)
+
+ _BUILTIN_M_LIST = list(range(1, 1024 * 16 + 1))
+ _ENABLE_JIT_DEEPGEMM_PRECOMPILE = get_bool_env_var(
+     "SGL_JIT_DEEPGEMM_PRECOMPILE", "true"
+ )
+ _DO_COMPILE = get_bool_env_var("SGL_IS_FIRST_RANK_ON_NODE", "true")
+ _COMPILE_WORKERS = get_int_env_var("SGL_JIT_DEEPGEMM_COMPILE_WORKERS", 4)
+ _IN_PRE_COMPILE_STAGE = get_bool_env_var("SGL_IN_DEEP_GEMM_PRE_COMPILE_STAGE", "false")
+
+ # Force redirect deep_gemm cache_dir
+ os.environ["DG_CACHE_DIR"] = os.getenv(
+     "SGL_DG_CACHE_DIR", os.path.expanduser("~") + "/.cache/deep_gemm"
+ )
+
+
+ def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
+     global _BUILTIN_M_LIST
+     global _DO_COMPILE
+
+     # Generate m_max
+     m_max = 1024 * 16
+     if server_args.chunked_prefill_size < 1:
+         m_max = 1024 * 64
+     elif server_args.chunked_prefill_size > 8192:
+         m_max = server_args.chunked_prefill_size * 2
+     m_max = min(1024 * 128, m_max)
+     _BUILTIN_M_LIST = list(range(1, m_max + 1))
+
+     # Check if this is the first rank on the node
+     _DO_COMPILE = ServerArgs.base_gpu_id == gpu_id
+
+
+ class DeepGemmKernelType(IntEnum):
+     GROUPED_GEMM_NT_F8F8BF16_MASKED = auto()
+     GROUPED_GEMM_NT_F8F8BF16_CONTIG = auto()
+     GEMM_NT_F8F8BF16 = auto()
+
+
+ @dataclass
+ class DeepGemmKernelHelper:
+     name: str
+     compile_func: Callable[
+         [
+             int,
+             int,
+             int,
+             Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
+         ],
+         None,
+     ]
+     configure_func: Callable[
+         [int, int, int, int, int],
+         Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
+     ]
+
+
+ _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dict()
+
+
+ def _compile_warning_1():
+     if not _IN_PRE_COMPILE_STAGE:
+         logger.warning(
+             "Entering DeepGEMM JIT Pre-Compile session. "
+             "It may take a long time (typically 10-20 mins) "
+             "if you have not run `sglang.compile_deep_gemm` before. "
+             "It is recommended to run `sglang.compile_deep_gemm` with the same args as `sglang.launch_server`"
+             " for pre-compilation to reduce this overhead. "
+             "For example: "
+             "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
+         )
+
+
+ def _compile_warning_2():
+     logger.warning(
+         "Entering DeepGEMM JIT Single Kernel Compile session. "
+         "This will make inference throughput flaky. "
+         "Please run `sglang.compile_deep_gemm` with the same args as `sglang.launch_server`"
+         " for pre-compilation to avoid this issue. "
+         "For example: "
+         "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
+     )
+
+
+ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
+     n: int,
+     k: int,
+     num_groups: int,
+     config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
+ ) -> None:
+     # Auto-tuning with compilation
+     global deep_gemm_includes, deep_gemm_grouped_gemm_template
+     _, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
+     _ = jit_tuner.compile_and_tune(
+         name="m_grouped_gemm_fp8_fp8_bf16_nt",
+         keys={
+             "N": n,
+             "K": k,
+             "BLOCK_M": block_m,
+             "BLOCK_N": block_n,
+             "SWIZZLE_D_MODE": smem_config[1],
+             "BLOCK_N_PADDING": smem_config[2],
+             "NUM_GROUPS": num_groups,
+             "NUM_STAGES": num_stages,
+             "NUM_TMA_MULTICAST": tma_multicast_config[0],
+             "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
+             "GEMM_TYPE": "GroupedMasked",
+         },
+         space=(),
+         includes=deep_gemm_includes,
+         arg_defs=(
+             ("lhs", torch.float8_e4m3fn),
+             ("lhs_scales", torch.float),
+             ("rhs", torch.float8_e4m3fn),
+             ("rhs_scales", torch.float),
+             ("out", torch.bfloat16),
+             ("grouped_layout", torch.int32),
+             ("m", int),
+             ("stream", torch.cuda.Stream),
+             ("num_sms", int),
+             ("smem_size", int),
+         ),
+         template=deep_gemm_grouped_gemm_template,
+         args=[],
+     )
+
+
+ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
+     n: int,
+     k: int,
+     num_groups: int,
+     config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
+ ) -> None:
+     global deep_gemm_includes, deep_gemm_grouped_gemm_template
+     _, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
+     _ = jit_tuner.compile_and_tune(
+         name="m_grouped_gemm_fp8_fp8_bf16_nt",
+         keys={
+             "N": n,
+             "K": k,
+             "BLOCK_M": block_m,
+             "BLOCK_N": block_n,
+             "SWIZZLE_D_MODE": smem_config[1],
+             "BLOCK_N_PADDING": smem_config[2],
+             "NUM_GROUPS": num_groups,
+             "NUM_STAGES": num_stages,
+             "NUM_TMA_MULTICAST": tma_multicast_config[0],
+             "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
+             "GEMM_TYPE": "GroupedContiguous",
+         },
+         space=(),
+         includes=deep_gemm_includes,
+         arg_defs=(
+             ("lhs", torch.float8_e4m3fn),
+             ("lhs_scales", torch.float),
+             ("rhs", torch.float8_e4m3fn),
+             ("rhs_scales", torch.float),
+             ("out", torch.bfloat16),
+             ("grouped_layout", torch.int32),
+             ("m", int),
+             ("num_groups", int),
+             ("stream", torch.cuda.Stream),
+             ("num_sms", int),
+             ("smem_size", int),
+         ),
+         template=deep_gemm_grouped_gemm_template,
+         args=[],
+     )
+
+
+ def _compile_gemm_nt_f8f8bf16_one(
+     n: int,
+     k: int,
+     _: int,  # _ is a dummy parameter to align with other interfaces
+     config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
+ ) -> None:
+     global deep_gemm_includes, deep_gemm_gemm_template
+     _, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
+     _ = jit_tuner.compile_and_tune(
+         name="gemm_fp8_fp8_bf16_nt",
+         keys={
+             "N": n,
+             "K": k,
+             "BLOCK_M": block_m,
+             "BLOCK_N": block_n,
+             "SWIZZLE_D_MODE": smem_config[1],
+             "BLOCK_N_PADDING": smem_config[2],
+             "NUM_STAGES": num_stages,
+             "NUM_TMA_MULTICAST": tma_multicast_config[0],
+             "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
+         },
+         space=(),
+         includes=deep_gemm_includes,
+         arg_defs=(
+             ("lhs", torch.float8_e4m3fn),
+             ("lhs_scales", torch.float),
+             ("rhs", torch.float8_e4m3fn),
+             ("rhs_scales", torch.float),
+             ("out", torch.bfloat16),
+             ("m", int),
+             ("stream", torch.cuda.Stream),
+             ("num_sms", int),
+             ("smem_size", int),
+         ),
+         template=deep_gemm_gemm_template,
+         args=[],
+     )
+
+
+ _KERNEL_HELPER_DICT: Dict[DeepGemmKernelType, DeepGemmKernelHelper] = {
+     DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED: DeepGemmKernelHelper(
+         name="m_grouped_gemm_fp8_fp8_bf16_nt_masked",
+         compile_func=_compile_grouped_gemm_nt_f8f8bf16_masked_one,
+         configure_func=lambda m, n, k, num_groups, num_sms: get_best_configs(
+             m, n, k, num_groups, num_sms, is_grouped_masked=True
+         ),
+     ),
+     DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG: DeepGemmKernelHelper(
+         name="m_grouped_gemm_fp8_fp8_bf16_nt_contiguous",
+         compile_func=_compile_grouped_gemm_nt_f8f8bf16_contig_one,
+         configure_func=lambda m, n, k, _, num_sms: get_best_configs(
+             m, n, k, 1, num_sms, is_grouped_contiguous=True
+         ),
+     ),
+     DeepGemmKernelType.GEMM_NT_F8F8BF16: DeepGemmKernelHelper(
+         name="gemm_fp8_fp8_bf16_nt",
+         compile_func=_compile_gemm_nt_f8f8bf16_one,
+         configure_func=lambda m, n, k, _, num_sms: get_best_configs(
+             m, n, k, 1, num_sms
+         ),
+     ),
+ }
+
+
+ def _maybe_compile_deep_gemm_one_type_all(
+     kernel_type: DeepGemmKernelType,
+     n: int,
+     k: int,
+     num_groups: int,
+     m_list: Optional[List[int]] = None,
+ ) -> None:
+
+     global _INITIALIZATION_DICT
+     global _BUILTIN_M_LIST
+
+     query_key = (kernel_type, n, k, num_groups)
+     if (
+         _ENABLE_JIT_DEEPGEMM_PRECOMPILE
+         and _DO_COMPILE
+         and _INITIALIZATION_DICT.get(query_key) is None
+     ):
+         _INITIALIZATION_DICT[query_key] = True
+
+         kernel_helper = _KERNEL_HELPER_DICT[kernel_type]
+         _compile_warning_1()
+         logger.info(
+             f"Try DeepGEMM JIT Compiling for "
+             f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
+             f"{' It only takes a little time (typically 1 sec) if you have run `sglang.compile_deep_gemm`. ' if not _IN_PRE_COMPILE_STAGE else ''}"
+         )
+
+         # NOTE(alcanderian): get_num_sms should be changed when 2-batch-overlap is introduced
+         num_sms = get_num_sms()
+         collected_configs = set()
+         for m in m_list if m_list is not None else _BUILTIN_M_LIST:
+             # Put config into set to get unique configs and reduce cases to be compiled
+             collected_configs.add(
+                 kernel_helper.configure_func(m, n, k, num_groups, num_sms)
+             )
+         compile_func = lambda config: kernel_helper.compile_func(
+             n, k, num_groups, config
+         )
+         thread_map(compile_func, collected_configs, max_workers=_COMPILE_WORKERS)
+
+
+ def grouped_gemm_nt_f8f8bf16_masked(
+     lhs: Tuple[torch.Tensor, torch.Tensor],
+     rhs: Tuple[torch.Tensor, torch.Tensor],
+     out: torch.Tensor,
+     masked_m: torch.Tensor,
+     expected_m: int,
+ ):
+     num_groups, _, k = lhs[0].shape
+     _, n, _ = rhs[0].shape
+
+     kernel_type = DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
+     _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
+
+     with _log_jit_build(expected_m, n, k, kernel_type):
+         deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
+             lhs, rhs, out, masked_m, expected_m
+         )
+
+
+ def grouped_gemm_nt_f8f8bf16_contig(
+     lhs: Tuple[torch.Tensor, torch.Tensor],
+     rhs: Tuple[torch.Tensor, torch.Tensor],
+     out: torch.Tensor,
+     m_indices: torch.Tensor,
+ ):
+     m, k = lhs[0].shape
+     num_groups, n, _ = rhs[0].shape
+
+     kernel_type = DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
+     _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
+
+     with _log_jit_build(m, n, k, kernel_type):
+         deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs, rhs, out, m_indices)
+
+
+ def gemm_nt_f8f8bf16(
+     lhs: Tuple[torch.Tensor, torch.Tensor],
+     rhs: Tuple[torch.Tensor, torch.Tensor],
+     out: torch.Tensor,
+ ):
+     m, k = lhs[0].shape
+     n, _ = rhs[0].shape
+
+     kernel_type = DeepGemmKernelType.GEMM_NT_F8F8BF16
+     _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, 1)
+
+     with _log_jit_build(m, n, k, kernel_type):
+         deep_gemm.gemm_fp8_fp8_bf16_nt(lhs, rhs, out)
+
+
+ @contextmanager
+ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
+     if _IN_PRE_COMPILE_STAGE:
+         yield
+         return
+
+     from deep_gemm.jit.runtime import RuntimeCache
+
+     origin_func = RuntimeCache.__getitem__
+
+     def __patched_func(self, *args, **kwargs):
+         ret = origin_func(self, *args, **kwargs)
+         if ret is None:
+             kernel_helper = _KERNEL_HELPER_DICT[kernel_type]
+             _compile_warning_2()
+             logger.warning(
+                 f"DeepGEMM JIT Compiling for <{kernel_helper.name}> M={M}, N={N}, K={K}. Please wait."
+             )
+         return ret
+
+     RuntimeCache.__getitem__ = __patched_func
+     yield
+     RuntimeCache.__getitem__ = origin_func
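For orientation, here is a shape-only sketch of what the new `gemm_nt_f8f8bf16` wrapper above expects. The scale granularity (1x128 groups for activations, 128x128 blocks for weights) is DeepGEMM's convention and is assumed here rather than defined by this diff; the hypothetical sizes are illustrative. Actually invoking the kernel additionally requires a CUDA SM90 GPU, the `deep_gemm` package, and `SGL_ENABLE_JIT_DEEPGEMM` enabled, so the call itself is left commented.

    # Shape sketch only; sizes and scale layouts are assumptions, not taken from the diff.
    import torch

    m, n, k = 128, 4096, 7168
    lhs = (
        torch.empty(m, k, dtype=torch.float8_e4m3fn),          # FP8 activations
        torch.empty(m, k // 128, dtype=torch.float32),         # per-group activation scales (assumed layout)
    )
    rhs = (
        torch.empty(n, k, dtype=torch.float8_e4m3fn),          # FP8 weights
        torch.empty(n // 128, k // 128, dtype=torch.float32),  # per-block weight scales (assumed layout)
    )
    out = torch.empty(m, n, dtype=torch.bfloat16)

    # from sglang.srt.layers.quantization.deep_gemm import gemm_nt_f8f8bf16
    # gemm_nt_f8f8bf16(lhs, rhs, out)  # first call for a given (N, K) may pre-compile configs for all Ms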
sglang/srt/layers/quantization/fp8_kernel.py
@@ -16,19 +16,17 @@ import functools
  import json
  import logging
  import os
- from contextlib import contextmanager
  from typing import Any, Dict, List, Optional, Tuple

  import torch
  import triton
  import triton.language as tl

+ from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
  from sglang.srt.utils import (
      direct_register_custom_op,
-     get_bool_env_var,
      get_device_core_count,
      get_device_name,
-     get_device_sm,
      is_cuda,
      is_hip,
      supports_custom_op,
@@ -43,22 +41,16 @@ else:
  fp8_max = torch.finfo(_fp8_type).max
  fp8_min = -fp8_max

- _enable_jit_deepgemm = False
- _enable_jit_deepgemm_bmm = False
  if _is_cuda:
-     import deep_gemm
      from sgl_kernel import (
          sgl_per_tensor_quant_fp8,
          sgl_per_token_group_quant_fp8,
          sgl_per_token_quant_fp8,
      )

-     sm_version = get_device_sm()
-     if sm_version == 90:
-         if get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="false"):
-             _enable_jit_deepgemm = True
-         if get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM_BMM", default="false"):
-             _enable_jit_deepgemm_bmm = True
+     from sglang.srt.layers.quantization.deep_gemm import (
+         gemm_nt_f8f8bf16 as deep_gemm_gemm_nt_f8f8bf16,
+     )

  logger = logging.getLogger(__name__)

@@ -71,10 +63,7 @@ if supports_custom_op():
          Bs: torch.Tensor,
          C: torch.Tensor,
      ) -> None:
-         M, K = A.shape
-         N, _ = B.shape
-         with _log_jit_build(M, N, K):
-             deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)
+         deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)

      def deep_gemm_fp8_fp8_bf16_nt_fake(
          A: torch.Tensor,
@@ -715,25 +704,6 @@ def get_w8a8_block_fp8_configs(
      return None


- @contextmanager
- def _log_jit_build(M: int, N: int, K: int):
-     from deep_gemm.jit.runtime import RuntimeCache
-
-     origin_func = RuntimeCache.__getitem__
-
-     def __patched_func(self, *args, **kwargs):
-         ret = origin_func(self, *args, **kwargs)
-         if ret is None:
-             logger.warning(
-                 f"DeepGEMM JIT code generation <gemm_fp8_fp8_bf16_nt>: M={M}, N={N}, K={K}. Please wait."
-             )
-         return ret
-
-     RuntimeCache.__getitem__ = __patched_func
-     yield
-     RuntimeCache.__getitem__ = origin_func
-
-
  def w8a8_block_fp8_matmul(
      A: torch.Tensor,
      B: torch.Tensor,
@@ -804,12 +774,11 @@ def w8a8_block_fp8_matmul(
      )

      # deepgemm only support bf16
-     if C.dtype == torch.bfloat16 and _enable_jit_deepgemm:
+     if C.dtype == torch.bfloat16 and _ENABLE_JIT_DEEPGEMM:
          if supports_custom_op():
              torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
          else:
-             with _log_jit_build(M, N, K):
-                 deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)
+             deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)
      else:
          kernel = (
              _w8a8_block_fp8_matmul_unrolledx4
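With the hunks above, the DeepGEMM fast path in `fp8_kernel.py` is now gated by the single `_ENABLE_JIT_DEEPGEMM` flag imported from the new module instead of locally computed flags (the old `_enable_jit_deepgemm_bmm` flag is removed in these hunks). A minimal, hedged sketch of checking that gate from user code; the environment variable name comes from the diff, while the accepted value ("true") is an assumption about `get_bool_env_var`:

    import os

    # Must be set before sglang modules are imported; it only takes effect on SM90 GPUs
    # with deep_gemm installed.
    os.environ.setdefault("SGL_ENABLE_JIT_DEEPGEMM", "true")

    from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM

    print("DeepGEMM JIT path enabled:", _ENABLE_JIT_DEEPGEMM)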
sglang/srt/layers/quantization/fp8_utils.py
@@ -12,8 +12,8 @@ try:
  except ImportError:
      VLLM_AVAILABLE = False

+ from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
  from sglang.srt.layers.quantization.fp8_kernel import (
-     _enable_jit_deepgemm,
      per_token_group_quant_fp8,
      scaled_fp8_quant,
      sglang_per_token_quant_fp8,
@@ -143,7 +143,7 @@ def apply_w8a8_block_fp8_linear(
          )
          gemm_a8w8_blockscale(q_input, weight, x_scale, weight_scale, output)
      else:
-         if _enable_jit_deepgemm:
+         if _ENABLE_JIT_DEEPGEMM:
              q_input, x_scale = sglang_per_token_group_quant_fp8(
                  input_2d,
                  block_size[1],

sglang/srt/layers/quantization/gptq.py
@@ -37,6 +37,14 @@ except ImportError:
  logger = logging.getLogger(__name__)


+ def check_marlin_format(hf_quant_cfg: Dict[str, Any]) -> bool:
+     # compat: gptqmodel and autogptq (eol) main use checkpoint_format: str
+     # compat: autogptq <=0.7.1 is_marlin_format: bool
+     return hf_quant_cfg.get("checkpoint_format") == "marlin" or hf_quant_cfg.get(
+         "is_marlin_format", False
+     )
+
+
  class GPTQConfig(QuantizationConfig):
      """Config class for GPTQ.

@@ -262,13 +270,15 @@ class GPTQMarlinConfig(QuantizationConfig):

      @classmethod
      def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+         is_marlin_format = check_marlin_format(hf_quant_cfg)
+
          can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg)

          is_valid_user_quant = (
              user_quant is None or user_quant == "marlin" or user_quant == "gptq_marlin"
          )

-         if can_convert and is_valid_user_quant:
+         if not is_marlin_format and can_convert and is_valid_user_quant:
              msg = (
                  "The model is convertible to {} during runtime."
                  " Using {} kernel.".format(cls.get_name(), cls.get_name())
@@ -276,7 +286,7 @@ class GPTQMarlinConfig(QuantizationConfig):
              logger.info(msg)
              return cls.get_name()

-         if can_convert and user_quant == "gptq":
+         if not is_marlin_format and can_convert and user_quant == "gptq":
              logger.info(
                  "Detected that the model can run with gptq_marlin"
                  ", however you specified quantization=gptq explicitly,"
@@ -401,11 +411,7 @@ class MarlinConfig(QuantizationConfig):

      @classmethod
      def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
-         # compat: autogptq >=0.8.0 use checkpoint_format: str
-         # compat: autogptq <=0.7.1 is_marlin_format: bool
-         is_marlin_format = hf_quant_cfg.get(
-             "checkpoint_format"
-         ) == "marlin" or hf_quant_cfg.get("is_marlin_format", False)
+         is_marlin_format = check_marlin_format(hf_quant_cfg)

          is_valid_user_quant = (
              user_quant is None or user_quant == "gptq" or user_quant == "marlin"
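The `check_marlin_format` helper introduced above is small enough to illustrate directly. The sample config dicts below are hypothetical, but the keys (`checkpoint_format`, `is_marlin_format`) are exactly the ones the helper inspects; its logic is reproduced so the snippet runs on its own:

    # Reproduction of the helper from the gptq.py hunk above, plus hypothetical configs.
    def check_marlin_format(hf_quant_cfg):
        return hf_quant_cfg.get("checkpoint_format") == "marlin" or hf_quant_cfg.get(
            "is_marlin_format", False
        )

    newer_cfg = {"checkpoint_format": "marlin", "bits": 4}  # gptqmodel / newer autogptq style
    older_cfg = {"is_marlin_format": True, "bits": 4}       # autogptq <= 0.7.1 style
    plain_cfg = {"bits": 4, "group_size": 128}              # regular GPTQ checkpoint

    print(check_marlin_format(newer_cfg))  # True
    print(check_marlin_format(older_cfg))  # True
    print(check_marlin_format(plain_cfg))  # False

With the added `not is_marlin_format and ...` guards, checkpoints already serialized in Marlin format no longer go through the runtime GPTQ-to-Marlin conversion branches.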
sglang/srt/layers/quantization/modelopt_quant.py
@@ -22,9 +22,9 @@ from sglang.srt.layers.quantization.utils import (
      requantize_with_max_scale,
  )
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.utils import is_cuda_available
+ from sglang.srt.utils import is_cuda

- if is_cuda_available():
+ if is_cuda():
      from sgl_kernel import cutlass_scaled_fp4_mm, scaled_fp4_quant

  # Initialize logger for the module

sglang/srt/layers/quantization/w8a8_int8.py
@@ -11,10 +11,10 @@ from sglang.srt.layers.quantization.base_config import (
      QuantizeMethodBase,
  )
  from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
- from sglang.srt.utils import is_cuda_available, set_weight_attrs
+ from sglang.srt.utils import is_cuda, set_weight_attrs

- is_cuda = is_cuda_available()
- if is_cuda:
+ _is_cuda = is_cuda()
+ if _is_cuda:
      from sgl_kernel import int8_scaled_mm


sglang/srt/layers/rotary_embedding.py
@@ -8,11 +8,11 @@ import torch
  import torch.nn as nn

  from sglang.srt.custom_op import CustomOp
- from sglang.srt.utils import is_cuda_available
+ from sglang.srt.utils import is_cuda

- _is_cuda_available = is_cuda_available()
+ _is_cuda = is_cuda()

- if _is_cuda_available:
+ if _is_cuda:
      from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
  else:
      from vllm._custom_ops import rotary_embedding as vllm_rotary_embedding
@@ -82,7 +82,7 @@ class RotaryEmbedding(CustomOp):

          cache = self._compute_cos_sin_cache()
          # NOTE(ByronHsu): cache needs to be in FP32 for numerical stability
-         if not _is_cuda_available:
+         if not _is_cuda:
              cache = cache.to(dtype)
          self.cos_sin_cache: torch.Tensor
          self.register_buffer("cos_sin_cache", cache, persistent=False)
@@ -149,7 +149,7 @@ class RotaryEmbedding(CustomOp):
          key: torch.Tensor,
          offsets: Optional[torch.Tensor] = None,
      ) -> Tuple[torch.Tensor, torch.Tensor]:
-         if _is_cuda_available and (self.head_size in [64, 128, 256, 512]):
+         if _is_cuda and (self.head_size in [64, 128, 256, 512]):
              apply_rope_with_cos_sin_cache_inplace(
                  positions=positions,
                  query=query,
@@ -652,7 +652,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
      def forward(self, *args, **kwargs):
          if torch.compiler.is_compiling():
              return self.forward_native(*args, **kwargs)
-         if _is_cuda_available:
+         if _is_cuda:
              return self.forward_cuda(*args, **kwargs)
          else:
              return self.forward_native(*args, **kwargs)

sglang/srt/layers/sampler.py
@@ -10,9 +10,9 @@ from sglang.srt.layers.dp_attention import get_attention_tp_group
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
  from sglang.srt.managers.schedule_batch import global_server_args_dict
  from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
- from sglang.srt.utils import crash_on_warnings, get_bool_env_var, is_cuda_available
+ from sglang.srt.utils import crash_on_warnings, get_bool_env_var, is_cuda

- if is_cuda_available():
+ if is_cuda():
      from sgl_kernel import (
          min_p_sampling_from_probs,
          top_k_renorm_prob,
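The `modelopt_quant.py`, `w8a8_int8.py`, `rotary_embedding.py`, and `sampler.py` hunks above are the same mechanical rename: `is_cuda_available()` from `sglang.srt.utils` becomes `is_cuda()`, with the result cached in a private `_is_cuda` module flag. A minimal sketch of the shared guarded-import pattern (the `sgl_kernel` import is taken from the w8a8_int8.py hunk as an example):

    from sglang.srt.utils import is_cuda

    _is_cuda = is_cuda()

    if _is_cuda:
        from sgl_kernel import int8_scaled_mm  # CUDA-only kernel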
sglang/srt/managers/data_parallel_controller.py
@@ -30,6 +30,7 @@ from sglang.srt.managers.io_struct import (
  )
  from sglang.srt.managers.scheduler import run_scheduler_process
  from sglang.srt.server_args import PortArgs, ServerArgs
+ from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
  from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket
  from sglang.utils import get_exception_traceback

@@ -174,6 +175,10 @@ class DataParallelController:
          if not server_args.enable_dp_attention:
              logger.info(f"Launch DP{dp_rank} starting at GPU #{base_gpu_id}.")

+         memory_saver_adapter = TorchMemorySaverAdapter.create(
+             enable=server_args.enable_memory_saver
+         )
+
          # Launch tensor parallel scheduler processes
          scheduler_pipe_readers = []
          tp_size_per_node = server_args.tp_size // server_args.nnodes
@@ -208,7 +213,8 @@ class DataParallelController:
                  target=run_scheduler_process,
                  args=(server_args, rank_port_args, gpu_id, tp_rank, dp_rank, writer),
              )
-             proc.start()
+             with memory_saver_adapter.configure_subprocess():
+                 proc.start()
              self.scheduler_procs.append(proc)
              scheduler_pipe_readers.append(reader)
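Finally, the data parallel controller now starts each scheduler subprocess inside the memory saver adapter's `configure_subprocess()` context. A minimal sketch of that launch pattern in isolation, assuming the adapter is effectively a no-op when `enable=False` (the `worker` function is a hypothetical stand-in for `run_scheduler_process`):

    import multiprocessing as mp

    from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter

    def worker():
        print("scheduler subprocess running")

    if __name__ == "__main__":
        # Mirrors enable=server_args.enable_memory_saver in the diff; False keeps it inert.
        adapter = TorchMemorySaverAdapter.create(enable=False)
        proc = mp.Process(target=worker)
        with adapter.configure_subprocess():
            proc.start()
        proc.join()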