sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +17 -2
- sglang/bench_serving.py +170 -24
- sglang/srt/configs/internvl.py +4 -2
- sglang/srt/configs/janus_pro.py +1 -1
- sglang/srt/configs/model_config.py +60 -1
- sglang/srt/configs/update_config.py +119 -0
- sglang/srt/conversation.py +69 -1
- sglang/srt/disaggregation/decode.py +21 -5
- sglang/srt/disaggregation/mooncake/conn.py +35 -4
- sglang/srt/disaggregation/nixl/conn.py +6 -6
- sglang/srt/disaggregation/prefill.py +2 -2
- sglang/srt/disaggregation/utils.py +1 -1
- sglang/srt/distributed/parallel_state.py +44 -17
- sglang/srt/entrypoints/EngineBase.py +8 -0
- sglang/srt/entrypoints/engine.py +40 -6
- sglang/srt/entrypoints/http_server.py +111 -24
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +4 -2
- sglang/srt/eplb/__init__.py +0 -0
- sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
- sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
- sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
- sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
- sglang/srt/{managers → eplb}/expert_location.py +1 -1
- sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
- sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/activation.py +2 -2
- sglang/srt/layers/amx_utils.py +86 -0
- sglang/srt/layers/attention/ascend_backend.py +219 -0
- sglang/srt/layers/attention/flashattention_backend.py +32 -9
- sglang/srt/layers/attention/tbo_backend.py +37 -9
- sglang/srt/layers/communicator.py +20 -2
- sglang/srt/layers/dp_attention.py +9 -3
- sglang/srt/layers/elementwise.py +76 -12
- sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
- sglang/srt/layers/layernorm.py +26 -0
- sglang/srt/layers/linear.py +84 -14
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
- sglang/srt/layers/moe/ep_moe/layer.py +176 -15
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/moe/router.py +60 -22
- sglang/srt/layers/moe/topk.py +10 -28
- sglang/srt/layers/parameter.py +67 -7
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
- sglang/srt/layers/quantization/fp8.py +72 -7
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -2
- sglang/srt/layers/quantization/gptq.py +5 -1
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -1
- sglang/srt/layers/quantization/quant_utils.py +166 -0
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/quantization/w8a8_int8.py +52 -1
- sglang/srt/layers/rotary_embedding.py +2 -2
- sglang/srt/layers/vocab_parallel_embedding.py +20 -10
- sglang/srt/lora/lora.py +4 -5
- sglang/srt/lora/lora_manager.py +73 -20
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/configure_logging.py +1 -1
- sglang/srt/managers/io_struct.py +58 -14
- sglang/srt/managers/mm_utils.py +77 -61
- sglang/srt/managers/multimodal_processor.py +2 -6
- sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
- sglang/srt/managers/schedule_batch.py +78 -85
- sglang/srt/managers/scheduler.py +130 -64
- sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
- sglang/srt/managers/session_controller.py +12 -3
- sglang/srt/managers/tokenizer_manager.py +314 -103
- sglang/srt/managers/tp_worker.py +13 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
- sglang/srt/mem_cache/allocator.py +290 -0
- sglang/srt/mem_cache/chunk_cache.py +34 -2
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +402 -66
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/multimodal_cache.py +3 -0
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/model_executor/cuda_graph_runner.py +2 -1
- sglang/srt/model_executor/forward_batch_info.py +17 -4
- sglang/srt/model_executor/model_runner.py +297 -56
- sglang/srt/model_loader/loader.py +41 -0
- sglang/srt/model_loader/weight_utils.py +72 -4
- sglang/srt/models/deepseek_nextn.py +1 -3
- sglang/srt/models/deepseek_v2.py +195 -45
- sglang/srt/models/deepseek_vl2.py +3 -5
- sglang/srt/models/gemma3_causal.py +1 -2
- sglang/srt/models/gemma3n_causal.py +4 -3
- sglang/srt/models/gemma3n_mm.py +4 -20
- sglang/srt/models/hunyuan.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -2
- sglang/srt/models/llama.py +10 -4
- sglang/srt/models/llama4.py +32 -45
- sglang/srt/models/llama_eagle3.py +61 -11
- sglang/srt/models/llava.py +5 -5
- sglang/srt/models/minicpmo.py +2 -2
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mllama4.py +402 -89
- sglang/srt/models/phi4mm.py +1 -3
- sglang/srt/models/pixtral.py +3 -7
- sglang/srt/models/qwen2.py +31 -3
- sglang/srt/models/qwen2_5_vl.py +1 -3
- sglang/srt/models/qwen2_audio.py +200 -0
- sglang/srt/models/qwen2_moe.py +32 -6
- sglang/srt/models/qwen2_vl.py +1 -4
- sglang/srt/models/qwen3.py +94 -25
- sglang/srt/models/qwen3_moe.py +68 -21
- sglang/srt/models/vila.py +3 -8
- sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
- sglang/srt/operations_strategy.py +6 -2
- sglang/srt/reasoning_parser.py +26 -0
- sglang/srt/sampling/sampling_batch_info.py +39 -1
- sglang/srt/server_args.py +84 -22
- sglang/srt/speculative/build_eagle_tree.py +57 -18
- sglang/srt/speculative/eagle_worker.py +6 -4
- sglang/srt/two_batch_overlap.py +203 -27
- sglang/srt/utils.py +343 -163
- sglang/srt/warmup.py +12 -3
- sglang/test/runners.py +10 -1
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/test/test_utils.py +15 -3
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
- sglang/math_utils.py +0 -8
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
- /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/router.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Tuple
+from typing import Optional, Tuple
 
 import torch
 import triton
@@ -16,6 +16,8 @@ def fused_moe_router_kernel(
     moe_router_weight_ptr,  # input (num_experts, hidden_dim)
     topk_weights_ptr,  # output (bs, topk)
     topk_ids_ptr,  # output (bs, topk)
+    correction_bias_ptr,
+    is_correction_bias: tl.constexpr,
     num_experts: tl.constexpr,
     topk: tl.constexpr,
     moe_softcapping: tl.constexpr,
@@ -49,6 +51,11 @@ def fused_moe_router_kernel(
     bottom = exped + 1
     logits_softcapped = top / bottom * moe_softcapping
 
+    # Add bias after softcapping
+    if is_correction_bias:
+        bias = tl.load(correction_bias_ptr + tl.arange(0, num_experts))
+        logits_softcapped = logits_softcapped + bias
+
     # topk
     # assert 1 <= topk <= num_experts
 
@@ -109,6 +116,7 @@ def fused_moe_router_impl(
     router_weight: torch.Tensor,
     topk: int,
     moe_softcapping: float,
+    correction_bias: Optional[torch.Tensor] = None,
 ):
     assert len(x.shape) == 2 and x.shape[1] == router_weight.shape[1]
     bs, hidden_dim = x.shape
@@ -117,23 +125,23 @@ def fused_moe_router_impl(
     # router_logits = torch.empty((bs, num_experts), dtype=torch.float32, device=x.device)
     topk_weights = torch.empty((bs, topk), dtype=torch.float32, device=x.device)
     topk_ids = torch.empty((bs, topk), dtype=torch.int32, device=x.device)
+    is_correction_bias = correction_bias is not None
 
-
-
-    min_num_warps = 16 if _is_hip else 32
-
+    max_warps = 16 if _is_hip else 32
     config = {
         "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
         "num_warps": max(
-            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)),
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), max_warps), 4
         ),
     }
 
-    fused_moe_router_kernel[
+    fused_moe_router_kernel[(bs,)](
         x,
         router_weight,
         topk_weights,
         topk_ids,
+        correction_bias,
+        is_correction_bias=is_correction_bias,
         num_experts=num_experts,
         topk=topk,
         moe_softcapping=moe_softcapping,
@@ -153,7 +161,7 @@ def fused_moe_router_large_bs_kernel(
     topk_ids_ptr,  # output (bs, topk)
     bs,
     num_experts: tl.constexpr,
-    topk: tl.constexpr,  # only support topk
+    topk: tl.constexpr,  # only support topk <= 2
     moe_softcapping: tl.constexpr,
     moe_renormalize: tl.constexpr,  # not supported
     K: tl.constexpr,
@@ -204,25 +212,53 @@ def fused_moe_router_large_bs_kernel(
     logits_softcapped = (exped - 1) / (exped + 1) * moe_softcapping
 
     # 5. top1
-
-
+    arange_block_size_n = tl.arange(0, BLOCK_SIZE_N)[None, :]
+    cond_top1 = arange_block_size_n < num_experts
+    top1 = tl.argmax(tl.where(cond_top1, logits_softcapped, float("-inf")), axis=1)
     top1_v = tl.max(
-        tl.where(
+        tl.where(cond_top1, logits_softcapped, float("-inf")), axis=1, keep_dims=True
     )
-
-        tl.where(
+    top1_invsumexp = 1.0 / tl.sum(
+        tl.where(cond_top1, tl.exp(logits_softcapped - top1_v), 0.0), axis=1
    )
 
-    # 6. store to output
-
-
-    tl.store(topk_ids_ptr +
+    # 6. store top1 to output
+    offs_top1 = pid * topk * BLOCK_SIZE_M + topk * tl.arange(0, BLOCK_SIZE_M)
+    top1_mask = offs_top1 < bs * topk
+    tl.store(topk_ids_ptr + offs_top1, top1, mask=top1_mask)
     tl.store(
-        topk_weights_ptr +
-
-        mask=
+        topk_weights_ptr + offs_top1,
+        top1_invsumexp,
+        mask=top1_mask,
     )
 
+    # 7. handle topk == 2
+    if topk == 2:
+        cond_top2 = (arange_block_size_n < num_experts) and (
+            arange_block_size_n != top1[:, None]
+        )
+        top2 = tl.argmax(
+            tl.where(cond_top2, logits_softcapped, float("-inf")),
+            axis=1,
+            keep_dims=True,
+        )
+        top2_v = tl.sum(
+            logits_softcapped * (arange_block_size_n == top2), axis=1, keep_dims=True
+        )
+        top2_invsumexp = tl.exp(top2_v - top1_v) * top1_invsumexp[:, None]
+
+        # store top2
+        offs_top2 = (
+            pid * topk * BLOCK_SIZE_M + topk * tl.arange(0, BLOCK_SIZE_M)[:, None] + 1
+        )
+        top2_mask = offs_top2 < bs * topk
+        tl.store(topk_ids_ptr + offs_top2, top2, mask=top2_mask)
+        tl.store(
+            topk_weights_ptr + offs_top2,
+            top2_invsumexp,
+            mask=top2_mask,
+        )
+
 
 def fused_moe_router_large_bs_impl(
     x: torch.Tensor,
@@ -239,7 +275,7 @@ def fused_moe_router_large_bs_impl(
 
     assert num_experts <= BLOCK_SIZE_N
     assert hidden_dim % BLOCK_SIZE_K == 0
-    assert topk
+    assert topk <= 2
 
     topk_weights = torch.empty((bs, topk), dtype=torch.float32, device=x.device)
     topk_ids = torch.empty((bs, topk), dtype=torch.int32, device=x.device)
@@ -273,6 +309,7 @@ def fused_moe_router_shim(
     gating_output,
     topk,
     renormalize,
+    correction_bias: Optional[torch.Tensor] = None,
 ):
     assert not renormalize
     assert (
@@ -286,7 +323,7 @@ def fused_moe_router_shim(
     BLOCK_SIZE_K = 256
     if (
         bs >= 512
-        and topk
+        and topk <= 2
         and num_experts <= BLOCK_SIZE_N
         and hidden_dim % BLOCK_SIZE_K == 0
     ):
@@ -305,6 +342,7 @@ def fused_moe_router_shim(
         router_weight=gating_output,
        topk=topk,
         moe_softcapping=moe_softcapping,
+        correction_bias=correction_bias,
     )
 
 
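For orientation, the router change above threads an optional correction_bias through fused_moe_router_impl and adds it to the tanh-softcapped logits before top-k selection. The snippet below is a rough PyTorch reference of that math as it appears in the hunks; it is an illustrative sketch, not code from the package, and moe_router_reference is a made-up name.

import torch


def moe_router_reference(x, router_weight, topk, moe_softcapping, correction_bias=None):
    # logits: (bs, num_experts); router_weight is (num_experts, hidden_dim)
    logits = x @ router_weight.t()
    # tanh softcapping, i.e. (exp(2l/c) - 1) / (exp(2l/c) + 1) * c as in the kernel
    logits_softcapped = torch.tanh(logits / moe_softcapping) * moe_softcapping
    if correction_bias is not None:
        # the new branch adds the bias after softcapping
        logits_softcapped = logits_softcapped + correction_bias
    top_vals, topk_ids = torch.topk(logits_softcapped, topk, dim=-1)
    # the stored weights are softmax probabilities of the selected experts
    topk_weights = torch.exp(
        top_vals - torch.logsumexp(logits_softcapped, dim=-1, keepdim=True)
    )
    return topk_weights, topk_ids


x = torch.randn(4, 64)
router_weight = torch.randn(8, 64)
weights, ids = moe_router_reference(
    x, router_weight, topk=2, moe_softcapping=30.0, correction_bias=torch.zeros(8)
)
print(weights.shape, ids.shape)  # torch.Size([4, 2]) torch.Size([4, 2])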
sglang/srt/layers/moe/topk.py
CHANGED
@@ -18,12 +18,12 @@ from typing import Callable, Optional
 import torch
 import torch.nn.functional as F
 
-from sglang.srt.
-from sglang.srt.
+from sglang.srt.eplb import expert_location_dispatch
+from sglang.srt.eplb.expert_distribution import (
     ExpertDistributionRecorder,
     get_global_expert_distribution_recorder,
 )
-from sglang.srt.
+from sglang.srt.eplb.expert_location_dispatch import (
     ExpertLocationDispatchInfo,
     topk_ids_logical_to_physical,
 )
@@ -35,6 +35,7 @@ from sglang.srt.utils import (
     is_cpu,
     is_cuda,
     is_hip,
+    is_npu,
 )
 
 _is_cuda = is_cuda()
@@ -42,6 +43,7 @@ _is_hip = is_hip()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
+_is_npu = is_npu()
 
 if _is_cuda:
     from sgl_kernel import moe_fused_gate
@@ -106,37 +108,14 @@ def fused_topk(
         M, topk, dtype=torch.float32, device=hidden_states.device
     )
     topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
-    token_expert_indicies = torch.empty(
-        M, topk, dtype=torch.int32, device=hidden_states.device
-    )
 
     topk_softmax(
         topk_weights,
         topk_ids,
-
-
-    )
-    del token_expert_indicies
-
-    return _fused_topk_postprocess(
-        topk_weights=topk_weights,
-        topk_ids=topk_ids,
-        renormalize=renormalize,
-        expert_location_dispatch_info=expert_location_dispatch_info,
-        num_token_non_padded=num_token_non_padded,
+        gating_output,
+        renormalize,
     )
 
-
-@torch.compile(dynamic=True, backend=get_compiler_backend())
-def _fused_topk_postprocess(
-    topk_weights,
-    topk_ids,
-    renormalize,
-    expert_location_dispatch_info,
-    num_token_non_padded,
-):
-    if renormalize:
-        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
     topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
     _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
     return topk_weights, topk_ids
@@ -159,6 +138,9 @@ def grouped_topk_gpu(
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
     scores = torch.softmax(gating_output, dim=-1)
+    # NPU compiler limitation
+    if _is_npu and scores.dtype == torch.bfloat16:
+        scores = scores.to(torch.float16)
     num_token = scores.shape[0]
     num_experts = scores.shape[1]
     group_scores = (
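The import rewrites above reflect the EPLB reorganization listed in the file table (the expert-location helpers moved from sglang/srt/managers to the new sglang/srt/eplb package). A hedged sketch of the adjusted import paths downstream code would use, assuming sglang 0.4.9 is installed; the paths are taken directly from the hunk above:

from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
from sglang.srt.eplb.expert_location_dispatch import (
    ExpertLocationDispatchInfo,
    topk_ids_logical_to_physical,
)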
sglang/srt/layers/parameter.py
CHANGED
@@ -7,6 +7,8 @@ from typing import Callable, Optional, Union
 import torch
 from torch.nn import Parameter
 
+from sglang.srt.utils import is_cpu
+
 __all__ = [
     "BasevLLMParameter",
     "PackedvLLMParameter",
@@ -21,6 +23,8 @@ __all__ = [
 
 logger = logging.getLogger(__name__)
 
+_is_cpu = is_cpu()
+
 
 class BasevLLMParameter(Parameter):
     """
@@ -93,9 +97,28 @@ class _ColumnvLLMParameter(BasevLLMParameter):
     ):
         if not use_presharded_weights:
             shard_size = self.data.shape[self.output_dim]
-
-
+
+            from sglang.srt.model_loader.weight_utils import (
+                narrow_padded_param_and_loaded_weight,
             )
+
+            if _is_cpu:
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    self.data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    tp_rank * shard_size,
+                    self.output_dim,
+                    shard_size,
+                )
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+                return
+            else:
+                loaded_weight = loaded_weight.narrow(
+                    self.output_dim, tp_rank * shard_size, shard_size
+                )
+
         assert self.data.shape == loaded_weight.shape
         self.data.copy_(loaded_weight)
 
@@ -116,10 +139,27 @@ class _ColumnvLLMParameter(BasevLLMParameter):
         param_data = self.data
 
         param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
-
-
-
+
+        from sglang.srt.model_loader.weight_utils import (
+            narrow_padded_param_and_loaded_weight,
+        )
+
+        if _is_cpu:
+            param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                param_data,
+                loaded_weight,
+                0,  # param_data_start
+                tp_rank * shard_size,
+                self.output_dim,
+                shard_size,
+                not use_presharded_weights,
            )
+        else:
+            if not use_presharded_weights:
+                loaded_weight = loaded_weight.narrow(
+                    self.output_dim, tp_rank * shard_size, shard_size
+                )
+
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
@@ -182,10 +222,30 @@ class RowvLLMParameter(BasevLLMParameter):
     ):
         if not use_presharded_weights:
             shard_size = self.data.shape[self.input_dim]
-
-
+
+            from sglang.srt.model_loader.weight_utils import (
+                narrow_padded_param_and_loaded_weight,
             )
 
+            if _is_cpu:
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    self.data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    tp_rank * shard_size,
+                    self.input_dim,
+                    shard_size,
+                )
+
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+
+                return
+            else:
+                loaded_weight = loaded_weight.narrow(
+                    self.input_dim, tp_rank * shard_size, shard_size
+                )
+
         if len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
 
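The non-CPU branches above keep sharding weights with Tensor.narrow. A tiny standalone illustration of that slicing, with made-up sizes rather than sglang code:

import torch

full_weight = torch.arange(12.0).reshape(6, 2)  # pretend output_dim == 0
tp_size, tp_rank = 3, 1
shard_size = full_weight.shape[0] // tp_size    # 2 rows per rank
# same call pattern as loaded_weight.narrow(self.output_dim, tp_rank * shard_size, shard_size)
shard = full_weight.narrow(0, tp_rank * shard_size, shard_size)
print(shard)  # rows 2 and 3 of the full tensor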
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -68,6 +68,7 @@ from sglang.srt.layers.quantization.modelopt_quant import (
 )
 from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
 from sglang.srt.layers.quantization.qoq import QoQConfig
+from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
 from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
 from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
 
@@ -82,6 +83,7 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "moe_wna16": MoeWNA16Config,
     "compressed-tensors": CompressedTensorsConfig,
     "qoq": QoQConfig,
+    "w4afp8": W4AFp8Config,
 }
 
 # VLLM-dependent quantization methods
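A quick sanity-check sketch of the new registration, assuming sglang 0.4.9 is installed; the names come straight from the hunks above:

from sglang.srt.layers.quantization import BASE_QUANTIZATION_METHODS
from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config

assert BASE_QUANTIZATION_METHODS["w4afp8"] is W4AFp8Config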
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
CHANGED
@@ -76,7 +76,7 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
             layer.input_scale = torch.nn.Parameter(
                 layer.input_scale.data, requires_grad=False
             )
-        prepare_fp8_layer_for_marlin(layer,
+        prepare_fp8_layer_for_marlin(layer, size_k_first=True)
 
     def create_weights(
         self,
sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -1,7 +1,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/quantization/fp8.py
 
 import logging
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -27,6 +27,7 @@ except ImportError:
 
 
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
 from sglang.srt.layers.linear import (
     LinearBase,
     LinearMethodBase,
@@ -73,6 +74,7 @@ from sglang.srt.utils import (
     log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
+    use_intel_amx_backend,
 )
 
 _is_hip = is_hip()
@@ -86,7 +88,7 @@ _is_fp8_fnuz = is_fp8_fnuz()
 _use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
-if _is_hip:
+if _is_hip and (_use_aiter or _use_hip_int4):
     from aiter import ActivationType, QuantType
     from aiter.fused_moe import fused_moe
     from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
@@ -198,7 +200,7 @@ class Fp8LinearMethod(LinearMethodBase):
         quant_config: The quantization config.
     """
 
-    def __init__(self, quant_config: Fp8Config):
+    def __init__(self, quant_config: Union["Fp8Config", "W4AFp8Config"]):
         self.quant_config = quant_config
         self.cutlass_fp8_supported = cutlass_fp8_supported()
 
@@ -284,7 +286,10 @@ class Fp8LinearMethod(LinearMethodBase):
         if self.quant_config.is_checkpoint_fp8_serialized:
             # WEIGHT SCALE
             if self.block_quant:
-
+                if hasattr(self.quant_config, "activation_scheme"):
+                    assert self.quant_config.activation_scheme == "dynamic"
+                elif hasattr(self.quant_config, "linear_activation_scheme"):
+                    assert self.quant_config.linear_activation_scheme == "dynamic"
                 scale = BlockQuantScaleParameter(
                     data=torch.empty(
                         (output_size_per_partition + block_n - 1) // block_n,
@@ -306,7 +311,13 @@ class Fp8LinearMethod(LinearMethodBase):
                 layer.register_parameter("weight_scale", scale)
 
             # INPUT ACTIVATION SCALE
-            if
+            if (
+                hasattr(self.quant_config, "activation_scheme")
+                and self.quant_config.activation_scheme == "static"
+            ) or (
+                hasattr(self.quant_config, "linear_activation_scheme")
+                and self.quant_config.linear_activation_scheme == "static"
+            ):
                 scale = PerTensorScaleParameter(
                     data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
                     weight_loader=weight_loader,
@@ -330,6 +341,12 @@ class Fp8LinearMethod(LinearMethodBase):
                 )
 
                 layer.input_scale = None
+            elif _is_cpu:
+                assert (
+                    _is_cpu_amx_available
+                ), "Fp8LinearMethod on CPU requires that CPU has AMX support"
+                _amx_process_weight_after_loading(layer, ["weight"])
+                return
             else:
                 weight, weight_scale = layer.weight.data, layer.weight_scale_inv.data
                 layer.weight = torch.nn.Parameter(weight, requires_grad=False)
@@ -363,7 +380,13 @@ class Fp8LinearMethod(LinearMethodBase):
             layer.weight_scale = torch.nn.Parameter(
                 layer.weight_scale.data, requires_grad=False
             )
-            if
+            if (
+                hasattr(self.quant_config, "activation_scheme")
+                and self.quant_config.activation_scheme == "static"
+            ) or (
+                hasattr(self.quant_config, "linear_activation_scheme")
+                and self.quant_config.linear_activation_scheme == "static"
+            ):
                 layer.input_scale = torch.nn.Parameter(
                     layer.input_scale.data, requires_grad=False
                 )
@@ -397,7 +420,13 @@ class Fp8LinearMethod(LinearMethodBase):
             # Update layer with new values.
             layer.weight = Parameter(weight.t(), requires_grad=False)
             layer.weight_scale = Parameter(weight_scale, requires_grad=False)
-            if
+            if (
+                hasattr(self.quant_config, "activation_scheme")
+                and self.quant_config.activation_scheme == "static"
+            ) or (
+                hasattr(self.quant_config, "linear_activation_scheme")
+                and self.quant_config.linear_activation_scheme == "static"
+            ):
                 layer.input_scale = Parameter(
                     layer.input_scale.max(), requires_grad=False
                 )
@@ -426,6 +455,17 @@ class Fp8LinearMethod(LinearMethodBase):
         )
 
         if self.block_quant:
+            if use_intel_amx_backend(layer):
+                return torch.ops.sgl_kernel.fp8_scaled_mm_cpu(
+                    x,
+                    layer.weight,
+                    layer.weight_scale_inv,
+                    self.quant_config.weight_block_size,
+                    bias,
+                    x.dtype,
+                    True,  # is_vnni
+                )
+
             return self.w8a8_block_fp8_linear(
                 input=x,
                 weight=layer.weight,
@@ -746,6 +786,13 @@ class Fp8MoEMethod:
                 layer.w2_weight.data = shuffle_weight(
                     layer.w2_weight.contiguous(), (16, 16)
                 )
+
+            if _is_cpu:
+                assert (
+                    _is_cpu_amx_available
+                ), "Fp8MoEMethod on CPU requires that CPU has AMX support"
+                _amx_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+
             return
 
         # If checkpoint is fp16 or bfloat16, quantize in place.
@@ -971,6 +1018,24 @@ class Fp8MoEMethod:
             routed_scaling_factor=routed_scaling_factor,
        )
 
+        if use_intel_amx_backend(layer):
+            return torch.ops.sgl_kernel.fused_experts_cpu(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                False,  # inplace See [Note] inplace should be False in fused_experts.
+                False,  # use_int8_w8a8
+                True,  # use_fp8_w8a16
+                layer.w13_weight_scale_inv,  # w1_scale
+                layer.w2_weight_scale_inv,  # w2_scale
+                self.quant_config.weight_block_size,  # block_size
+                None,  # a1_scale
+                None,  # a2_scale
+                True,  # is_vnni
+            )
+
         if _is_hip:
             ret = self.maybe_apply_hip_fused_experts(
                 layer,
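Because Fp8LinearMethod now accepts either Fp8Config or W4AFp8Config, the same static-activation check is repeated at three call sites above, once per attribute name (activation_scheme vs., presumably, linear_activation_scheme on the W4AFp8 config). A possible refactor, sketched here with a hypothetical helper name that is not in the diff:

def _has_static_activation(quant_config) -> bool:
    # equivalent to the hasattr/== "static" pairs used in the hunks above
    return (
        getattr(quant_config, "activation_scheme", None) == "static"
        or getattr(quant_config, "linear_activation_scheme", None) == "static"
    )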
sglang/srt/layers/quantization/fp8_kernel.py
CHANGED
@@ -23,9 +23,9 @@ import torch
 import triton
 import triton.language as tl
 
-from sglang.math_utils import align
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.utils import (
+    align,
     direct_register_custom_op,
     get_device_core_count,
     get_device_name,
sglang/srt/layers/quantization/fp8_utils.py
CHANGED
@@ -1,9 +1,7 @@
 from typing import Callable, List, Optional, Tuple
 
-import einops
 import torch
 
-from sglang.math_utils import align
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
 from sglang.srt.layers.utils import is_sm100_supported
@@ -27,6 +25,7 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     w8a8_block_fp8_matmul_triton,
 )
 from sglang.srt.utils import (
+    align,
     get_bool_env_var,
     get_cuda_version,
     get_device_capability,
sglang/srt/layers/quantization/gptq.py
CHANGED
@@ -344,6 +344,10 @@ class GPTQMarlinConfig(QuantizationConfig):
         if (num_bits, sym) not in cls.TYPE_MAP:
             return False
 
+        assert (
+            VLLM_AVAILABLE
+        ), "vllm is not installed, to use gptq_marlin, please install vllm"
+
         return check_marlin_supported(
             quant_type=cls.TYPE_MAP[(num_bits, sym)], group_size=group_size
         )
@@ -726,6 +730,6 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             g_idx2=layer.w2_g_idx,
             sort_indices1=layer.w13_g_idx_sort_indices,
             sort_indices2=layer.w2_g_idx_sort_indices,
-
+            quant_type_id=self.quant_config.quant_type.id,
             is_k_full=self.is_k_full,
         ).to(orig_dtype)