sglang 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +17 -2
- sglang/bench_serving.py +168 -22
- sglang/srt/configs/internvl.py +4 -2
- sglang/srt/configs/janus_pro.py +1 -1
- sglang/srt/configs/model_config.py +49 -0
- sglang/srt/configs/update_config.py +119 -0
- sglang/srt/conversation.py +35 -0
- sglang/srt/custom_op.py +7 -1
- sglang/srt/disaggregation/base/conn.py +2 -0
- sglang/srt/disaggregation/decode.py +22 -6
- sglang/srt/disaggregation/mooncake/conn.py +289 -48
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
- sglang/srt/disaggregation/nixl/conn.py +100 -52
- sglang/srt/disaggregation/prefill.py +5 -4
- sglang/srt/disaggregation/utils.py +13 -12
- sglang/srt/distributed/parallel_state.py +44 -17
- sglang/srt/entrypoints/EngineBase.py +8 -0
- sglang/srt/entrypoints/engine.py +45 -9
- sglang/srt/entrypoints/http_server.py +111 -24
- sglang/srt/entrypoints/openai/protocol.py +51 -6
- sglang/srt/entrypoints/openai/serving_chat.py +52 -76
- sglang/srt/entrypoints/openai/serving_completions.py +1 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/eplb/__init__.py +0 -0
- sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
- sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
- sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
- sglang/srt/{managers → eplb}/expert_distribution.py +18 -1
- sglang/srt/{managers → eplb}/expert_location.py +1 -1
- sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
- sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/activation.py +7 -0
- sglang/srt/layers/amx_utils.py +86 -0
- sglang/srt/layers/attention/ascend_backend.py +219 -0
- sglang/srt/layers/attention/flashattention_backend.py +56 -23
- sglang/srt/layers/attention/tbo_backend.py +37 -9
- sglang/srt/layers/communicator.py +18 -2
- sglang/srt/layers/dp_attention.py +9 -3
- sglang/srt/layers/elementwise.py +76 -12
- sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
- sglang/srt/layers/layernorm.py +41 -0
- sglang/srt/layers/linear.py +99 -12
- sglang/srt/layers/logits_processor.py +15 -6
- sglang/srt/layers/moe/ep_moe/kernels.py +23 -8
- sglang/srt/layers/moe/ep_moe/layer.py +115 -25
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +42 -19
- sglang/srt/layers/moe/fused_moe_native.py +7 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -4
- sglang/srt/layers/moe/fused_moe_triton/layer.py +129 -10
- sglang/srt/layers/moe/router.py +60 -22
- sglang/srt/layers/moe/topk.py +36 -28
- sglang/srt/layers/parameter.py +67 -7
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
- sglang/srt/layers/quantization/fp8.py +44 -0
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +6 -6
- sglang/srt/layers/quantization/gptq.py +5 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -1
- sglang/srt/layers/quantization/quant_utils.py +166 -0
- sglang/srt/layers/quantization/w8a8_int8.py +52 -1
- sglang/srt/layers/rotary_embedding.py +105 -13
- sglang/srt/layers/vocab_parallel_embedding.py +19 -2
- sglang/srt/lora/lora.py +4 -5
- sglang/srt/lora/lora_manager.py +73 -20
- sglang/srt/managers/configure_logging.py +1 -1
- sglang/srt/managers/io_struct.py +60 -15
- sglang/srt/managers/mm_utils.py +73 -59
- sglang/srt/managers/multimodal_processor.py +2 -6
- sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
- sglang/srt/managers/schedule_batch.py +80 -79
- sglang/srt/managers/scheduler.py +153 -63
- sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
- sglang/srt/managers/session_controller.py +12 -3
- sglang/srt/managers/tokenizer_manager.py +314 -103
- sglang/srt/managers/tp_worker.py +13 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
- sglang/srt/mem_cache/allocator.py +290 -0
- sglang/srt/mem_cache/chunk_cache.py +34 -2
- sglang/srt/mem_cache/memory_pool.py +289 -3
- sglang/srt/mem_cache/multimodal_cache.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +3 -2
- sglang/srt/model_executor/forward_batch_info.py +17 -4
- sglang/srt/model_executor/model_runner.py +302 -58
- sglang/srt/model_loader/loader.py +86 -10
- sglang/srt/model_loader/weight_utils.py +160 -3
- sglang/srt/models/deepseek_nextn.py +5 -4
- sglang/srt/models/deepseek_v2.py +305 -26
- sglang/srt/models/deepseek_vl2.py +3 -5
- sglang/srt/models/gemma3_causal.py +1 -2
- sglang/srt/models/gemma3n_audio.py +949 -0
- sglang/srt/models/gemma3n_causal.py +1010 -0
- sglang/srt/models/gemma3n_mm.py +495 -0
- sglang/srt/models/hunyuan.py +771 -0
- sglang/srt/models/kimi_vl.py +1 -2
- sglang/srt/models/llama.py +10 -4
- sglang/srt/models/llama4.py +32 -45
- sglang/srt/models/llama_eagle3.py +61 -11
- sglang/srt/models/llava.py +5 -5
- sglang/srt/models/minicpmo.py +2 -2
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mllama4.py +43 -11
- sglang/srt/models/phi4mm.py +1 -3
- sglang/srt/models/pixtral.py +3 -7
- sglang/srt/models/qwen2.py +31 -3
- sglang/srt/models/qwen2_5_vl.py +1 -3
- sglang/srt/models/qwen2_audio.py +200 -0
- sglang/srt/models/qwen2_moe.py +32 -6
- sglang/srt/models/qwen2_vl.py +1 -4
- sglang/srt/models/qwen3.py +94 -25
- sglang/srt/models/qwen3_moe.py +68 -21
- sglang/srt/models/vila.py +3 -8
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +150 -133
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
- sglang/srt/multimodal/processors/gemma3n.py +82 -0
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +3 -6
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
- sglang/srt/operations_strategy.py +6 -2
- sglang/srt/reasoning_parser.py +26 -0
- sglang/srt/sampling/sampling_batch_info.py +39 -1
- sglang/srt/server_args.py +85 -24
- sglang/srt/speculative/build_eagle_tree.py +57 -18
- sglang/srt/speculative/eagle_worker.py +6 -4
- sglang/srt/two_batch_overlap.py +204 -28
- sglang/srt/utils.py +369 -138
- sglang/srt/warmup.py +12 -3
- sglang/test/runners.py +10 -1
- sglang/test/test_utils.py +15 -3
- sglang/version.py +1 -1
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/METADATA +9 -6
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/RECORD +149 -137
- sglang/math_utils.py +0 -8
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
- /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
- /sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/router.py
CHANGED

@@ -1,4 +1,4 @@
-from typing import Tuple
+from typing import Optional, Tuple
 
 import torch
 import triton
@@ -16,6 +16,8 @@ def fused_moe_router_kernel(
     moe_router_weight_ptr,  # input (num_experts, hidden_dim)
     topk_weights_ptr,  # output (bs, topk)
     topk_ids_ptr,  # output (bs, topk)
+    correction_bias_ptr,
+    is_correction_bias: tl.constexpr,
     num_experts: tl.constexpr,
     topk: tl.constexpr,
     moe_softcapping: tl.constexpr,
@@ -49,6 +51,11 @@ def fused_moe_router_kernel(
     bottom = exped + 1
     logits_softcapped = top / bottom * moe_softcapping
 
+    # Add bias after softcapping
+    if is_correction_bias:
+        bias = tl.load(correction_bias_ptr + tl.arange(0, num_experts))
+        logits_softcapped = logits_softcapped + bias
+
     # topk
     # assert 1 <= topk <= num_experts
 
@@ -109,6 +116,7 @@ def fused_moe_router_impl(
     router_weight: torch.Tensor,
     topk: int,
     moe_softcapping: float,
+    correction_bias: Optional[torch.Tensor] = None,
 ):
     assert len(x.shape) == 2 and x.shape[1] == router_weight.shape[1]
     bs, hidden_dim = x.shape
@@ -117,23 +125,23 @@ def fused_moe_router_impl(
     # router_logits = torch.empty((bs, num_experts), dtype=torch.float32, device=x.device)
     topk_weights = torch.empty((bs, topk), dtype=torch.float32, device=x.device)
     topk_ids = torch.empty((bs, topk), dtype=torch.int32, device=x.device)
+    is_correction_bias = correction_bias is not None
 
-    min_num_warps = 16 if _is_hip else 32
+    max_warps = 16 if _is_hip else 32
     config = {
         "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
         "num_warps": max(
-            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)),
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), max_warps), 4
         ),
     }
 
-    fused_moe_router_kernel[
+    fused_moe_router_kernel[(bs,)](
         x,
         router_weight,
         topk_weights,
         topk_ids,
+        correction_bias,
+        is_correction_bias=is_correction_bias,
         num_experts=num_experts,
         topk=topk,
         moe_softcapping=moe_softcapping,
@@ -153,7 +161,7 @@ def fused_moe_router_large_bs_kernel(
     topk_ids_ptr,  # output (bs, topk)
     bs,
     num_experts: tl.constexpr,
-    topk: tl.constexpr,  # only support topk
+    topk: tl.constexpr,  # only support topk <= 2
     moe_softcapping: tl.constexpr,
     moe_renormalize: tl.constexpr,  # not supported
     K: tl.constexpr,
@@ -204,25 +212,53 @@ def fused_moe_router_large_bs_kernel(
     logits_softcapped = (exped - 1) / (exped + 1) * moe_softcapping
 
     # 5. top1
+    arange_block_size_n = tl.arange(0, BLOCK_SIZE_N)[None, :]
+    cond_top1 = arange_block_size_n < num_experts
+    top1 = tl.argmax(tl.where(cond_top1, logits_softcapped, float("-inf")), axis=1)
     top1_v = tl.max(
-        tl.where(
+        tl.where(cond_top1, logits_softcapped, float("-inf")), axis=1, keep_dims=True
     )
-    tl.where(
+    top1_invsumexp = 1.0 / tl.sum(
+        tl.where(cond_top1, tl.exp(logits_softcapped - top1_v), 0.0), axis=1
     )
 
-    # 6. store to output
-    tl.store(topk_ids_ptr +
+    # 6. store top1 to output
+    offs_top1 = pid * topk * BLOCK_SIZE_M + topk * tl.arange(0, BLOCK_SIZE_M)
+    top1_mask = offs_top1 < bs * topk
+    tl.store(topk_ids_ptr + offs_top1, top1, mask=top1_mask)
     tl.store(
-        topk_weights_ptr +
-        mask=
+        topk_weights_ptr + offs_top1,
+        top1_invsumexp,
+        mask=top1_mask,
     )
 
+    # 7. handle topk == 2
+    if topk == 2:
+        cond_top2 = (arange_block_size_n < num_experts) and (
+            arange_block_size_n != top1[:, None]
+        )
+        top2 = tl.argmax(
+            tl.where(cond_top2, logits_softcapped, float("-inf")),
+            axis=1,
+            keep_dims=True,
+        )
+        top2_v = tl.sum(
+            logits_softcapped * (arange_block_size_n == top2), axis=1, keep_dims=True
+        )
+        top2_invsumexp = tl.exp(top2_v - top1_v) * top1_invsumexp[:, None]
+
+        # store top2
+        offs_top2 = (
+            pid * topk * BLOCK_SIZE_M + topk * tl.arange(0, BLOCK_SIZE_M)[:, None] + 1
+        )
+        top2_mask = offs_top2 < bs * topk
+        tl.store(topk_ids_ptr + offs_top2, top2, mask=top2_mask)
+        tl.store(
+            topk_weights_ptr + offs_top2,
+            top2_invsumexp,
+            mask=top2_mask,
+        )
+
 
 def fused_moe_router_large_bs_impl(
     x: torch.Tensor,
@@ -239,7 +275,7 @@ def fused_moe_router_large_bs_impl(
 
     assert num_experts <= BLOCK_SIZE_N
     assert hidden_dim % BLOCK_SIZE_K == 0
-    assert topk
+    assert topk <= 2
 
     topk_weights = torch.empty((bs, topk), dtype=torch.float32, device=x.device)
     topk_ids = torch.empty((bs, topk), dtype=torch.int32, device=x.device)
@@ -273,6 +309,7 @@ def fused_moe_router_shim(
     gating_output,
     topk,
     renormalize,
+    correction_bias: Optional[torch.Tensor] = None,
 ):
     assert not renormalize
     assert (
@@ -286,7 +323,7 @@ def fused_moe_router_shim(
     BLOCK_SIZE_K = 256
     if (
         bs >= 512
-        and topk
+        and topk <= 2
        and num_experts <= BLOCK_SIZE_N
        and hidden_dim % BLOCK_SIZE_K == 0
    ):
@@ -305,6 +342,7 @@ def fused_moe_router_shim(
         router_weight=gating_output,
         topk=topk,
         moe_softcapping=moe_softcapping,
+        correction_bias=correction_bias,
     )
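For orientation: the small-batch router kernel softcaps the routing logits with a tanh-style cap, optionally adds the new correction bias after softcapping, and then selects the top-k experts with softmax weights. A rough PyTorch reference of that math is sketched below; it is a readability aid only, the function name is made up, and the Triton kernels above remain the source of truth.

import torch

def fused_moe_router_reference(x, router_weight, topk, moe_softcapping, correction_bias=None):
    # (bs, hidden_dim) @ (num_experts, hidden_dim).T -> (bs, num_experts)
    logits = x.float() @ router_weight.float().t()
    # softcapping: (exp(2z/c) - 1) / (exp(2z/c) + 1) * c == tanh(z / c) * c
    logits = torch.tanh(logits / moe_softcapping) * moe_softcapping
    if correction_bias is not None:
        # the kernel adds the bias after softcapping
        logits = logits + correction_bias
    probs = torch.softmax(logits, dim=-1)
    topk_weights, topk_ids = torch.topk(probs, topk, dim=-1)
    return topk_weights, topk_ids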
sglang/srt/layers/moe/topk.py
CHANGED

@@ -18,34 +18,43 @@ from typing import Callable, Optional
 import torch
 import torch.nn.functional as F
 
-from sglang.srt.
-from sglang.srt.
+from sglang.srt.eplb import expert_location_dispatch
+from sglang.srt.eplb.expert_distribution import (
     ExpertDistributionRecorder,
     get_global_expert_distribution_recorder,
 )
-from sglang.srt.
+from sglang.srt.eplb.expert_location_dispatch import (
     ExpertLocationDispatchInfo,
     topk_ids_logical_to_physical,
 )
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.utils import (
     cpu_has_amx_support,
+    get_bool_env_var,
     get_compiler_backend,
     is_cpu,
     is_cuda,
     is_hip,
+    is_npu,
 )
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
+_is_npu = is_npu()
 
 if _is_cuda:
     from sgl_kernel import moe_fused_gate
 
 if _is_cuda or _is_hip:
     from sgl_kernel import topk_softmax
+if _use_aiter:
+    try:
+        from aiter import biased_grouped_topk as aiter_biased_grouped_topk
+    except ImportError:
+        raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
 
 
 def fused_topk_torch_native(
@@ -99,37 +108,14 @@ def fused_topk(
         M, topk, dtype=torch.float32, device=hidden_states.device
     )
     topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
-    token_expert_indicies = torch.empty(
-        M, topk, dtype=torch.int32, device=hidden_states.device
-    )
 
     topk_softmax(
         topk_weights,
         topk_ids,
-    )
-    del token_expert_indicies
-
-    return _fused_topk_postprocess(
-        topk_weights=topk_weights,
-        topk_ids=topk_ids,
-        renormalize=renormalize,
-        expert_location_dispatch_info=expert_location_dispatch_info,
-        num_token_non_padded=num_token_non_padded,
+        gating_output,
+        renormalize,
     )
 
-
-@torch.compile(dynamic=True, backend=get_compiler_backend())
-def _fused_topk_postprocess(
-    topk_weights,
-    topk_ids,
-    renormalize,
-    expert_location_dispatch_info,
-    num_token_non_padded,
-):
-    if renormalize:
-        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
     topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
     _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
     return topk_weights, topk_ids
@@ -152,6 +138,9 @@ def grouped_topk_gpu(
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
     scores = torch.softmax(gating_output, dim=-1)
+    # NPU compiler limitation
+    if _is_npu and scores.dtype == torch.bfloat16:
+        scores = scores.to(torch.float16)
     num_token = scores.shape[0]
     num_experts = scores.shape[1]
     group_scores = (
@@ -347,6 +336,25 @@ def biased_grouped_topk_gpu(
             topk_ids, expert_location_dispatch_info, num_token_non_padded
         )
         return topk_weights, topk_ids
+    elif _use_aiter:
+        token = gating_output.shape[0]
+        device = gating_output.device
+        assert (
+            hidden_states.shape[0] == gating_output.shape[0]
+        ), f"Number of tokens mismatch: hidden_states.shape[0] = {hidden_states.shape[0]}, gating_output.shape[0] = {gating_output.shape[0]}"
+        topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device)
+        topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
+        aiter_biased_grouped_topk(
+            gating_output,
+            correction_bias,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            renormalize,
+            routed_scaling_factor,
+        )
+        return topk_weights, topk_ids
     else:
         biased_grouped_topk_fn = (
             torch.compile(
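A usage note on the new aiter path: it is only considered on ROCm (HIP) builds and only when SGLANG_USE_AITER is set; otherwise the existing torch.compile implementation of biased grouped top-k is used. A minimal sketch of that gate (the helper name is hypothetical; it only mirrors the module-level check added above and runs without aiter installed):

import os
import torch

def _would_use_aiter_biased_grouped_topk() -> bool:
    # ROCm build + SGLANG_USE_AITER enabled, mirroring `_use_aiter` above.
    is_hip = torch.version.hip is not None
    return is_hip and os.getenv("SGLANG_USE_AITER", "false").lower() in ("1", "true")

# When the gate is taken, the caller pre-allocates the outputs that
# aiter_biased_grouped_topk fills in place:
#   topk_weights: (num_tokens, topk) float32
#   topk_ids:     (num_tokens, topk) int32
print(_would_use_aiter_biased_grouped_topk())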
sglang/srt/layers/parameter.py
CHANGED

@@ -7,6 +7,8 @@ from typing import Callable, Optional, Union
 import torch
 from torch.nn import Parameter
 
+from sglang.srt.utils import is_cpu
+
 __all__ = [
     "BasevLLMParameter",
     "PackedvLLMParameter",
@@ -21,6 +23,8 @@ __all__ = [
 
 logger = logging.getLogger(__name__)
 
+_is_cpu = is_cpu()
+
 
 class BasevLLMParameter(Parameter):
     """
@@ -93,9 +97,28 @@ class _ColumnvLLMParameter(BasevLLMParameter):
     ):
         if not use_presharded_weights:
             shard_size = self.data.shape[self.output_dim]
+
+            from sglang.srt.model_loader.weight_utils import (
+                narrow_padded_param_and_loaded_weight,
             )
+
+            if _is_cpu:
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    self.data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    tp_rank * shard_size,
+                    self.output_dim,
+                    shard_size,
+                )
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+                return
+            else:
+                loaded_weight = loaded_weight.narrow(
+                    self.output_dim, tp_rank * shard_size, shard_size
+                )
+
         assert self.data.shape == loaded_weight.shape
         self.data.copy_(loaded_weight)
 
@@ -116,10 +139,27 @@ class _ColumnvLLMParameter(BasevLLMParameter):
         param_data = self.data
 
         param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
+
+        from sglang.srt.model_loader.weight_utils import (
+            narrow_padded_param_and_loaded_weight,
+        )
+
+        if _is_cpu:
+            param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                param_data,
+                loaded_weight,
+                0,  # param_data_start
+                tp_rank * shard_size,
+                self.output_dim,
+                shard_size,
+                not use_presharded_weights,
             )
+        else:
+            if not use_presharded_weights:
+                loaded_weight = loaded_weight.narrow(
+                    self.output_dim, tp_rank * shard_size, shard_size
+                )
+
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
@@ -182,10 +222,30 @@ class RowvLLMParameter(BasevLLMParameter):
     ):
         if not use_presharded_weights:
             shard_size = self.data.shape[self.input_dim]
+
+            from sglang.srt.model_loader.weight_utils import (
+                narrow_padded_param_and_loaded_weight,
             )
 
+            if _is_cpu:
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    self.data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    tp_rank * shard_size,
+                    self.input_dim,
+                    shard_size,
+                )
+
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+
+                return
+            else:
+                loaded_weight = loaded_weight.narrow(
+                    self.input_dim, tp_rank * shard_size, shard_size
+                )
+
         if len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
 
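The recurring pattern in these weight-loading hooks is tensor-parallel shard selection: each rank narrows the checkpoint tensor to its own slice, and the new _is_cpu branch routes through narrow_padded_param_and_loaded_weight so CPU-side weight padding is handled as well. A small self-contained illustration of the plain (non-padded) narrow step, with made-up shapes:

import torch

# Hypothetical column-parallel weight of shape (output_size, input_size).
full_weight = torch.randn(1024, 512)
tp_rank, tp_size, output_dim = 1, 4, 0

shard_size = full_weight.shape[output_dim] // tp_size
local_shard = full_weight.narrow(output_dim, tp_rank * shard_size, shard_size)
assert local_shard.shape == (256, 512)  # rank 1 owns rows 256..511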
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
CHANGED

@@ -76,7 +76,7 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
         layer.input_scale = torch.nn.Parameter(
             layer.input_scale.data, requires_grad=False
         )
-        prepare_fp8_layer_for_marlin(layer,
+        prepare_fp8_layer_for_marlin(layer, size_k_first=True)
 
     def create_weights(
         self,
sglang/srt/layers/quantization/fp8.py
CHANGED

@@ -27,6 +27,7 @@ except ImportError:
 
 
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
 from sglang.srt.layers.linear import (
     LinearBase,
     LinearMethodBase,
@@ -73,6 +74,7 @@ from sglang.srt.utils import (
     log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
+    use_intel_amx_backend,
 )
 
 _is_hip = is_hip()
@@ -330,6 +332,12 @@ class Fp8LinearMethod(LinearMethodBase):
             )
 
             layer.input_scale = None
+        elif _is_cpu:
+            assert (
+                _is_cpu_amx_available
+            ), "Fp8LinearMethod on CPU requires that CPU has AMX support"
+            _amx_process_weight_after_loading(layer, ["weight"])
+            return
         else:
             weight, weight_scale = layer.weight.data, layer.weight_scale_inv.data
             layer.weight = torch.nn.Parameter(weight, requires_grad=False)
@@ -426,6 +434,17 @@ class Fp8LinearMethod(LinearMethodBase):
         )
 
         if self.block_quant:
+            if use_intel_amx_backend(layer):
+                return torch.ops.sgl_kernel.fp8_scaled_mm_cpu(
+                    x,
+                    layer.weight,
+                    layer.weight_scale_inv,
+                    self.quant_config.weight_block_size,
+                    bias,
+                    x.dtype,
+                    True,  # is_vnni
+                )
+
             return self.w8a8_block_fp8_linear(
                 input=x,
                 weight=layer.weight,
@@ -746,6 +765,13 @@ class Fp8MoEMethod:
                 layer.w2_weight.data = shuffle_weight(
                     layer.w2_weight.contiguous(), (16, 16)
                 )
+
+            if _is_cpu:
+                assert (
+                    _is_cpu_amx_available
+                ), "Fp8MoEMethod on CPU requires that CPU has AMX support"
+                _amx_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+
             return
 
         # If checkpoint is fp16 or bfloat16, quantize in place.
@@ -971,6 +997,24 @@ class Fp8MoEMethod:
             routed_scaling_factor=routed_scaling_factor,
         )
 
+        if use_intel_amx_backend(layer):
+            return torch.ops.sgl_kernel.fused_experts_cpu(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                False,  # inplace See [Note] inplace should be False in fused_experts.
+                False,  # use_int8_w8a8
+                True,  # use_fp8_w8a16
+                layer.w13_weight_scale_inv,  # w1_scale
+                layer.w2_weight_scale_inv,  # w2_scale
+                self.quant_config.weight_block_size,  # block_size
+                None,  # a1_scale
+                None,  # a2_scale
+                True,  # is_vnni
+            )
+
         if _is_hip:
             ret = self.maybe_apply_hip_fused_experts(
                 layer,
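For orientation, both the new CPU AMX path (fp8_scaled_mm_cpu / fused_experts_cpu) and the existing w8a8_block_fp8_linear operate on block-quantized FP8 weights: the weight is stored in FP8 with one scale per weight block (weight_scale_inv) and dequantized block-wise inside the fused kernel. A rough, unfused PyTorch reference of that linear is below; the shapes, the multiply-by-scale convention, and exact divisibility by the block size are assumptions for illustration.

import torch

def block_fp8_linear_ref(x, weight_fp8, weight_scale_inv, block=(128, 128), bias=None):
    # weight_fp8: (N, K) in an FP8 dtype; weight_scale_inv: (N // bn, K // bk)
    n, k = weight_fp8.shape
    bn, bk = block
    w = weight_fp8.float().view(n // bn, bn, k // bk, bk)
    s = weight_scale_inv.float().view(n // bn, 1, k // bk, 1)
    w = (w * s).reshape(n, k)  # per-block dequantization
    out = x.float() @ w.t()
    return out if bias is None else out + bias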
sglang/srt/layers/quantization/fp8_kernel.py
CHANGED

@@ -23,9 +23,9 @@ import torch
 import triton
 import triton.language as tl
 
-from sglang.math_utils import align
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.utils import (
+    align,
     direct_register_custom_op,
     get_device_core_count,
     get_device_name,
sglang/srt/layers/quantization/fp8_utils.py
CHANGED

@@ -1,9 +1,7 @@
 from typing import Callable, List, Optional, Tuple
 
-import einops
 import torch
 
-from sglang.math_utils import align
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
 from sglang.srt.layers.utils import is_sm100_supported
@@ -27,6 +25,7 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     w8a8_block_fp8_matmul_triton,
 )
 from sglang.srt.utils import (
+    align,
     get_bool_env_var,
     get_cuda_version,
     get_device_capability,
@@ -42,7 +41,10 @@ _is_fp8_fnuz = is_fp8_fnuz()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _use_aiter:
+    import aiter
+    from aiter import gemm_a8w8_blockscale_CK, get_hip_quant
+
+    aiter_per1x128_quant = get_hip_quant(aiter.QuantType.per_1x128)
 
 if _is_cuda:
     from sgl_kernel import fp8_blockwise_scaled_mm, fp8_scaled_mm
@@ -271,9 +273,7 @@ def aiter_w8a8_block_fp8_linear(
     input_2d = input.view(-1, input.shape[-1])
     output_shape = [*input.shape[:-1], weight.shape[0]]
 
-    q_input, x_scale =
-        input_2d, block_size[1], column_major_scales=False
-    )
+    q_input, x_scale = aiter_per1x128_quant(input_2d, quant_dtype=aiter.dtypes.fp8)
     output = gemm_a8w8_blockscale_CK(
         q_input, weight, x_scale, weight_scale, dtype=input.dtype
     )
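The aiter path above quantizes activations per token in groups of 128 columns (the per_1x128 scheme) before the block-scaled GEMM. A rough reference of that quantization step; the FP8 E4M3 range and the returned-scale convention are assumptions for illustration, not the aiter kernel itself.

import torch

def per_1x128_quant_ref(x: torch.Tensor, group: int = 128, fp8_max: float = 448.0):
    bs, hidden = x.shape
    assert hidden % group == 0
    xg = x.float().view(bs, hidden // group, group)
    amax = xg.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    scale = amax / fp8_max  # one scale per (token, 128-column group)
    q = (xg / scale).clamp(-fp8_max, fp8_max).reshape(bs, hidden)
    return q.to(torch.float8_e4m3fn), scale.squeeze(-1)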
sglang/srt/layers/quantization/gptq.py
CHANGED

@@ -344,6 +344,10 @@ class GPTQMarlinConfig(QuantizationConfig):
         if (num_bits, sym) not in cls.TYPE_MAP:
             return False
 
+        assert (
+            VLLM_AVAILABLE
+        ), "vllm is not installed, to use gptq_marlin, please install vllm"
+
         return check_marlin_supported(
             quant_type=cls.TYPE_MAP[(num_bits, sym)], group_size=group_size
         )
@@ -726,6 +730,6 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             g_idx2=layer.w2_g_idx,
             sort_indices1=layer.w13_g_idx_sort_indices,
             sort_indices2=layer.w2_g_idx_sort_indices,
+            quant_type_id=self.quant_config.quant_type.id,
             is_k_full=self.is_k_full,
         ).to(orig_dtype)
sglang/srt/layers/quantization/moe_wna16.py
CHANGED

@@ -131,7 +131,7 @@ class MoeWNA16Config(QuantizationConfig):
         capability_tuple = get_device_capability()
         device_capability = (
             -1
-            if
+            if all(capability is None for capability in capability_tuple)
             else capability_tuple[0] * 10 + capability_tuple[1]
         )
         # Avoid circular import
|