sglang 0.4.4.post3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/bench_serving.py +49 -7
  2. sglang/lang/chat_template.py +24 -0
  3. sglang/srt/_custom_ops.py +59 -92
  4. sglang/srt/configs/model_config.py +5 -0
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/conversation.py +29 -4
  7. sglang/srt/custom_op.py +5 -0
  8. sglang/srt/distributed/device_communicators/custom_all_reduce.py +27 -79
  9. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/layers/attention/flashattention_backend.py +678 -83
  12. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  13. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  14. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  15. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  16. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  17. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  18. sglang/srt/layers/moe/fused_moe_native.py +5 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +416 -50
  30. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  31. sglang/srt/layers/moe/topk.py +49 -3
  32. sglang/srt/layers/quantization/__init__.py +5 -1
  33. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  34. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  35. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  36. sglang/srt/layers/quantization/fp8.py +3 -1
  37. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  38. sglang/srt/layers/quantization/moe_wna16.py +503 -0
  39. sglang/srt/layers/quantization/utils.py +1 -1
  40. sglang/srt/layers/quantization/w8a8_int8.py +2 -0
  41. sglang/srt/layers/radix_attention.py +2 -0
  42. sglang/srt/layers/rotary_embedding.py +63 -12
  43. sglang/srt/managers/cache_controller.py +34 -11
  44. sglang/srt/managers/mm_utils.py +202 -156
  45. sglang/srt/managers/multimodal_processor.py +0 -2
  46. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  47. sglang/srt/managers/multimodal_processors/clip.py +7 -26
  48. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  49. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  50. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  51. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  52. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  53. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  54. sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
  55. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  56. sglang/srt/managers/schedule_batch.py +185 -128
  57. sglang/srt/managers/scheduler.py +4 -4
  58. sglang/srt/managers/tokenizer_manager.py +1 -1
  59. sglang/srt/managers/utils.py +1 -6
  60. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  61. sglang/srt/mem_cache/memory_pool.py +72 -6
  62. sglang/srt/mem_cache/paged_allocator.py +39 -0
  63. sglang/srt/metrics/collector.py +23 -53
  64. sglang/srt/model_executor/cuda_graph_runner.py +8 -6
  65. sglang/srt/model_executor/forward_batch_info.py +10 -10
  66. sglang/srt/model_executor/model_runner.py +60 -57
  67. sglang/srt/model_loader/loader.py +8 -0
  68. sglang/srt/models/clip.py +12 -7
  69. sglang/srt/models/deepseek_janus_pro.py +10 -15
  70. sglang/srt/models/deepseek_v2.py +212 -121
  71. sglang/srt/models/deepseek_vl2.py +105 -104
  72. sglang/srt/models/gemma3_mm.py +14 -80
  73. sglang/srt/models/llama.py +16 -5
  74. sglang/srt/models/llama4.py +420 -0
  75. sglang/srt/models/llava.py +31 -19
  76. sglang/srt/models/llavavid.py +16 -7
  77. sglang/srt/models/minicpmo.py +63 -147
  78. sglang/srt/models/minicpmv.py +17 -27
  79. sglang/srt/models/mllama.py +29 -14
  80. sglang/srt/models/mllama4.py +154 -0
  81. sglang/srt/models/qwen2.py +9 -6
  82. sglang/srt/models/qwen2_5_vl.py +21 -31
  83. sglang/srt/models/qwen2_vl.py +20 -21
  84. sglang/srt/openai_api/adapter.py +18 -6
  85. sglang/srt/platforms/interface.py +371 -0
  86. sglang/srt/server_args.py +99 -14
  87. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  88. sglang/srt/speculative/eagle_utils.py +140 -28
  89. sglang/srt/speculative/eagle_worker.py +93 -24
  90. sglang/srt/utils.py +104 -51
  91. sglang/test/test_custom_ops.py +55 -0
  92. sglang/test/test_utils.py +13 -26
  93. sglang/utils.py +2 -2
  94. sglang/version.py +1 -1
  95. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/METADATA +4 -3
  96. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/RECORD +99 -84
  97. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,503 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/moe_wna16.py
+
+import logging
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.distributed.parallel_state import get_tp_group
+from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
+from sglang.srt.layers.quantization.awq import AWQConfig
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
+from sglang.srt.utils import get_device_capability, set_weight_attrs
+
+logger = logging.getLogger(__name__)
+
+
+class MoeWNA16Config(QuantizationConfig):
+    """Config class for MOE WNA16 (W8A16/W4A16) quantization."""
+
+    def __init__(
+        self,
+        linear_quant_method: str,
+        weight_bits: int,
+        group_size: int,
+        has_zp: bool,
+        lm_head_quantized: bool,
+        modules_to_not_convert: Optional[List[str]],
+        full_config: Dict[str, Any],
+    ) -> None:
+        super().__init__()
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.has_zp = has_zp
+        self.bit8_pack_factor = 8 // self.weight_bits
+        self.lm_head_quantized = lm_head_quantized
+        self.linear_quant_method = linear_quant_method
+        self.full_config = full_config
+        self.use_marlin = False
+        # Avoid circular import
+
+        if self.linear_quant_method == "gptq":
+            self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config)
+        elif self.linear_quant_method == "awq":
+            capability_tuple = get_device_capability()
+            device_capability = (
+                -1
+                if capability_tuple is None
+                else capability_tuple[0] * 10 + capability_tuple[1]
+            )
+            awq_min_capability = AWQConfig.get_min_capability()
+            if device_capability < awq_min_capability:
+                raise ValueError(
+                    "The quantization method moe_wna16 + awq is not supported "
+                    "for the current GPU. "
+                    f"Minimum capability: {awq_min_capability}. "
+                    f"Current capability: {device_capability}."
+                )
+        else:
+            raise ValueError("moe_wna16 only support gptq and awq.")
+
+        if modules_to_not_convert is None:
+            self.modules_to_not_convert = []
+        else:
+            self.modules_to_not_convert = modules_to_not_convert
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "moe_wna16"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    def get_scaled_act_names(self) -> List[str]:
+        raise NotImplementedError
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "MoeWNA16Config":
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        if quant_method == "gptq":
+            has_zp = not cls.get_from_keys(config, ["sym"])
+            modules_to_not_convert = []
+        elif quant_method == "awq":
+            has_zp = cls.get_from_keys(config, ["zero_point"])
+            modules_to_not_convert = cls.get_from_keys_or(
+                config, ["modules_to_not_convert"], None
+            )
+        else:
+            raise ValueError("moe_wna16 only support gptq and awq.")
+
+        return cls(
+            quant_method,
+            weight_bits,
+            group_size,
+            has_zp,
+            lm_head_quantized,
+            modules_to_not_convert,
+            config,
+        )
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        can_convert = cls.is_moe_wna16_compatible(hf_quant_cfg)
+        if can_convert and user_quant == "moe_wna16":
+            return cls.get_name()
+        return None
+
+    @classmethod
+    def is_moe_wna16_compatible(cls, quant_config: Dict[str, Any]):
+        # Extract data from quant config.
+        quant_method = quant_config.get("quant_method", "").lower()
+        num_bits = quant_config.get("bits")
+        desc_act = quant_config.get("desc_act")
+
+        capability_tuple = get_device_capability()
+        device_capability = (
+            -1
+            if capability_tuple is None
+            else capability_tuple[0] * 10 + capability_tuple[1]
+        )
+        # Avoid circular import
+        awq_min_capability = AWQConfig.get_min_capability()
+
+        gptq_compatible = quant_method == "gptq" and not desc_act and num_bits in [4, 8]
+        awq_compatible = (
+            quant_method == "awq"
+            and num_bits == 4
+            and device_capability >= awq_min_capability
+        )
+
+        return gptq_compatible or awq_compatible
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        # avoid circular import
+        from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+
+        if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
+            return UnquantizedLinearMethod()
+        elif isinstance(layer, LinearBase):
+
+            if self.linear_quant_method == "gptq":
+                if self.use_marlin:
+                    return GPTQMarlinConfig.from_config(
+                        self.full_config
+                    ).get_quant_method(layer, prefix)
+                else:
+                    return GPTQConfig.from_config(self.full_config).get_quant_method(
+                        layer, prefix
+                    )
+            elif self.linear_quant_method == "awq":
+                return AWQConfig.from_config(self.full_config).get_quant_method(
+                    layer, prefix
+                )
+            else:
+                raise ValueError("moe_wna16 only support gptq and awq.")
+        elif isinstance(layer, FusedMoE):
+            return MoeWNA16Method(self)
+        return None
+
+
+def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]):
+    return any(module_name in prefix for module_name in modules_to_not_convert)
+
+
+class MoeWNA16Method:
+    """Linear method for MOE WNA16 (W8A16/W4A16) quantization.
+
+    Args:
+        quant_config: The MOE WNA16 (W8A16/W4A16) quantization config.
+    """
+
+    def __new__(cls, *args, **kwargs):
+        # avoid circular import
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoEMethodBase
+
+        if not hasattr(cls, "_initialized"):
+            original_init = cls.__init__
+            new_cls = type(
+                cls.__name__,
+                (FusedMoEMethodBase,),
+                {
+                    "__init__": original_init,
+                    **{k: v for k, v in cls.__dict__.items() if k != "__dict__"},
+                },
+            )
+            obj = super(new_cls, new_cls).__new__(new_cls)
+            obj.__init__(*args, **kwargs)
+            return obj
+        return super().__new__(cls)
+
+    def __init__(self, quant_config: MoeWNA16Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        layer.quant_config = self.quant_config
+        bit8_pack_factor = self.quant_config.bit8_pack_factor
+        group_size = self.quant_config.group_size
+        group_size_div_factor = 1
+
+        # make intermediate_size and hidden_size diviable by group_size
+        # we reduce the group size to ensure that
+        # and we would repeat the loaded_weight later
+        while intermediate_size_per_partition % group_size or hidden_size % group_size:
+            group_size = group_size // 2
+            group_size_div_factor *= 2
+            assert group_size >= 32
+        layer.group_size = group_size
+        layer.group_size_div_factor = group_size_div_factor
+
+        strategy = FusedMoeWeightScaleSupported.GROUP.value
+        extra_weight_attrs.update({"quant_method": strategy, "is_transposed": False})
+
+        assert "weight_loader" in extra_weight_attrs
+        weight_loader = extra_weight_attrs["weight_loader"]
+        wrapped_weight_loader = MoeWNA16Method.get_weight_loader(layer, weight_loader)
+        extra_weight_attrs["weight_loader"] = wrapped_weight_loader
+
+        # Fused gate_up_proj (column parallel)
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // bit8_pack_factor,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // bit8_pack_factor,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+
+        w13_scales = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // group_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+
+        w2_scales = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // group_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+
+        if self.quant_config.has_zp:
+            w13_qzeros = torch.nn.Parameter(
+                torch.zeros(
+                    num_experts,
+                    2 * intermediate_size_per_partition // bit8_pack_factor,
+                    hidden_size // group_size,
+                    dtype=torch.uint8,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_qzeros", w13_qzeros)
+            set_weight_attrs(w13_qzeros, extra_weight_attrs)
+
+            w2_qzeros = torch.nn.Parameter(
+                torch.zeros(
+                    num_experts,
+                    hidden_size // bit8_pack_factor,
+                    intermediate_size_per_partition // group_size,
+                    dtype=torch.uint8,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_qzeros", w2_qzeros)
+            set_weight_attrs(w2_qzeros, extra_weight_attrs)
+
+        if self.quant_config.linear_quant_method == "gptq":
+            # some param are unused, but we need to init them in order to
+            # load weights
+            invalid_param_keys = ["w13_g_idx", "w2_g_idx"]
+            if not self.quant_config.has_zp:
+                invalid_param_keys += ["w13_qzeros", "w2_qzeros"]
+            for key in invalid_param_keys:
+                param = torch.nn.Parameter(
+                    torch.empty((0,), dtype=torch.int32), requires_grad=False
+                )
+                layer.register_parameter(key, param)
+                set_weight_attrs(param, extra_weight_attrs)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        inplace: bool = True,
+        no_combine: bool = False,
+    ) -> torch.Tensor:
+        # avoid circular import
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+        from sglang.srt.layers.moe.topk import select_experts
+
+        assert activation == "silu", "Only SiLU activation is supported."
+        topk_weights, topk_ids = select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            top_k=top_k,
+            use_grouped_topk=use_grouped_topk,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            correction_bias=correction_bias,
+        )
+
+        weight_bits = self.quant_config.weight_bits
+        has_zp = self.quant_config.has_zp
+
+        return fused_experts(
+            x,
+            layer.w13_qweight,
+            layer.w2_qweight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            inplace=inplace,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            use_int4_w4a16=weight_bits == 4,
+            use_int8_w8a16=weight_bits == 8,
+            w1_scale=layer.w13_scales,
+            w2_scale=layer.w2_scales,
+            w1_zp=layer.w13_qzeros if has_zp else None,
+            w2_zp=layer.w2_qzeros if has_zp else None,
+            block_shape=[0, layer.group_size],
+            no_combine=no_combine,
+        )
+
+    @staticmethod
+    def get_weight_loader(layer, weight_loader):
+
+        def convert_awq_tensor(tensor, tensor_type):
+            # convert awq qweight/qzeros to a standard format (assume int4)
+            # qweight: (k, n // pack_factor_bit32) -> (n, k // pack_factor_bit8)
+            # qzeros: (k // group_size, n // pack_factor_bit32) ->
+            #     (n // pack_factor_bit8, k // group_size)
+            # pack_factor_bit32 = 32 // weight_bits
+            # pack_factor_bit8 = 8 // weight_bits
+
+            # 0. suppose origin shape (a, b), dtype int32
+            # 1. convert to uint8, shape (a, b) -> (a, 4 * b)
+            size0 = tensor.size(0)
+            tensor = tensor.view(torch.uint8)
+
+            # 2. unpack to uint4 (only when weight_bits == 4)
+            #    shape (a, 4 * b) -> (a, 4 * b, 2)
+            shifter = torch.tensor([0, 4], dtype=torch.uint8, device=tensor.device)
+            tensor = (tensor[:, :, None] >> shifter) & 0xF
+
+            # 3. change order, see
+            #    https://github.com/casper-hansen/AutoAWQ/blob/v0.2.8/awq/utils/quant_utils.py
+            #    shape -> (a, 4 * b * pack_factor_bit8)
+            reverse_awq_pack_order = [0, 4, 1, 5, 2, 6, 3, 7]
+            tensor = tensor.view(-1, 8)[:, reverse_awq_pack_order]
+            tensor = tensor.view(size0, -1)
+
+            # 4. transpose, shape -> (4 * b * pack_factor_bit8, a)
+            tensor = tensor.T.contiguous()
+
+            # 5. repack (only when weight_bits == 4)
+            #    qweight shape -> (4 * b * pack_factor_bit8, a // pack_factor_bit8)
+            #    qzeros shape -> (4 * b, a)
+
+            if tensor_type == "qweight":
+                tensor = tensor[:, 1::2] * 16 + tensor[:, ::2]
+            elif tensor_type == "qzeros":
+                tensor = tensor[1::2, :] * 16 + tensor[::2, :]
+            return tensor
+
+        def convert_gptq_int4_qzeros(tensor):
+            tensor = tensor.view(torch.uint8)
+            shifter = torch.tensor([0, 4], dtype=torch.uint8, device=tensor.device)
+            tensor = (tensor[:, :, None] >> shifter) & 0xF
+            tensor = tensor + 1
+            tensor = tensor[:, :, 0] + tensor[:, :, 1] * 16
+            return tensor
+
+        def moe_wna16_weight_loader(
+            param: torch.nn.Parameter,
+            loaded_weight: torch.Tensor,
+            weight_name: str,
+            shard_id: str,
+            expert_id: int,
+        ):
+            if "g_idx" in weight_name:
+                return
+            if not layer.quant_config.has_zp and "qzeros" in weight_name:
+                return
+
+            device = get_tp_group().device
+            tp_rank = get_tensor_model_parallel_rank()
+            loaded_weight = loaded_weight.to(device)
+            shard_size = layer.intermediate_size_per_partition
+
+            # convert gptq and awq weight to a standard format
+            if layer.quant_config.linear_quant_method == "awq":
+                assert layer.quant_config.weight_bits == 4
+                if "weight" in weight_name:
+                    loaded_weight = convert_awq_tensor(loaded_weight, "qweight")
+                elif "zeros" in weight_name:
+                    loaded_weight = convert_awq_tensor(loaded_weight, "qzeros")
+                else:
+                    loaded_weight = loaded_weight.T
+            elif layer.quant_config.linear_quant_method == "gptq":
+                assert layer.quant_config.weight_bits in [4, 8]
+                if "weight" in weight_name:
+                    loaded_weight = loaded_weight.T.contiguous().view(torch.uint8)
+                elif "zeros" in weight_name:
+                    # add 1 to gptq qzeros to align with awq
+                    loaded_weight = loaded_weight.view(torch.uint8)
+                    if layer.quant_config.weight_bits == 4:
+                        loaded_weight = convert_gptq_int4_qzeros(loaded_weight).T
+                    else:
+                        loaded_weight = loaded_weight.T + 1
+                else:
+                    loaded_weight = loaded_weight.T
+
+            # repeat the qzeros/scales to fit new group size
+            if (
+                layer.group_size_div_factor > 1
+                and "qzeros" in weight_name
+                or "scales" in weight_name
+            ):
+                loaded_weight = loaded_weight.repeat_interleave(
+                    layer.group_size_div_factor, 1
+                )
+
+            if "w13_qzeros" in weight_name:
+                tensor = loaded_weight.view(layer.tp_size, -1, loaded_weight.size(1))[
+                    tp_rank
+                ]
+                if shard_id == "w1":
+                    param.data[expert_id, : shard_size // 2] = tensor
+                else:
+                    param.data[expert_id, shard_size // 2 :] = tensor
+            elif "w2_qzeros" in weight_name:
+                param.data[expert_id] = loaded_weight.view(
+                    loaded_weight.size(0), layer.tp_size, -1
+                )[:, tp_rank]
+            else:
+                weight_loader(param, loaded_weight, weight_name, shard_id, expert_id)
+
+        return moe_wna16_weight_loader
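
Editorial note (not part of the diff): convert_awq_tensor above undoes AutoAWQ's int4 nibble packing before the weights reach the Triton MoE kernels. The short standalone sketch below packs eight 4-bit values into one int32 the way AWQ lays them out and then replays the same view/shift/reorder steps to recover the original order. pack_awq_int4 is a hand-rolled stand-in for AutoAWQ's packer, used only to produce something to unpack, and the byte view assumes a little-endian host.

import torch

REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]  # same table as in the diff

def pack_awq_int4(values: torch.Tensor) -> torch.Tensor:
    # Place logical element j of each group of 8 at nibble position
    # REVERSE_AWQ_PACK_ORDER[j] inside one int32 (AWQ's layout).
    packed = torch.zeros(values.shape[0], values.shape[1] // 8, dtype=torch.int32)
    for j in range(8):
        nibble = values[:, j::8].to(torch.int32)
        packed |= nibble << (4 * REVERSE_AWQ_PACK_ORDER[j])
    return packed

def unpack_awq_int4(qweight: torch.Tensor) -> torch.Tensor:
    # Same steps as convert_awq_tensor, minus the transpose/repack tail.
    size0 = qweight.size(0)
    t = qweight.view(torch.uint8)                 # (a, b) int32 -> (a, 4b) bytes
    shifter = torch.tensor([0, 4], dtype=torch.uint8)
    t = (t[:, :, None] >> shifter) & 0xF          # split each byte into two nibbles
    t = t.view(-1, 8)[:, REVERSE_AWQ_PACK_ORDER]  # undo the AWQ pack order
    return t.view(size0, -1)

vals = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7],
                     [7, 6, 5, 4, 3, 2, 1, 0]], dtype=torch.int32)
packed = pack_awq_int4(vals)
assert torch.equal(unpack_awq_int4(packed).to(torch.int32), vals)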
@@ -1,7 +1,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/quant_utils.py
 
 from types import MappingProxyType
-from typing import List, Mapping, Tuple, Union
+from typing import List, Mapping, Optional, Tuple, Union
 
 import torch
 
@@ -230,6 +230,7 @@ class W8A8Int8MoEMethod:
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
         inplace: bool = True,
         no_combine: bool = False,
     ) -> torch.Tensor:
@@ -257,6 +258,7 @@ class W8A8Int8MoEMethod:
             topk_ids=topk_ids,
             inplace=inplace,
             activation=activation,
+            apply_router_weight_on_input=apply_router_weight_on_input,
             use_int8_w8a8=True,
             w1_scale=(layer.w13_weight_scale),
            w2_scale=(layer.w2_weight_scale),
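
Editorial note (not part of the diff): the apply_router_weight_on_input flag threaded through W8A8Int8MoEMethod.apply above (and the other MoE methods in this release) asks the fused MoE path to scale the expert input by the routing weight instead of scaling the expert output. A tiny illustrative sketch of that distinction, not taken from the package; for purely linear experts the two orderings coincide, which is what the final assert checks.

import torch

def combine_weight_on_output(x, experts, weights):
    # Conventional MoE combine: y = sum_e w_e * f_e(x)
    return sum(w * f(x) for f, w in zip(experts, weights))

def combine_weight_on_input(x, experts, weights):
    # Router weight applied to the expert input instead: y = sum_e f_e(w_e * x)
    return sum(f(w * x) for f, w in zip(experts, weights))

torch.manual_seed(0)
x = torch.randn(4, 8)
experts = [torch.nn.Linear(8, 8, bias=False) for _ in range(2)]
weights = [0.7, 0.3]
out_a = combine_weight_on_output(x, experts, weights)
out_b = combine_weight_on_input(x, experts, weights)
assert torch.allclose(out_a, out_b, atol=1e-5)  # equal here only because the experts are linear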
@@ -35,6 +35,7 @@ class RadixAttention(nn.Module):
         sliding_window_size: int = -1,
         is_cross_attention: bool = False,
         prefix: str = "",
+        use_irope: bool = False,
     ):
         super().__init__()
         self.tp_q_head_num = num_heads
@@ -50,6 +51,7 @@ class RadixAttention(nn.Module):
         self.is_cross_attention = is_cross_attention
         self.k_scale = None
         self.v_scale = None
+        self.use_irope = use_irope
 
     def forward(
         self,
@@ -651,18 +651,6 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         query: torch.Tensor,
         key: torch.Tensor,
         offsets: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        if _is_cuda_available:
-            return self.forward_cuda(positions, query, key, offsets)
-        else:
-            return self.forward_native(positions, query, key, offsets)
-
-    def forward_native(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
         query_rot = query[..., : self.rotary_dim]
@@ -745,6 +733,69 @@ class Llama3RotaryEmbedding(RotaryEmbedding):
         return new_freqs
 
 
+class Llama4VisionRotaryEmbedding(RotaryEmbedding):
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+    ):
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        inv_freqs = super()._compute_inv_freq(base)
+        inv_freqs = inv_freqs[: (self.rotary_dim // 2)]
+        return inv_freqs
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.base)
+
+        # self.max_position_embeddings here is number of image patches
+        # i.e. (image_size // patch_size) ** 2
+        num_patches = self.max_position_embeddings
+        img_idx = torch.arange(num_patches, dtype=torch.int32).reshape(num_patches, 1)
+        img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
+        img_idx[-1, -1] = -2  # set to ID_CLS_TOKEN
+        num_patches_single_dim = int(math.sqrt(num_patches))
+        frequencies_x = img_idx % num_patches_single_dim
+        frequencies_y = img_idx // num_patches_single_dim
+        freqs_x = (
+            (frequencies_x + 1)[..., None] * inv_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs_y = (
+            (frequencies_y + 1)[..., None] * inv_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
+        freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
+        cache = torch.view_as_complex(
+            torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)
+        )
+        return cache
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device)
+        query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
+        key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
+        broadcast_shape = [
+            d if i == 1 or i == (query_.ndim - 1) else 1
+            for i, d in enumerate(query_.shape)
+        ]
+        freqs_ci = self.cos_sin_cache.view(*broadcast_shape)
+        query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
+        key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
+        return query_out.type_as(query), key_out.type_as(key)
+
+
 class MRotaryEmbedding(RotaryEmbedding):
     """Rotary Embedding with Multimodal Sections."""
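
Editorial note (not part of the diff): Llama4VisionRotaryEmbedding.forward applies 2D rotary embeddings by viewing each head dimension as complex pairs and multiplying by precomputed unit phasors (the cos/sin cache stored as a complex tensor). The sketch below is an illustration, not code from the package: rotate_complex mirrors that view_as_complex/view_as_real trick, and rotate_real_pairs is the equivalent explicit cos/sin rotation, so the two agree numerically.

import torch

def rotate_complex(x: torch.Tensor, angles: torch.Tensor) -> torch.Tensor:
    # View (..., d) as (..., d/2) complex pairs and multiply by e^{i * angle}.
    x_c = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    phasor = torch.polar(torch.ones_like(angles), angles)  # cos + i*sin
    return torch.view_as_real(x_c * phasor).flatten(-2).type_as(x)

def rotate_real_pairs(x: torch.Tensor, angles: torch.Tensor) -> torch.Tensor:
    # The same rotation written out on interleaved (even, odd) pairs.
    x1, x2 = x.float()[..., 0::2], x.float()[..., 1::2]
    cos, sin = angles.cos(), angles.sin()
    out = torch.stack([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
    return out.flatten(-2).type_as(x)

q = torch.randn(3, 4, 8)        # (tokens, heads, head_dim)
angles = torch.randn(3, 1, 4)   # one angle per complex pair, broadcast over heads
assert torch.allclose(rotate_complex(q, angles), rotate_real_pairs(q, angles), atol=1e-5)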