sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registries. The information is provided for informational purposes only.
- sglang/bench_one_batch_server.py +17 -2
- sglang/bench_serving.py +170 -24
- sglang/srt/configs/internvl.py +4 -2
- sglang/srt/configs/janus_pro.py +1 -1
- sglang/srt/configs/model_config.py +60 -1
- sglang/srt/configs/update_config.py +119 -0
- sglang/srt/conversation.py +69 -1
- sglang/srt/disaggregation/decode.py +21 -5
- sglang/srt/disaggregation/mooncake/conn.py +35 -4
- sglang/srt/disaggregation/nixl/conn.py +6 -6
- sglang/srt/disaggregation/prefill.py +2 -2
- sglang/srt/disaggregation/utils.py +1 -1
- sglang/srt/distributed/parallel_state.py +44 -17
- sglang/srt/entrypoints/EngineBase.py +8 -0
- sglang/srt/entrypoints/engine.py +40 -6
- sglang/srt/entrypoints/http_server.py +111 -24
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +4 -2
- sglang/srt/eplb/__init__.py +0 -0
- sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
- sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
- sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
- sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
- sglang/srt/{managers → eplb}/expert_location.py +1 -1
- sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
- sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/activation.py +2 -2
- sglang/srt/layers/amx_utils.py +86 -0
- sglang/srt/layers/attention/ascend_backend.py +219 -0
- sglang/srt/layers/attention/flashattention_backend.py +32 -9
- sglang/srt/layers/attention/tbo_backend.py +37 -9
- sglang/srt/layers/communicator.py +20 -2
- sglang/srt/layers/dp_attention.py +9 -3
- sglang/srt/layers/elementwise.py +76 -12
- sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
- sglang/srt/layers/layernorm.py +26 -0
- sglang/srt/layers/linear.py +84 -14
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
- sglang/srt/layers/moe/ep_moe/layer.py +176 -15
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/moe/router.py +60 -22
- sglang/srt/layers/moe/topk.py +10 -28
- sglang/srt/layers/parameter.py +67 -7
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
- sglang/srt/layers/quantization/fp8.py +72 -7
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -2
- sglang/srt/layers/quantization/gptq.py +5 -1
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -1
- sglang/srt/layers/quantization/quant_utils.py +166 -0
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/quantization/w8a8_int8.py +52 -1
- sglang/srt/layers/rotary_embedding.py +2 -2
- sglang/srt/layers/vocab_parallel_embedding.py +20 -10
- sglang/srt/lora/lora.py +4 -5
- sglang/srt/lora/lora_manager.py +73 -20
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/configure_logging.py +1 -1
- sglang/srt/managers/io_struct.py +58 -14
- sglang/srt/managers/mm_utils.py +77 -61
- sglang/srt/managers/multimodal_processor.py +2 -6
- sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
- sglang/srt/managers/schedule_batch.py +78 -85
- sglang/srt/managers/scheduler.py +130 -64
- sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
- sglang/srt/managers/session_controller.py +12 -3
- sglang/srt/managers/tokenizer_manager.py +314 -103
- sglang/srt/managers/tp_worker.py +13 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
- sglang/srt/mem_cache/allocator.py +290 -0
- sglang/srt/mem_cache/chunk_cache.py +34 -2
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +402 -66
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/multimodal_cache.py +3 -0
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/model_executor/cuda_graph_runner.py +2 -1
- sglang/srt/model_executor/forward_batch_info.py +17 -4
- sglang/srt/model_executor/model_runner.py +297 -56
- sglang/srt/model_loader/loader.py +41 -0
- sglang/srt/model_loader/weight_utils.py +72 -4
- sglang/srt/models/deepseek_nextn.py +1 -3
- sglang/srt/models/deepseek_v2.py +195 -45
- sglang/srt/models/deepseek_vl2.py +3 -5
- sglang/srt/models/gemma3_causal.py +1 -2
- sglang/srt/models/gemma3n_causal.py +4 -3
- sglang/srt/models/gemma3n_mm.py +4 -20
- sglang/srt/models/hunyuan.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -2
- sglang/srt/models/llama.py +10 -4
- sglang/srt/models/llama4.py +32 -45
- sglang/srt/models/llama_eagle3.py +61 -11
- sglang/srt/models/llava.py +5 -5
- sglang/srt/models/minicpmo.py +2 -2
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mllama4.py +402 -89
- sglang/srt/models/phi4mm.py +1 -3
- sglang/srt/models/pixtral.py +3 -7
- sglang/srt/models/qwen2.py +31 -3
- sglang/srt/models/qwen2_5_vl.py +1 -3
- sglang/srt/models/qwen2_audio.py +200 -0
- sglang/srt/models/qwen2_moe.py +32 -6
- sglang/srt/models/qwen2_vl.py +1 -4
- sglang/srt/models/qwen3.py +94 -25
- sglang/srt/models/qwen3_moe.py +68 -21
- sglang/srt/models/vila.py +3 -8
- sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
- sglang/srt/operations_strategy.py +6 -2
- sglang/srt/reasoning_parser.py +26 -0
- sglang/srt/sampling/sampling_batch_info.py +39 -1
- sglang/srt/server_args.py +84 -22
- sglang/srt/speculative/build_eagle_tree.py +57 -18
- sglang/srt/speculative/eagle_worker.py +6 -4
- sglang/srt/two_batch_overlap.py +203 -27
- sglang/srt/utils.py +343 -163
- sglang/srt/warmup.py +12 -3
- sglang/test/runners.py +10 -1
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/test/test_utils.py +15 -3
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
- sglang/math_utils.py +0 -8
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
- /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/w4afp8.py
ADDED
@@ -0,0 +1,264 @@
+import logging
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod
+from sglang.srt.layers.quantization.utils import is_layer_skipped
+from sglang.srt.utils import set_weight_attrs
+
+ACTIVATION_SCHEMES = ["static", "dynamic"]
+
+logger = logging.getLogger(__name__)
+
+
+class W4AFp8Config(QuantizationConfig):
+    """Config class for MIXED_PRECISION W4AFp8."""
+
+    def __init__(
+        self,
+        is_checkpoint_fp8_serialized: bool = True,
+        is_checkpoint_w4afp8_serialized: bool = True,
+        linear_activation_scheme: str = "dynamic",
+        moe_activation_scheme: str = "static",
+        ignored_layers: Optional[List[str]] = None,
+        weight_block_size: Optional[List[int]] = None,
+        group_size: int = 128,
+    ) -> None:
+        super().__init__()
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+        self.is_checkpoint_w4afp8_serialized = is_checkpoint_w4afp8_serialized
+        if is_checkpoint_w4afp8_serialized:
+            logger.warning("Detected w4afp8 checkpoint. Please note that")
+        if moe_activation_scheme not in ACTIVATION_SCHEMES:
+            raise ValueError(f"Unsupported activation scheme {moe_activation_scheme}")
+        self.linear_activation_scheme = linear_activation_scheme
+        self.moe_activation_scheme = moe_activation_scheme
+        self.ignored_layers = ignored_layers or []
+        self.weight_block_size = [128, 128]
+        self.group_size = group_size
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "w4afp8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.float8_e4m3fn]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 90
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "W4AFp8Config":
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_fp8_serialized = "fp8" in quant_method
+        is_checkpoint_w4afp8_serialized = "w4afp8" in quant_method
+        linear_activation_scheme = "dynamic"
+        moe_activation_scheme = "static"
+        weight_block_size = [128, 128]
+        return cls(
+            is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
+            is_checkpoint_w4afp8_serialized=is_checkpoint_w4afp8_serialized,
+            linear_activation_scheme=linear_activation_scheme,
+            moe_activation_scheme=moe_activation_scheme,
+            weight_block_size=weight_block_size,
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.ignored_layers):
+                return UnquantizedLinearMethod()
+            return Fp8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return W4AFp8MoEMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class W4AFp8MoEMethod:
+
+    def __init__(self, quant_config: W4AFp8Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: Module,
+        num_experts_per_partition: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        assert "weight_loader" in extra_weight_attrs
+
+        # Fused gate_up_proj (column parallel)
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts_per_partition,
+                intermediate_size * 2,
+                hidden_size // 2,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts_per_partition,
+                hidden_size,
+                intermediate_size // 2,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        w13_weight_scale = torch.nn.Parameter(
+            torch.zeros(
+                num_experts_per_partition,
+                2 * intermediate_size,
+                hidden_size // self.quant_config.group_size,
+                dtype=torch.float32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+        w2_weight_scale = torch.nn.Parameter(
+            torch.zeros(
+                num_experts_per_partition,
+                hidden_size,
+                intermediate_size // self.quant_config.group_size,
+                dtype=torch.float32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # Input scales
+        w13_input_scale = torch.nn.Parameter(
+            torch.ones((num_experts_per_partition, 2), dtype=torch.bfloat16),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+        set_weight_attrs(w13_input_scale, extra_weight_attrs)
+
+        w2_input_scale = torch.nn.Parameter(
+            torch.ones(num_experts_per_partition, dtype=torch.bfloat16),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+        set_weight_attrs(w2_input_scale, extra_weight_attrs)
+
+        # Pre-populate the strides
+        device = layer.w13_weight.device
+
+        self.a_strides1 = torch.full(
+            (num_experts_per_partition, 3),
+            hidden_size,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.c_strides1 = torch.full(
+            (num_experts_per_partition, 3),
+            2 * intermediate_size,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.a_strides2 = torch.full(
+            (num_experts_per_partition, 3),
+            intermediate_size,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.c_strides2 = torch.full(
+            (num_experts_per_partition, 3),
+            hidden_size,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.b_strides1 = self.a_strides1
+        self.s_strides13 = self.c_strides1
+        self.b_strides2 = self.a_strides2
+        self.s_strides2 = self.c_strides2
+
+        self.expert_offsets = torch.empty(
+            (num_experts_per_partition + 1), dtype=torch.int32, device=device
+        )
+        self.problem_sizes1 = torch.empty(
+            (num_experts_per_partition, 3), dtype=torch.int32, device=device
+        )
+        self.problem_sizes2 = torch.empty(
+            (num_experts_per_partition, 3), dtype=torch.int32, device=device
+        )
+
+        return
+
+    def _interleave_scales(self, scales: torch.Tensor) -> torch.Tensor:
+        """Interleave scales in groups of 4 similar to TRT-LLM implementation."""
+        s_shape = scales.shape
+        # Reshape to separate groups of 4
+        scales_interleaved = scales.reshape(
+            s_shape[0], s_shape[1], (s_shape[2] // 4), 4
+        )
+        # Permute dimensions to interleave
+        scales_interleaved = scales_interleaved.permute(0, 2, 1, 3)
+        # Reshape back to original dimensions but with interleaved values
+        scales_interleaved = scales_interleaved.reshape(
+            s_shape[0], s_shape[2] // 4, s_shape[1] * 4
+        )
+        return scales_interleaved.contiguous()
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        dtype = torch.bfloat16
+        device = layer.w2_weight.device
+
+        # Interleave w13_weight_scale (gate_up_proj)
+        w13_weight_scale = layer.w13_weight_scale_inv.to(dtype)
+        w13_weight_scale = self._interleave_scales(w13_weight_scale)
+        layer.w13_weight_scale_inv = Parameter(w13_weight_scale, requires_grad=False)
+
+        # Interleave w2_weight_scale (down_proj)
+        w2_weight_scale = layer.w2_weight_scale_inv.to(dtype)
+        w2_weight_scale = self._interleave_scales(w2_weight_scale)
+        layer.w2_weight_scale_inv = Parameter(w2_weight_scale, requires_grad=False)
+
+        # Process input scales
+        w13_input_scale_max = layer.w13_input_scale.max().to(dtype).item()
+        new_w13_input_scale = torch.tensor(
+            [w13_input_scale_max],
+            dtype=dtype,
+            device=device,
+        )
+        layer.w13_input_scale = Parameter(new_w13_input_scale, requires_grad=False)
+
+        w2_input_scale_max = layer.w2_input_scale.max().to(dtype).item()
+        new_w2_input_scale = torch.tensor(
+            [w2_input_scale_max], dtype=dtype, device=device
+        )
+        layer.w2_input_scale = Parameter(new_w2_input_scale, requires_grad=False)
sglang/srt/layers/quantization/w8a8_int8.py
CHANGED
@@ -4,6 +4,7 @@ import torch
 from torch.nn.parameter import Parameter
 
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
 from sglang.srt.layers.linear import LinearMethodBase
 from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
 from sglang.srt.layers.quantization.base_config import (
@@ -11,9 +12,17 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizeMethodBase,
 )
 from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    is_cpu,
+    is_cuda,
+    set_weight_attrs,
+    use_intel_amx_backend,
+)
 
 _is_cuda = is_cuda()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
 if _is_cuda:
     from sgl_kernel import int8_scaled_mm
 
@@ -72,6 +81,13 @@ class W8A8Int8LinearMethod(LinearMethodBase):
         self.quantization_config = quantization_config
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _is_cpu:
+            assert (
+                _is_cpu_amx_available
+            ), "W8A8Int8LinearMethod on CPU requires that CPU has AMX support"
+            _amx_process_weight_after_loading(layer, ["weight"])
+            return
+
         layer.weight = Parameter(layer.weight.t(), requires_grad=False)
         layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False)
 
@@ -112,6 +128,16 @@ class W8A8Int8LinearMethod(LinearMethodBase):
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ):
+        if use_intel_amx_backend(layer):
+            return torch.ops.sgl_kernel.int8_scaled_mm_with_quant(
+                x,
+                layer.weight,
+                layer.weight_scale,
+                bias,
+                x.dtype,
+                True,  # is_vnni
+            )
+
         x_q, x_scale = per_token_quant_int8(x)
 
         return int8_scaled_mm(
@@ -206,6 +232,13 @@ class W8A8Int8MoEMethod:
         layer.register_parameter("w2_input_scale", w2_input_scale)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _is_cpu:
+            assert (
+                _is_cpu_amx_available
+            ), "W8A8Int8MoEMethod on CPU requires that CPU has AMX support"
+            _amx_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+            return
+
         layer.w13_weight = Parameter(layer.w13_weight, requires_grad=False)
         layer.w2_weight = Parameter(layer.w2_weight, requires_grad=False)
         layer.w13_weight_scale = Parameter(
@@ -252,6 +285,24 @@ class W8A8Int8MoEMethod:
             routed_scaling_factor=routed_scaling_factor,
         )
 
+        if use_intel_amx_backend(layer):
+            return torch.ops.sgl_kernel.fused_experts_cpu(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                False,  # inplace See [Note] inplace should be False in fused_experts.
+                True,  # use_int8_w8a8
+                False,  # use_fp8_w8a16
+                layer.w13_weight_scale,  # w1_scale
+                layer.w2_weight_scale,  # w2_scale
+                None,  # block_size
+                layer.w13_input_scale,  # a1_scale
+                layer.w2_input_scale,  # a2_scale
+                True,  # is_vnni
+            )
+
         return fused_experts(
             x,
             layer.w13_weight,
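On the non-AMX path above, `apply` still quantizes activations with `per_token_quant_int8` before calling `int8_scaled_mm`. The fused kernel itself is not shown in this diff; the following is only a rough PyTorch sketch of the usual symmetric per-token int8 quantization such a helper computes, not sglang's actual implementation:

```python
import torch


def per_token_quant_int8_ref(x: torch.Tensor):
    # Reference-only sketch: one scale per row (token), symmetric int8 range.
    amax = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
    scale = amax / 127.0
    q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
    return q, scale


x = torch.randn(4, 16)
q, s = per_token_quant_int8_ref(x)
print(q.dtype, s.shape)  # torch.int8 torch.Size([4, 1])
```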
sglang/srt/layers/rotary_embedding.py
CHANGED
@@ -660,7 +660,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         beta_slow: int = 1,
         mscale: float = 1,
         mscale_all_dim: float = 0,
-        device: Optional[str] = "cuda",
+        device: Optional[str] = "cuda" if not _is_npu else "npu",
     ) -> None:
         self.scaling_factor = scaling_factor
         self.extrapolation_factor = extrapolation_factor
@@ -679,7 +679,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         )
 
         # Re-dispatch
-        if _is_hip:
+        if _is_hip or _is_npu:
             self._forward_method = self.forward_native
 
     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
sglang/srt/layers/vocab_parallel_embedding.py
CHANGED
@@ -1,5 +1,6 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/model_executor/layers/vocab_parallel_embedding.py
 
+import logging
 from dataclasses import dataclass
 from typing import List, Optional, Sequence, Tuple
 
@@ -13,6 +14,7 @@ from sglang.srt.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
+from sglang.srt.layers.amx_utils import PackWeightMethod
 from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
 from sglang.srt.layers.parameter import BasevLLMParameter
 from sglang.srt.layers.quantization.base_config import (
@@ -20,18 +22,15 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizeMethodBase,
     method_has_implemented_embedding,
 )
-from sglang.srt.utils import (
-    PackWeightMethod,
-    cpu_has_amx_support,
-    is_cpu,
-    set_weight_attrs,
-)
+from sglang.srt.utils import cpu_has_amx_support, is_cpu, set_weight_attrs
 
 DEFAULT_VOCAB_PADDING_SIZE = 64
 
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
 
+logger = logging.getLogger(__name__)
+
 
 class UnquantizedEmbeddingMethod(QuantizeMethodBase):
     """Unquantized method for embeddings."""
@@ -250,8 +249,16 @@ class VocabParallelEmbedding(torch.nn.Module):
             self.tp_size = 1
 
         self.num_embeddings = num_embeddings
-        self.padding_size = padding_size
         self.org_vocab_size = org_num_embeddings or num_embeddings
+
+        # Support the case where the vocab size is not divisible by the TP size.
+        if (
+            _is_cpu
+            and pad_vocab_size(self.org_vocab_size, padding_size) % self.tp_size != 0
+        ):
+            padding_size *= self.tp_size
+        self.padding_size = padding_size
+
         num_added_embeddings = num_embeddings - self.org_vocab_size
         self.use_presharded_weights = use_presharded_weights
         if use_presharded_weights:
@@ -558,9 +565,12 @@ class ParallelLMHead(VocabParallelEmbedding):
         )
         self.quant_config = quant_config
 
-        # We only support pack LMHead if it's not quantized.
-        if
-        self
+        # We only support pack LMHead if it's not quantized.
+        if _is_cpu and _is_cpu_amx_available:
+            if hasattr(self, "weight") and self.weight.dtype == torch.bfloat16:
+                self.quant_method = PackWeightMethod(weight_names=["weight"])
+            else:
+                logger.warning("The weight of LmHead is not packed")
 
         if bias:
             self.bias = Parameter(
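The new CPU branch in `VocabParallelEmbedding.__init__` enlarges `padding_size` so the padded vocabulary divides evenly across tensor-parallel ranks. A small sketch of that arithmetic, with `pad_vocab_size` written out as the usual round-up-to-a-multiple helper (an assumption for illustration, not copied from sglang) and made-up sizes:

```python
def pad_vocab_size(vocab_size: int, pad_to: int) -> int:
    # Round the vocabulary size up to a multiple of pad_to.
    return ((vocab_size + pad_to - 1) // pad_to) * pad_to


org_vocab_size, padding_size, tp_size = 151_936, 64, 6
if pad_vocab_size(org_vocab_size, padding_size) % tp_size != 0:
    # Same fix as the CPU branch: grow the padding unit by tp_size so the
    # padded vocab splits evenly across tensor-parallel ranks.
    padding_size *= tp_size
print(pad_vocab_size(org_vocab_size, padding_size))  # 152064, divisible by 6
```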
sglang/srt/lora/lora.py
CHANGED
@@ -65,7 +65,7 @@ class LoRAAdapter(nn.Module):
         self.layers: List[LoRALayer] = nn.ModuleList(
             [
                 LoRALayer(config, base_hf_config)
-                for
+                for _ in range(base_hf_config.num_hidden_layers)
             ]
         )
 
@@ -88,10 +88,9 @@ class LoRAAdapter(nn.Module):
             else:
                 self.weights[name] = loaded_weight.cpu()
 
-        #
-        for
-
-            weight_names = [name for name, _ in layer.weights.items()]
+        # normalize kv_proj and gate_up_proj
+        for layer in self.layers:
+            weight_names = list(layer.weights.keys())
             self.normalize_qkv_proj(weight_names, layer.weights)
             self.normalize_gate_up_proj(weight_names, layer.weights)
 
sglang/srt/lora/lora_manager.py
CHANGED
@@ -35,6 +35,7 @@ from sglang.srt.lora.utils import (
     get_normalized_lora_weight_names,
     get_weight_name,
 )
+from sglang.srt.managers.io_struct import LoRAUpdateResult
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.utils import replace_submodule
 
@@ -98,44 +99,96 @@ class LoRAManager:
             ],
         )
 
-    def
+    def create_lora_update_result(
+        self, success: bool, error_message: str = ""
+    ) -> LoRAUpdateResult:
+        return LoRAUpdateResult(
+            success=success,
+            error_message=error_message,
+            loaded_adapters={
+                name: config.path for name, config in self.configs.items()
+            },
+        )
+
+    def load_lora_adapters(self, lora_paths: Dict[str, str]) -> LoRAUpdateResult:
         """
         Load LoRA adapters from the specified paths.
-        TODO (lifuhuang): This method should be exposed to the server/engine API to support dynamic LoRA loading.
 
         Args:
             lora_paths (Dict[str, str]): A dictionary mapping LoRA adapter names to their file paths.
                 If a LoRA adapter is already loaded, it will be skipped with a warning.
         """
 
+        results = []
         for lora_name, lora_path in lora_paths.items():
-
-
-
-
-
-
+            result = self.load_lora_adapter(lora_name, lora_path, update_state=False)
+            results.append(result)
+
+        self.update_state_from_configs()
+
+        return self.create_lora_update_result(
+            success=all(result.success for result in results),
+            error_message="\n".join(
+                result.error_message for result in results if not result.success
+            ),
+        )
+
+    def load_lora_adapter(
+        self, lora_name: str, lora_path: str, update_state: bool = True
+    ) -> LoRAUpdateResult:
+        """
+        Load a single LoRA adapter from the specified path.
+
+        Args:
+            lora_name (str): The name of the LoRA adapter.
+            lora_path (str): The file path to the LoRA adapter.
+            update_state (bool): Whether to refresh the internal state after loading the adapter. This is useful for batch loading.
+        """
 
+        success = True
+        error_message = ""
+
+        if lora_name in self.loras:
+            success = False
+            error_message = f"LoRA adapter {lora_name} is skipped as it is already loaded. If you want to reload it, please unload it first."
+
+        try:
             self.configs[lora_name] = LoRAConfig(lora_path)
+        except Exception as e:
+            success = False
+            error_message = (
+                f"Failed to load LoRA adapter {lora_name} from {lora_path}: {str(e)}"
+            )
 
-
+        if update_state:
+            self.update_state_from_configs()
+
+        return self.create_lora_update_result(
+            success=success,
+            error_message=error_message,
+        )
 
-    def
+    def unload_lora_adapter(self, lora_name: str) -> LoRAUpdateResult:
         """
         Unload LoRA adapters by their names. This will remove the adapters from the memory pool and
         delete the corresponding LoRA modules.
-
-        Args:
-            lora_names (Set[str]): A set of LoRA adapter names to unload.
         """
-
-
-
-
-
+
+        success = True
+        error_message = ""
+        if lora_name in self.loras:
+            del self.configs[lora_name]
+        else:
+            error_message = f"LoRA adapter {lora_name} is not loaded."
+            success = False
 
         self.update_state_from_configs()
 
+        return self.create_lora_update_result(
+            success=success,
+            error_message=error_message,
+        )
+
     def prepare_lora_batch(self, forward_batch: ForwardBatch):
         # load active loras into lora memory pool
         cur_uids = set(forward_batch.lora_paths)
@@ -372,8 +425,8 @@ class LoRAManager:
             lora_adapter.initialize_weights()
             self.loras[name] = lora_adapter
 
-        # Clean up unused LoRA adapters
-        for name in self.loras:
+        # Clean up unused LoRA adapters, copying the list to avoid modifying the dict during iteration.
+        for name in list(self.loras):
             if name not in self.configs:
                 logger.info(f"Unloading LoRA adapter {name}")
                 del self.loras[name]