sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -1
- sglang/eval/loogle_eval.py +7 -0
- sglang/srt/configs/deepseekvl2.py +11 -2
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +9 -7
- sglang/srt/configs/update_config.py +3 -1
- sglang/srt/conversation.py +1 -0
- sglang/srt/custom_op.py +5 -2
- sglang/srt/disaggregation/decode.py +9 -1
- sglang/srt/disaggregation/mooncake/conn.py +44 -56
- sglang/srt/distributed/parallel_state.py +33 -0
- sglang/srt/entrypoints/engine.py +30 -26
- sglang/srt/entrypoints/openai/serving_chat.py +21 -2
- sglang/srt/eplb/expert_location_dispatch.py +1 -1
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/qwen3_detector.py +150 -0
- sglang/srt/hf_transformers_utils.py +0 -1
- sglang/srt/layers/activation.py +13 -0
- sglang/srt/layers/attention/flashattention_backend.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +40 -1
- sglang/srt/layers/linear.py +13 -102
- sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
- sglang/srt/layers/moe/ep_moe/layer.py +23 -402
- sglang/srt/layers/moe/fused_moe_native.py +7 -47
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +35 -45
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
- sglang/srt/layers/moe/topk.py +187 -12
- sglang/srt/layers/quantization/__init__.py +20 -134
- sglang/srt/layers/quantization/awq.py +578 -11
- sglang/srt/layers/quantization/awq_triton.py +339 -0
- sglang/srt/layers/quantization/base_config.py +85 -10
- sglang/srt/layers/quantization/blockwise_int8.py +17 -55
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +24 -73
- sglang/srt/layers/quantization/fp8.py +273 -62
- sglang/srt/layers/quantization/fp8_kernel.py +210 -46
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gptq.py +501 -143
- sglang/srt/layers/quantization/marlin_utils.py +790 -0
- sglang/srt/layers/quantization/modelopt_quant.py +26 -108
- sglang/srt/layers/quantization/moe_wna16.py +45 -49
- sglang/srt/layers/quantization/petit.py +252 -0
- sglang/srt/layers/quantization/petit_utils.py +104 -0
- sglang/srt/layers/quantization/qoq.py +7 -6
- sglang/srt/layers/quantization/scalar_type.py +352 -0
- sglang/srt/layers/quantization/unquant.py +422 -0
- sglang/srt/layers/quantization/utils.py +343 -3
- sglang/srt/layers/quantization/w4afp8.py +8 -4
- sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
- sglang/srt/layers/quantization/w8a8_int8.py +51 -115
- sglang/srt/layers/vocab_parallel_embedding.py +1 -41
- sglang/srt/lora/lora.py +0 -4
- sglang/srt/lora/lora_manager.py +87 -53
- sglang/srt/lora/mem_pool.py +81 -33
- sglang/srt/lora/utils.py +12 -5
- sglang/srt/managers/cache_controller.py +241 -0
- sglang/srt/managers/io_struct.py +41 -29
- sglang/srt/managers/mm_utils.py +7 -8
- sglang/srt/managers/schedule_batch.py +150 -110
- sglang/srt/managers/schedule_policy.py +68 -27
- sglang/srt/managers/scheduler.py +243 -61
- sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
- sglang/srt/managers/tokenizer_manager.py +11 -3
- sglang/srt/managers/tp_worker.py +14 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/allocator.py +7 -16
- sglang/srt/mem_cache/base_prefix_cache.py +14 -2
- sglang/srt/mem_cache/chunk_cache.py +5 -2
- sglang/srt/mem_cache/hicache_storage.py +152 -0
- sglang/srt/mem_cache/hiradix_cache.py +179 -4
- sglang/srt/mem_cache/memory_pool.py +16 -1
- sglang/srt/mem_cache/memory_pool_host.py +41 -2
- sglang/srt/mem_cache/radix_cache.py +26 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +5 -6
- sglang/srt/model_executor/forward_batch_info.py +14 -1
- sglang/srt/model_executor/model_runner.py +109 -22
- sglang/srt/model_loader/loader.py +7 -1
- sglang/srt/model_loader/utils.py +4 -4
- sglang/srt/models/clip.py +1 -1
- sglang/srt/models/deepseek.py +9 -6
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +191 -171
- sglang/srt/models/deepseek_vl2.py +5 -5
- sglang/srt/models/gemma.py +48 -0
- sglang/srt/models/gemma2.py +52 -0
- sglang/srt/models/gemma3_causal.py +63 -0
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -4
- sglang/srt/models/granitemoe.py +385 -0
- sglang/srt/models/grok.py +9 -3
- sglang/srt/models/hunyuan.py +63 -16
- sglang/srt/models/internvl.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -1
- sglang/srt/models/llama.py +41 -0
- sglang/srt/models/llama4.py +11 -11
- sglang/srt/models/llava.py +2 -2
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +0 -2
- sglang/srt/models/minicpmo.py +3 -7
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mixtral.py +9 -2
- sglang/srt/models/mllama.py +3 -5
- sglang/srt/models/mllama4.py +3 -3
- sglang/srt/models/olmoe.py +8 -5
- sglang/srt/models/persimmon.py +330 -0
- sglang/srt/models/phi.py +321 -0
- sglang/srt/models/phi4mm.py +44 -4
- sglang/srt/models/phi4mm_audio.py +1260 -0
- sglang/srt/models/phi4mm_utils.py +1917 -0
- sglang/srt/models/phimoe.py +9 -3
- sglang/srt/models/qwen.py +37 -0
- sglang/srt/models/qwen2.py +41 -0
- sglang/srt/models/qwen2_5_vl.py +4 -4
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +53 -5
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/qwen3.py +65 -1
- sglang/srt/models/qwen3_moe.py +56 -18
- sglang/srt/models/vila.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +91 -97
- sglang/srt/multimodal/processors/clip.py +21 -19
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
- sglang/srt/multimodal/processors/gemma3.py +13 -17
- sglang/srt/multimodal/processors/gemma3n.py +19 -23
- sglang/srt/multimodal/processors/internvl.py +9 -10
- sglang/srt/multimodal/processors/janus_pro.py +12 -27
- sglang/srt/multimodal/processors/kimi_vl.py +12 -14
- sglang/srt/multimodal/processors/llava.py +4 -2
- sglang/srt/multimodal/processors/minicpm.py +35 -44
- sglang/srt/multimodal/processors/mlama.py +21 -18
- sglang/srt/multimodal/processors/mllama4.py +4 -5
- sglang/srt/multimodal/processors/phi4mm.py +63 -39
- sglang/srt/multimodal/processors/pixtral.py +14 -35
- sglang/srt/multimodal/processors/qwen_audio.py +65 -0
- sglang/srt/multimodal/processors/qwen_vl.py +16 -21
- sglang/srt/multimodal/processors/vila.py +14 -14
- sglang/srt/sampling/sampling_params.py +8 -1
- sglang/srt/server_args.py +393 -230
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +9 -1
- sglang/srt/two_batch_overlap.py +1 -0
- sglang/srt/utils.py +27 -1
- sglang/test/runners.py +14 -3
- sglang/test/test_block_fp8.py +8 -3
- sglang/test/test_block_fp8_ep.py +1 -1
- sglang/test/test_custom_ops.py +12 -7
- sglang/test/test_cutlass_w4a8_moe.py +1 -3
- sglang/test/test_fp4_moe.py +1 -3
- sglang/test/test_marlin_moe.py +286 -0
- sglang/test/test_marlin_utils.py +171 -0
- sglang/test/test_utils.py +35 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/METADATA +8 -8
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/RECORD +166 -146
- sglang/srt/layers/quantization/quant_utils.py +0 -166
- sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/w8a8_int8.py
CHANGED
@@ -1,7 +1,9 @@
+from __future__ import annotations
+
 import importlib
 import sys
 from types import MappingProxyType
-from typing import
+from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union, cast

 import torch
 from torch.nn.parameter import Parameter
@@ -11,21 +13,20 @@ from sglang.srt.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
-from sglang.srt.layers.linear import (
-    LinearMethodBase,
-    RowParallelLinear,
-    UnquantizedLinearMethod,
-)
 from sglang.srt.layers.parameter import (
     ChannelQuantScaleParameter,
     ModelWeightParameter,
     PerTensorScaleParameter,
 )
 from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
     QuantizationConfig,
     QuantizeMethodBase,
 )
+from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
 from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.utils import (
     apply_module_patch,
     cpu_has_amx_support,
@@ -36,6 +37,9 @@ from sglang.srt.utils import (
     use_intel_amx_backend,
 )

+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.topk import TopKOutput
+
 _is_cuda = is_cuda()
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
@@ -178,17 +182,18 @@ class W8A8Int8Config(QuantizationConfig):
     - Activation: dynamic, per-token, symmetric
     """

-    def __init__(self, quant_config: Dict[str, Any]):
+    def __init__(self, quant_config: Dict[str, Any] = {}):
         super().__init__()
         self.quant_description = quant_config
         self.is_dynamic = quant_config.get("is_dynamic", False)
-
-
-
-
-
-
+        ignore = cast(List[str], quant_config.get("ignore", []))
+        self.ignore = ignore if ignore is not None else []
+        packed_modules_mapping = quant_config.get("packed_modules_mapping", {})
+        self.packed_modules_mapping = (
+            packed_modules_mapping if packed_modules_mapping is not None else {}
+        )

+        if _is_npu:
             # Ascend w8a8_int8 quantization with bias, use wrappers to isolate the effects between models
             for name in self.quant_description.keys():
                 if "norm.bias" in name:
@@ -229,14 +234,14 @@ class W8A8Int8Config(QuantizationConfig):
         return []

     @classmethod
-    def from_config(cls, config: Dict[str, Any]) ->
+    def from_config(cls, config: Dict[str, Any]) -> W8A8Int8Config:
         return cls(config)

     def get_quant_method(
         self,
         layer: torch.nn.Module,
         prefix: str,
-    ) -> Optional[
+    ) -> Optional[QuantizeMethodBase]:
         from sglang.srt.layers.linear import LinearBase
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoE

@@ -262,12 +267,16 @@ class W8A8Int8Config(QuantizationConfig):
             elif isinstance(layer, FusedMoE):
                 return NPU_W8A8MoEMethod(self)
             return None
-
-
-
-
-
-
+
+        if should_ignore_layer(
+            prefix, ignore=self.ignore, fused_mapping=self.packed_modules_mapping
+        ):
+            return UnquantizedLinearMethod()
+        if isinstance(layer, LinearBase):
+            return W8A8Int8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return W8A8Int8MoEMethod(self)
+        return None

     def is_layer_skipped(
         self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
@@ -374,7 +383,7 @@ class W8A8Int8LinearMethod(LinearMethodBase):
         )


-class W8A8Int8MoEMethod:
+class W8A8Int8MoEMethod(FusedMoEMethodBase):
     """MoE method for INT8.
     Supports loading INT8 checkpoints with static weight scale and
     dynamic/static activation scale.
@@ -385,25 +394,7 @@ class W8A8Int8MoEMethod:
         quant_config: The quantization config.
     """

-    def
-        from sglang.srt.layers.moe.fused_moe_triton import FusedMoEMethodBase
-
-        if not hasattr(cls, "_initialized"):
-            original_init = cls.__init__
-            new_cls = type(
-                cls.__name__,
-                (FusedMoEMethodBase,),
-                {
-                    "__init__": original_init,
-                    **{k: v for k, v in cls.__dict__.items() if k != "__dict__"},
-                },
-            )
-            obj = super(new_cls, new_cls).__new__(new_cls)
-            obj.__init__(*args, **kwargs)
-            return obj
-        return super().__new__(cls)
-
-    def __init__(self, quant_config):
+    def __init__(self, quant_config: W8A8Int8Config):
         self.quant_config = quant_config

     def create_weights(
@@ -481,15 +472,8 @@ class W8A8Int8MoEMethod:
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-
-
-        renormalize: bool,
-        use_grouped_topk: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        num_fused_shared_experts: int = 0,
-        custom_routing_function: Optional[Callable] = None,
-        correction_bias: Optional[torch.Tensor] = None,
+        topk_output: TopKOutput,
+        *,
         activation: str = "silu",
         apply_router_weight_on_input: bool = False,
         inplace: bool = True,
@@ -497,24 +481,14 @@ class W8A8Int8MoEMethod:
         routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
         from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
-        from sglang.srt.layers.moe.topk import select_experts
-
-        # Expert selection
-        topk_weights, topk_ids = select_experts(
-            hidden_states=x,
-            router_logits=router_logits,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            num_fused_shared_experts=num_fused_shared_experts,
-            custom_routing_function=custom_routing_function,
-            correction_bias=correction_bias,
-            routed_scaling_factor=routed_scaling_factor,
-        )

         if use_intel_amx_backend(layer):
+            from sglang.srt.layers.moe.topk import apply_topk_weights_cpu
+
+            topk_weights, topk_ids, _ = topk_output
+            x, topk_weights = apply_topk_weights_cpu(
+                apply_router_weight_on_input, topk_weights, x
+            )
             return torch.ops.sgl_kernel.fused_experts_cpu(
                 x,
                 layer.w13_weight,
@@ -536,8 +510,7 @@ class W8A8Int8MoEMethod:
             x,
             layer.w13_weight,
             layer.w2_weight,
-
-            topk_ids=topk_ids,
+            topk_output=topk_output,
             inplace=inplace,
             activation=activation,
             apply_router_weight_on_input=apply_router_weight_on_input,
@@ -761,6 +734,8 @@ class NPU_W8A8LinearMethod(LinearMethodBase):
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        from sglang.srt.layers.linear import RowParallelLinear
+
         if isinstance(layer, RowParallelLinear):
             tp_rank = get_tensor_model_parallel_rank()
             return self.quant_method.apply(layer, x, bias, tp_rank)
@@ -885,13 +860,15 @@ class NPU_W8A8DynamicLinearMethod(LinearMethodBase):
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        from sglang.srt.layers.linear import RowParallelLinear
+
         if isinstance(layer, RowParallelLinear):
             tp_rank = get_tensor_model_parallel_rank()
             return self.quant_method.apply(layer, x, bias, tp_rank)
         return self.quant_method.apply(layer, x, bias)


-class NPU_W8A8MoEMethod:
+class NPU_W8A8MoEMethod(FusedMoEMethodBase):
     """MoE method for NPU quantization.

     This class search for specific quantization
@@ -910,7 +887,7 @@ class NPU_W8A8MoEMethod:
         layer: torch.nn.Module,
         num_experts: int,
         hidden_size: int,
-        intermediate_size:
+        intermediate_size: int,
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ) -> None:
@@ -987,52 +964,11 @@ class NPU_W8A8MoEMethod:
         self,
         layer,
         x,
-
-        top_k,
-        renormalize,
-        use_grouped_topk,
-        topk_group,
-        num_expert_group,
-        num_fused_shared_experts,
-        custom_routing_function,
-        correction_bias,
-        activation,
-        apply_router_weight_on_input,
-        routed_scaling_factor,
+        topk_output: TopKOutput,
         **kwargs,
     ) -> torch.Tensor:
-
-
-        global_num_experts = router_logits.shape[-1]
-        # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
-        if global_num_experts == 256:
-            topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
-                router_logits,
-                k=top_k,
-                bias=correction_bias,
-                k_group=topk_group,
-                group_count=num_expert_group,
-                group_select_mode=1,
-                renorm=0,
-                norm_type=1,
-                routed_scaling_factor=1,
-                eps=float(1e-20),
-            )
-        else:
-            topk_weights, topk_ids = select_experts(
-                hidden_states=x,
-                router_logits=router_logits,
-                use_grouped_topk=use_grouped_topk,
-                top_k=top_k,
-                renormalize=renormalize,
-                topk_group=topk_group,
-                num_expert_group=num_expert_group,
-                num_fused_shared_experts=num_fused_shared_experts,
-                custom_routing_function=custom_routing_function,
-                correction_bias=correction_bias,
-                torch_native=True,
-                routed_scaling_factor=routed_scaling_factor,
-            )
+
+        topk_weights, topk_ids, _ = topk_output
         topk_ids = topk_ids.to(torch.int32)
         topk_weights = topk_weights.to(x.dtype)
         return npu_fused_experts(
@@ -1043,5 +979,5 @@ class NPU_W8A8MoEMethod:
             w2_scale=layer.w2_weight_scale,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
-            top_k=
+            top_k=topk_ids.shape[1],
         )
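Note: the w8a8_int8.py hunks above replace the per-call routing arguments of the MoE `apply` methods (`top_k`, `renormalize`, `use_grouped_topk`, `correction_bias`, ...) with a single precomputed `topk_output`. A minimal standalone sketch of what that unpacking convention implies, using a stand-in NamedTuple (the real type is `sglang.srt.layers.moe.topk.TopKOutput`; only the first two field names are confirmed by the diff, the third is an assumption):

import torch
from typing import NamedTuple

class TopKOutputSketch(NamedTuple):
    # Stand-in for sglang.srt.layers.moe.topk.TopKOutput; the third field name is an assumption.
    topk_weights: torch.Tensor
    topk_ids: torch.Tensor
    router_logits: torch.Tensor

def apply_moe_sketch(x: torch.Tensor, topk_output: TopKOutputSketch) -> torch.Tensor:
    # Mirrors the new pattern in the hunks above: the quant method unpacks a
    # precomputed top-k result instead of calling select_experts() itself.
    topk_weights, topk_ids, _ = topk_output
    topk_ids = topk_ids.to(torch.int32)
    topk_weights = topk_weights.to(x.dtype)
    # ... dispatch to the fused experts kernel with topk_weights / topk_ids ...
    return x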
sglang/srt/layers/vocab_parallel_embedding.py
CHANGED
@@ -5,7 +5,6 @@ from dataclasses import dataclass
 from typing import List, Optional, Sequence, Tuple

 import torch
-import torch.nn.functional as F
 from torch.nn.parameter import Parameter, UninitializedParameter

 from sglang.srt.distributed import (
@@ -22,6 +21,7 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizeMethodBase,
     method_has_implemented_embedding,
 )
+from sglang.srt.layers.quantization.unquant import UnquantizedEmbeddingMethod
 from sglang.srt.utils import cpu_has_amx_support, is_cpu, set_weight_attrs

 DEFAULT_VOCAB_PADDING_SIZE = 64
@@ -32,44 +32,6 @@ _is_cpu = is_cpu()
 logger = logging.getLogger(__name__)


-class UnquantizedEmbeddingMethod(QuantizeMethodBase):
-    """Unquantized method for embeddings."""
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: List[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        """Create weights for embedding layer."""
-        weight = Parameter(
-            torch.empty(
-                sum(output_partition_sizes),
-                input_size_per_partition,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
-        layer.register_parameter("weight", weight)
-        set_weight_attrs(weight, extra_weight_attrs)
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        return F.linear(x, layer.weight, bias)
-
-    def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor:
-        return F.embedding(input_, layer.weight)
-
-
 def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
     """Pad the vocab size to the given value."""
     return ((vocab_size + pad_to - 1) // pad_to) * pad_to
@@ -569,8 +531,6 @@ class ParallelLMHead(VocabParallelEmbedding):
         if _is_cpu and _is_cpu_amx_available:
             if hasattr(self, "weight") and self.weight.dtype == torch.bfloat16:
                 self.quant_method = PackWeightMethod(weight_names=["weight"])
-            else:
-                logger.warning("The weight of LmHead is not packed")

         if bias:
             self.bias = Parameter(
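Note: `UnquantizedEmbeddingMethod` (and, per the hunks further above, `UnquantizedLinearMethod`) now live in the new `sglang/srt/layers/quantization/unquant.py` module. A sketch of the import change implied by the removed and added lines; the old paths come from the removed lines, the new path from the added imports:

# 0.4.9.post2 locations (removed in this release):
# from sglang.srt.layers.vocab_parallel_embedding import UnquantizedEmbeddingMethod
# from sglang.srt.layers.linear import UnquantizedLinearMethod

# 0.4.9.post3 location (added in this release):
from sglang.srt.layers.quantization.unquant import (
    UnquantizedEmbeddingMethod,
    UnquantizedLinearMethod,
)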
sglang/srt/lora/lora.py
CHANGED
@@ -186,10 +186,6 @@ class LoRAAdapter(nn.Module):
             up_name = weight_name.replace("gate_proj", "up_proj")
             gate_up_name = weight_name.replace("gate_proj", "gate_up_proj")
             if up_name not in weights:
-                logger.warning(
-                    f"Gate projection {weight_name} does not have a corresponding up projection {up_name}. "
-                    f"Initializing up projection to zero."
-                )
                 weights[up_name] = torch.zeros_like(weights[weight_name])
             # FIXME: Add gate-only support for flashinfer in future implementations
             assert self.lora_backend.name == "triton", (
sglang/srt/lora/lora_manager.py
CHANGED
@@ -16,7 +16,7 @@
 # and "Punica: Multi-Tenant LoRA Serving"

 import logging
-from typing import Dict, Set, Tuple
+from typing import Dict, Iterable, Optional, Set, Tuple

 import torch

@@ -53,6 +53,8 @@ class LoRAManager:
         lora_backend: str = "triton",
         tp_size: int = 1,
         tp_rank: int = 0,
+        max_lora_rank: Optional[int] = None,
+        target_modules: Optional[Iterable[str]] = None,
     ):
         self.base_model: torch.nn.Module = base_model
         self.base_hf_config: AutoConfig = base_hf_config
@@ -62,6 +64,10 @@ class LoRAManager:
         self.device: torch.device = next(self.base_model.parameters()).device
         self.tp_size: int = tp_size
         self.tp_rank: int = tp_rank
+        self.max_lora_rank: Optional[int] = max_lora_rank
+        self.target_modules: Optional[Set[str]] = (
+            set(target_modules) if target_modules else None
+        )

         # LoRA backend for running sgemm kernels
         logger.info(f"Using {lora_backend} as backend of LoRA kernels.")
@@ -153,7 +159,9 @@ class LoRAManager:
             error_message = f"LoRA adapter {lora_name} is skipped as it is already loaded. If you want to reload it, please unload it first."

         try:
-
+            new_adapter = LoRAConfig(lora_path)
+            self.validate_new_adapter(lora_name, new_adapter)
+            self.configs[lora_name] = new_adapter
         except Exception as e:
             success = False
             error_message = (
@@ -168,6 +176,21 @@ class LoRAManager:
             error_message=error_message,
         )

+    def validate_new_adapter(self, lora_name: str, lora_config: LoRAConfig):
+        """
+        Validate if an adapter can be loaded into the current LoRA memory pool and generate error if it is incompatible.
+        """
+
+        incompatible = self.memory_pool and not self.memory_pool.can_support(
+            lora_config
+        )
+        if incompatible:
+            raise ValueError(
+                f"LoRA adapter {lora_name} with rank {lora_config.r} is incompatible with the current LoRA memory pool configuration. "
+                "Please ensure that the LoRA adapter's rank is within the configured `--max_lora_rank` and that the target modules are "
+                "included in `--enable_lora_modules`."
+            )
+
     def unload_lora_adapter(self, lora_name: str) -> LoRAUpdateResult:
         """
         Unload LoRA adapters by their names. This will remove the adapters from the memory pool and
@@ -214,7 +237,7 @@ class LoRAManager:
             weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
             if lora_path is not None:
                 lora = self.loras[lora_path]
-                lora_ranks[weight_indices[i]] = lora.config.
+                lora_ranks[weight_indices[i]] = lora.config.r
                 scalings[weight_indices[i]] = lora.scaling

         # Use pinned memory to avoid synchronizations during host-to-device transfer
@@ -319,7 +342,7 @@ class LoRAManager:
                 )
             else:
                 weight_name = get_weight_name(
-                    module_name, self.lora_weight_names, LoRAType.LORA_A
+                    module_name, self.memory_pool.lora_weight_names, LoRAType.LORA_A
                 )
                 module.set_lora_info(
                     self.memory_pool.get_tensor(
@@ -351,58 +374,67 @@ class LoRAManager:
             i: {} for i in range(self.base_hf_config.num_hidden_layers)
         }

-        #
-
-
-            self.max_loras_per_batch,
-            self.dtype,
-            self.tp_size,
-            self.tp_rank,
-        )
+        # The LoRA memory pool that manages the GPU buffers for active LoRA weights.
+        # It is initialized lazily when the first LoRA adapter is loaded.
+        self.memory_pool: Optional[LoRAMemoryPool] = None

     def update_state_from_configs(self):
         """
         Update the internal state of the LoRAManager based on the current `self.configs`. This method
         should be called whenever `self.configs` is modified (e.g., when new LoRA adapters are loaded).
-
-        This includes:
-        - Initializing LoRA adapters if they are not already loaded.
-        - Collect all LoRA weight names based on the current loaded adapters.
-        - Lazily monkey-patching the base model to use LoRA layers where applicable.
-        - Preparing the GPU buffer pool for active LoRA weights.
         """

-        # Target module names in huggingface lora configs.
-        # e.g., {"k_proj", "q_proj", "v_proj", "o_proj"}
-        hf_target_module_names: Set[str] = set()
-        for config in self.configs.values():
-            hf_target_module_names.update(config.target_modules)
-        max_lora_dim: int = max([x.hf_config["r"] for x in self.configs.values()])
-
         # Loads / unloads LoRA adapters based on the latest configs.
         self.update_lora_adapters()
+        # Apply the latest LoRA configurations to the internal state for inferencing.
+        self.apply_lora_configs()
+
+    def apply_lora_configs(self):
+        """
+        Apply the LoRA configurations to the base model and internal states of the LoRAManager for inferencing.
+
+        Notes:
+            - Currently, this method is effectively only invoked during the initialization phase of the LoRAManager as
+              we do not yet support dynamically updating adapter shape configs, which has a dependency on (1) FlashInfer
+              LoRA backend deprecation and (2) CUDA graph recapture support. We are targeting completing these work in
+              early CY25H2.
+        """
+
+        if self.memory_pool is None:
+            # Infer max_lora_rank and target_modules if not explicitly specified in server args.
+            if self.target_modules is None:
+                self.target_modules = set()
+                for config in self.configs.values():
+                    self.target_modules.update(config.target_modules)
+
+            if self.max_lora_rank is None:
+                self.max_lora_rank = max(
+                    [x.hf_config["r"] for x in self.configs.values()],
+                    default=0,
+                )
+
+            self.update_lora_weight_names()
+            self.update_lora_modules()
+            self.update_memory_buffers()
+        else:
+            # No-op if the memory pool can support the current LoRA configurations.
+            # TODO (lifuhuang): support reinitializing the memory pool when the maximum LoRA rank or target
+            # module is changed once FlashInfer backend is deprecated.
+            assert self.memory_pool.can_support(self.configs.values()), (
+                "LoRA memory pool cannot support the current LoRA configuration. "
+                "This should never happen as we should have validated adapter compatibility. "
+                "Please create a Github issue to report.",
+            )

-
-        #
-        # Please note that the following update operations are "monotonic" by design, meaning that we update
-        # multiple places to support the new weight names when the first adapter targeting such weight names
-        # is loaded. However, we never "rollback" the support (e.g., convert LoRA layer back to base layer)
-        # even if the associated adapters are unloaded later for both simplicity and practicality reasons: the
-        # list of LoRA weight names is expected to be extremely finite and stable.
-        self.update_lora_weight_names(hf_target_module_names)
-        self.update_lora_modules(hf_target_module_names)
-        self.update_memory_buffers(max_lora_dim)
-
-    def update_lora_weight_names(self, hf_target_names: Set[str]):
+    def update_lora_weight_names(self):
         """
         Add new LoRA weight names if needed based on the current `self.configs`.
         """

         # Target lora weight names for lora_a and lora_b modules respectively.
-
-
-
-        self.lora_weight_names[1].update(lora_B)
+        lora_A, lora_B = get_normalized_lora_weight_names(self.target_modules)
+        self.lora_weight_names[0].update(lora_A)
+        self.lora_weight_names[1].update(lora_B)

     def update_lora_adapters(self):
         """
@@ -434,21 +466,23 @@ class LoRAManager:
         # Additional checks for flashinfer backend
         # FIXME remove the restrictions after supporting multi-rank for flashinfer backend
         if self.lora_backend == "flashinfer":
-            lora_dims = set(x.
+            lora_dims = set(x.r for x in self.configs.values())
             scalings = set(x.scaling for x in self.loras.values())
             assert (
                 len(lora_dims) == 1 and len(scalings) == 1
             ), "Flashinfer backend currently only supports single LoRA rank and scaling across all adapters. "

-    def update_memory_buffers(self
-        """
-
-
-
-
-
-
-        self.
+    def update_memory_buffers(self):
+        """(Re)initialize the LoRA memory pool based on the current configurations."""
+        self.memory_pool = LoRAMemoryPool(
+            base_hf_config=self.base_hf_config,
+            max_loras_per_batch=self.max_loras_per_batch,
+            dtype=self.dtype,
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+            max_lora_rank=self.max_lora_rank,
+            lora_weight_names=self.lora_weight_names,
+            base_model=self.base_model,
         )

     def set_lora_module(self, module_name, module):
@@ -456,11 +490,11 @@ class LoRAManager:
         replace_submodule(self.base_model, module_name, lora_module)
         return lora_module

-    def update_lora_modules(self
+    def update_lora_modules(self):
         # Target module names of customized layers defined in python/sglang/srt/layers
         # e.g., {"qkv_proj", "o_proj"}
         customized_target_names = get_customized_names_from_hf_names(
-
+            self.target_modules, self.base_model
         )

         for module_name, module in self.base_model.named_modules():
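Note: the lora_manager.py hunks fix the LoRA memory pool's shape (maximum rank and target modules) at initialization, either from the new `max_lora_rank`/`target_modules` constructor arguments or inferred from the first loaded adapters, and later adapters are validated against it. A minimal standalone sketch of that validation rule (the class names below are stand-ins, not the sglang implementation; only the `can_support` check and the `--max_lora_rank` constraint appear in the diff):

from dataclasses import dataclass
from typing import Set

@dataclass
class AdapterConfigSketch:
    r: int                    # LoRA rank, read from the adapter's hf_config["r"]
    target_modules: Set[str]  # e.g. {"q_proj", "k_proj", "v_proj", "o_proj"}

@dataclass
class MemoryPoolSketch:
    max_lora_rank: int
    target_modules: Set[str]

    def can_support(self, cfg: AdapterConfigSketch) -> bool:
        # The rank must fit the preallocated buffers and the adapter may only touch
        # modules the pool was built for (mirrors the error message in the diff).
        return cfg.r <= self.max_lora_rank and cfg.target_modules <= self.target_modules

pool = MemoryPoolSketch(max_lora_rank=16, target_modules={"q_proj", "v_proj"})
adapter = AdapterConfigSketch(r=32, target_modules={"q_proj", "v_proj"})
if not pool.can_support(adapter):
    print("rejected: rank 32 exceeds the pool's configured max_lora_rank of 16")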