sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. sglang/bench_serving.py +23 -3
  2. sglang/srt/configs/deepseekvl2.py +10 -1
  3. sglang/srt/configs/model_config.py +5 -16
  4. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  5. sglang/srt/distributed/parallel_state.py +32 -5
  6. sglang/srt/entrypoints/http_server.py +7 -1
  7. sglang/srt/entrypoints/verl_engine.py +2 -0
  8. sglang/srt/function_call_parser.py +0 -1
  9. sglang/srt/layers/attention/flashattention_backend.py +218 -79
  10. sglang/srt/layers/dp_attention.py +12 -1
  11. sglang/srt/layers/moe/topk.py +30 -3
  12. sglang/srt/layers/quantization/__init__.py +134 -165
  13. sglang/srt/layers/quantization/awq.py +200 -0
  14. sglang/srt/layers/quantization/fp8_kernel.py +2 -1
  15. sglang/srt/layers/quantization/gptq.py +30 -40
  16. sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
  17. sglang/srt/layers/rotary_embedding.py +12 -0
  18. sglang/srt/lora/backend/base_backend.py +4 -4
  19. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  20. sglang/srt/lora/backend/triton_backend.py +5 -8
  21. sglang/srt/lora/layers.py +19 -33
  22. sglang/srt/lora/lora_manager.py +20 -7
  23. sglang/srt/lora/mem_pool.py +12 -6
  24. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  25. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  26. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  27. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  28. sglang/srt/lora/utils.py +6 -0
  29. sglang/srt/managers/io_struct.py +4 -2
  30. sglang/srt/managers/multimodal_processors/clip.py +63 -0
  31. sglang/srt/managers/schedule_batch.py +1 -0
  32. sglang/srt/managers/scheduler.py +25 -19
  33. sglang/srt/managers/tokenizer_manager.py +0 -1
  34. sglang/srt/managers/tp_worker.py +3 -0
  35. sglang/srt/model_executor/cuda_graph_runner.py +9 -8
  36. sglang/srt/model_executor/model_runner.py +9 -6
  37. sglang/srt/model_loader/loader.py +11 -1
  38. sglang/srt/model_loader/weight_utils.py +6 -3
  39. sglang/srt/models/clip.py +563 -0
  40. sglang/srt/models/deepseek_janus_pro.py +2 -2
  41. sglang/srt/models/deepseek_v2.py +151 -26
  42. sglang/srt/models/gemma3_causal.py +12 -2
  43. sglang/srt/models/gemma3_mm.py +6 -0
  44. sglang/srt/openai_api/adapter.py +88 -87
  45. sglang/srt/openai_api/protocol.py +10 -5
  46. sglang/srt/patch_torch.py +71 -0
  47. sglang/srt/server_args.py +21 -11
  48. sglang/srt/speculative/eagle_worker.py +1 -1
  49. sglang/srt/utils.py +33 -0
  50. sglang/test/runners.py +27 -2
  51. sglang/test/test_utils.py +1 -1
  52. sglang/version.py +1 -1
  53. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +8 -4
  54. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +57 -53
  55. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +0 -0
  56. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/licenses/LICENSE +0 -0
  57. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8_kernel.py CHANGED
@@ -24,6 +24,7 @@ import triton.language as tl
 
 from sglang.srt.utils import (
     direct_register_custom_op,
+    get_bool_env_var,
     get_device_core_count,
     get_device_name,
     get_device_sm,
@@ -43,7 +44,7 @@ if _is_cuda:
     from sgl_kernel import sgl_per_token_group_quant_fp8, sgl_per_token_quant_fp8
 
     sm_version = get_device_sm()
-    if sm_version >= 90 and int(os.getenv("SGL_ENABLE_JIT_DEEPGEMM", "1")):
+    if sm_version >= 90 and get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true"):
        _enable_jit_deepgemm = True
 
 
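Note: switching from int(os.getenv(...)) to get_bool_env_var makes spellings such as "true"/"false" valid for SGL_ENABLE_JIT_DEEPGEMM. A minimal sketch of the idea; the helper name parse_bool_env below is illustrative, and the exact semantics of sglang's get_bool_env_var are assumed from the diff, not verified:

import os

def parse_bool_env(name: str, default: str = "false") -> bool:
    # Accept common truthy spellings. int(os.getenv(name, "1")) would raise
    # ValueError on "true"/"false" and treat any non-zero digit as enabled.
    value = os.getenv(name, default).strip().lower()
    return value in ("1", "true", "yes", "y", "on")

# Example: SGL_ENABLE_JIT_DEEPGEMM=true now enables the JIT path,
# whereas int("true") previously failed at import time.
enable_jit_deepgemm = parse_bool_env("SGL_ENABLE_JIT_DEEPGEMM", default="true")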
sglang/srt/layers/quantization/gptq.py CHANGED
@@ -11,12 +11,29 @@ from sglang.srt.utils import is_cuda
 _is_cuda = is_cuda()
 
 try:
-    import vllm
+    from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
+    from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
+    from vllm.model_executor.layers.quantization.gptq_marlin import (
+        GPTQMarlinLinearMethod,
+        GPTQMarlinMoEMethod,
+    )
+    from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
+    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+        check_marlin_supported,
+    )
+    from vllm.scalar_type import scalar_types
 
     VLLM_AVAILABLE = True
 except ImportError:
     VLLM_AVAILABLE = False
 
+    GPTQLinearMethod = MarlinLinearMethod = QuantizeMethodBase = Any
+
+    class scalar_types:
+        uint4b8 = "uint4b8"
+        uint8b128 = "uint8b128"
+
+
 logger = logging.getLogger(__name__)
 
 
@@ -117,12 +134,8 @@ class GPTQConfig(QuantizationConfig):
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["GPTQLinearMethod"]:
-        if not VLLM_AVAILABLE:
-            raise ImportError("vllm is not installed")
-
-        from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
-
+    ) -> Optional[GPTQLinearMethod]:
+        # Delay the import to avoid circular dependency
         from sglang.srt.layers.quantization import get_linear_quant_method
 
         return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod)
@@ -131,16 +144,11 @@ class GPTQConfig(QuantizationConfig):
 class GPTQMarlinConfig(QuantizationConfig):
     """Config class for GPTQ Marlin"""
 
-    if VLLM_AVAILABLE:
-        from vllm.scalar_type import scalar_types
-
-        # (num_bits, is_sym) -> quant_type
-        TYPE_MAP = {
-            (4, True): scalar_types.uint4b8,
-            (8, True): scalar_types.uint8b128,
-        }
-    else:
-        raise ImportError("vllm is not installed")
+    # (num_bits, is_sym) -> quant_type
+    TYPE_MAP = {
+        (4, True): scalar_types.uint4b8,
+        (8, True): scalar_types.uint8b128,
+    }
 
     def __init__(
         self,
@@ -197,6 +205,7 @@ class GPTQMarlinConfig(QuantizationConfig):
                 "Unsupported quantization config: " f"bits={weight_bits}, sym={is_sym}"
             )
 
+        # (num_bits, is_sym) -> quant_type
         self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)]
 
     def __repr__(self) -> str:
@@ -278,15 +287,8 @@ class GPTQMarlinConfig(QuantizationConfig):
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
-        if not VLLM_AVAILABLE:
-            raise ImportError("vllm is not installed")
-
-        from vllm.model_executor.layers.quantization.gptq_marlin import (
-            GPTQMarlinLinearMethod,
-            GPTQMarlinMoEMethod,
-        )
-
+    ) -> Optional[QuantizeMethodBase]:
+        # Delay the import to avoid circular dependency
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
         from sglang.srt.layers.quantization import get_linear_quant_method
 
@@ -304,19 +306,12 @@ class GPTQMarlinConfig(QuantizationConfig):
 
     @classmethod
     def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]):
-        if not VLLM_AVAILABLE:
-            return False
-
         quant_method = quant_config.get("quant_method", "").lower()
         num_bits = quant_config.get("bits")
         group_size = quant_config.get("group_size")
        sym = quant_config.get("sym")
        desc_act = quant_config.get("desc_act")
 
-        from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-            check_marlin_supported,
-        )
-
         if not _is_cuda:
             return False
 
@@ -427,13 +422,8 @@ class MarlinConfig(QuantizationConfig):
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["MarlinLinearMethod"]:
-        if not VLLM_AVAILABLE:
-            raise ImportError("vllm is not installed")
-
-        from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
-
-        # Delay import to avoid circular dependency
+    ) -> Optional[MarlinLinearMethod]:
+        # Delay the import to avoid circular dependency
         from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
 
         if isinstance(layer, LinearBase) or (
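Note: the gptq.py change moves every vllm import to a single module-level try/except and installs placeholders when vllm is missing, so annotations such as Optional[GPTQLinearMethod] and the class-level TYPE_MAP still resolve. A minimal sketch of that pattern under the same assumption (vllm as the optional dependency); it is illustrative, not the exact sglang module:

from typing import Any

try:
    from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
    from vllm.scalar_type import scalar_types

    VLLM_AVAILABLE = True
except ImportError:
    VLLM_AVAILABLE = False

    # Placeholder keeps annotations like Optional[GPTQLinearMethod] importable.
    GPTQLinearMethod = Any

    # Stub mirrors the two scalar types referenced by TYPE_MAP below.
    class scalar_types:
        uint4b8 = "uint4b8"
        uint8b128 = "uint8b128"

# Class/module attributes can now be defined unconditionally; they only
# hold string stand-ins when vllm is absent, and callers check VLLM_AVAILABLE.
TYPE_MAP = {
    (4, True): scalar_types.uint4b8,
    (8, True): scalar_types.uint8b128,
}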
sglang/srt/layers/quantization/w8a8_fp8.py CHANGED
@@ -37,7 +37,7 @@ class W8A8Fp8Config(QuantizationConfig):
     Note:
     - For models without offline quantization, weights will be quantized during model loading
     - If CUTLASS is supported: Per-channel weight quantization is used
-    - If CUTLASS is not supported: Falls back to per-token weight quantization
+    - If CUTLASS is not supported: Falls back to per-tensor weight quantization
     """
 
     def __init__(self, is_checkpoint_fp8_serialized: bool = False):
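Note: the docstring fix matters because per-channel and per-tensor weight quantization use differently shaped scales. A rough illustration of the two scale layouts for FP8-style weight quantization; this is reference math, not the sglang kernel code:

import torch

fp8_max = 448.0  # dynamic range of float8_e4m3fn
weight = torch.randn(4096, 11008)  # (out_features, in_features)

# Per-channel: one scale per output channel -> shape (out_features, 1)
per_channel_scale = weight.abs().amax(dim=1, keepdim=True) / fp8_max

# Per-tensor: a single scalar scale for the whole weight matrix
per_tensor_scale = weight.abs().amax() / fp8_max

q_per_channel = (weight / per_channel_scale).clamp(-fp8_max, fp8_max)
q_per_tensor = (weight / per_tensor_scale).clamp(-fp8_max, fp8_max)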
sglang/srt/layers/rotary_embedding.py CHANGED
@@ -651,6 +651,18 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         query: torch.Tensor,
         key: torch.Tensor,
         offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if _is_cuda_available:
+            return self.forward_cuda(positions, query, key, offsets)
+        else:
+            return self.forward_native(positions, query, key, offsets)
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
         query_rot = query[..., : self.rotary_dim]
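Note: the change splits forward into a thin dispatcher plus the renamed forward_native fallback. A generic sketch of that dispatch pattern; everything here other than the forward_cuda/forward_native method names is illustrative:

import torch

class DeviceDispatchingOp(torch.nn.Module):
    """Pick an implementation at call time based on device availability."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if torch.cuda.is_available() and x.is_cuda:
            return self.forward_cuda(x)
        return self.forward_native(x)

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        # Stand-in for a fused CUDA kernel; here it just reuses the native math.
        return self.forward_native(x)

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        # Pure-PyTorch reference path that also runs on CPU.
        return x * 2.0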
sglang/srt/lora/backend/base_backend.py CHANGED
@@ -5,7 +5,7 @@ import torch
 from sglang.srt.lora.utils import LoRABatchInfo
 
 
-def get_fuse_output_scaling_add_from_name(name: str) -> bool:
+def get_fuse_output_add_from_name(name: str) -> bool:
     mapping = {
         "triton": True,
         "flashinfer": False,
@@ -28,14 +28,14 @@ class BaseLoRABackend:
     Args:
         name: name of backend
         batch_info: information of current batch for use
-        fuse_output_scaling_add: if set to True, the output buffer for storing result will be passed in when doing lora_b forward,
-                                 and the operation of scaling and adding will be fused into kernel
+        fuse_output_add: if set to True, the output buffer for storing result will be passed in when doing lora_b forward,
+                         and the operation of adding will be fused into kernel
     """
 
     def __init__(self, name: str, batch_info: LoRABatchInfo = None):
         self.name = name
         self.batch_info = batch_info
-        self.fuse_output_scaling_add = get_fuse_output_scaling_add_from_name(name)
+        self.fuse_output_add = get_fuse_output_add_from_name(name)
         self.fuse_stacked_lora_b = get_fuse_stacked_lora_b_from_name(name)
 
     def run_lora_a_sgemm(
sglang/srt/lora/backend/flashinfer_backend.py CHANGED
@@ -37,13 +37,16 @@ class FlashInferLoRABackend(BaseLoRABackend):
         self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
     ) -> torch.Tensor:
 
-        return self.segment_gemm.run(
-            x=x,
-            weights=weights,
-            batch_size=self.batch_info.bs,
-            weight_column_major=True,
-            seg_indptr=self.batch_info.seg_indptr,
-            weight_indices=self.batch_info.weight_indices,
+        return (
+            self.segment_gemm.run(
+                x=x,
+                weights=weights,
+                batch_size=self.batch_info.bs,
+                weight_column_major=True,
+                seg_indptr=self.batch_info.seg_indptr,
+                weight_indices=self.batch_info.weight_indices,
+            )
+            * self.batch_info.scalings[0]
         )
 
     def run_qkv_lora(
@@ -90,7 +93,7 @@ class FlashInferLoRABackend(BaseLoRABackend):
             weights=kv_lora_b[1],
         )
 
-        return lora_output
+        return lora_output * self.batch_info.scalings[0]
 
     def run_gate_up_lora(
         self,
@@ -125,4 +128,4 @@ class FlashInferLoRABackend(BaseLoRABackend):
             weights=gate_up_lora_b[1],
         )
 
-        return lora_output
+        return lora_output * self.batch_info.scalings[0]
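Note: with scaling moved into the batch info, the flashinfer backend multiplies by scalings[0] after the segment GEMM, i.e. one scaling for the whole batch, which is why the manager keeps the uniform-adapter assertions for this backend. For reference, a LoRA adapter's scaling is conventionally derived from its config as alpha / r (illustrative helper, not sglang code):

def lora_scaling(lora_alpha: float, r: int) -> float:
    # Conventional LoRA scaling, as in peft-style adapter configs.
    return lora_alpha / r

# Example: alpha=32, rank=16 -> every B @ A @ x contribution is scaled by 2.0
print(lora_scaling(32, 16))  # 2.0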
sglang/srt/lora/backend/triton_backend.py CHANGED
@@ -25,11 +25,10 @@ class TritonLoRABackend(BaseLoRABackend):
         x: torch.Tensor,
         weights: torch.Tensor,
         base_output: torch.Tensor = None,
-        scaling: float = 1.0,
         *args,
         **kwargs
     ) -> torch.Tensor:
-        return sgemm_lora_b_fwd(x, weights, self.batch_info, base_output, scaling)
+        return sgemm_lora_b_fwd(x, weights, self.batch_info, base_output)
 
     def run_qkv_lora(
         self,
@@ -39,7 +38,6 @@ class TritonLoRABackend(BaseLoRABackend):
         output_offset: torch.Tensor,
         max_qkv_out_dim: int,
         base_output: torch.Tensor = None,
-        scaling: float = 1.0,
         *args,
         **kwargs
     ) -> torch.Tensor:
@@ -49,7 +47,7 @@ class TritonLoRABackend(BaseLoRABackend):
         # qkv_lora_b: (num_lora, output_dim_q + 2 * output_dim_kv, r)
         assert isinstance(qkv_lora_b, torch.Tensor)
 
-        lora_a_output = sgemm_lora_a_fwd(x, qkv_lora_a, self.batch_info)
+        lora_a_output = sgemm_lora_a_fwd(x, qkv_lora_a, self.batch_info, stack_num=3)
         lora_output = qkv_lora_b_fwd(
             lora_a_output,
             qkv_lora_b,
@@ -57,7 +55,6 @@ class TritonLoRABackend(BaseLoRABackend):
             output_offset,
             max_qkv_out_dim,
             base_output,
-            scaling,
         )
         return lora_output
 
@@ -67,7 +64,6 @@ class TritonLoRABackend(BaseLoRABackend):
         gate_up_lora_a: torch.Tensor,
         gate_up_lora_b: torch.Tensor,
         base_output: torch.Tensor = None,
-        scaling: float = 1.0,
         *args,
         **kwargs
     ) -> torch.Tensor:
@@ -79,13 +75,14 @@ class TritonLoRABackend(BaseLoRABackend):
         output_dim = gate_up_lora_b.shape[-2] // 2
 
         # lora_a_output: (s, 2 * r)
-        lora_a_output = sgemm_lora_a_fwd(x, gate_up_lora_a, self.batch_info)
+        lora_a_output = sgemm_lora_a_fwd(
+            x, gate_up_lora_a, self.batch_info, stack_num=2
+        )
         lora_output = gate_up_lora_b_fwd(
             lora_a_output,
             gate_up_lora_b,
             self.batch_info,
             output_dim,
             base_output,
-            scaling,
         )
         return lora_output
sglang/srt/lora/layers.py CHANGED
@@ -23,14 +23,10 @@ class BaseLayerWithLoRA(nn.Module):
     def __init__(
         self,
         base_layer: nn.Module,
-        lora_rank: int,
-        scaling: float,
         lora_backend: BaseLoRABackend,
     ):
         super().__init__()
         self.base_layer: nn.Module = base_layer
-        self.lora_rank: int = lora_rank
-        self.scaling: float = scaling
         self.set_lora: bool = False
         self.lora_backend: BaseLoRABackend = lora_backend
 
@@ -59,11 +55,9 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
     def __init__(
         self,
         base_layer: VocabParallelEmbedding,
-        lora_rank: int,
-        scaling: float,
         lora_backend: BaseLoRABackend,
     ) -> None:
-        super().__init__(base_layer, lora_rank, scaling, lora_backend)
+        super().__init__(base_layer, lora_backend)
         self.weight = base_layer.weight
 
 
@@ -71,11 +65,9 @@ class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA):
     def __init__(
         self,
         base_layer: ColumnParallelLinear,
-        lora_rank: int,
-        scaling: float,
         lora_backend: BaseLoRABackend,
     ) -> None:
-        super().__init__(base_layer, lora_rank, scaling, lora_backend)
+        super().__init__(base_layer, lora_backend)
 
     def set_lora_info(
         self,
@@ -87,7 +79,7 @@ class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA):
         self.B_buffer = B_buffer
 
     def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
-        backend_kwargs = {"base_output": base_output, "scaling": self.scaling}
+        backend_kwargs = {"base_output": base_output}
         lora_a_output = self.lora_backend.run_lora_a_sgemm(x, self.A_buffer)
         lora_output = self.lora_backend.run_lora_b_sgemm(
             lora_a_output,
@@ -96,8 +88,8 @@
         )
         return (
             lora_output
-            if self.lora_backend.fuse_output_scaling_add
-            else base_output + lora_output * self.scaling
+            if self.lora_backend.fuse_output_add
+            else base_output + lora_output
         )
 
     def forward(self, input_: torch.Tensor):
@@ -132,11 +124,9 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     def __init__(
         self,
         base_layer: MergedColumnParallelLinear,
-        lora_rank: int,
-        scaling: float,
         lora_backend: BaseLoRABackend,
     ) -> None:
-        super().__init__(base_layer, lora_rank, scaling, lora_backend)
+        super().__init__(base_layer, lora_backend)
 
     def set_lora_info(
         self,
@@ -155,7 +145,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         self.B_buffer_gate_up = (B_buffer[0], B_buffer[1])
 
     def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
-        backend_kwargs = {"base_output": base_output, "scaling": self.scaling}
+        backend_kwargs = {"base_output": base_output}
 
         lora_output = self.lora_backend.run_gate_up_lora(
             x,
@@ -165,8 +155,8 @@
         )
         return (
             lora_output
-            if self.lora_backend.fuse_output_scaling_add
-            else base_output + lora_output * self.scaling
+            if self.lora_backend.fuse_output_add
+            else base_output + lora_output
         )
 
     def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
@@ -184,11 +174,9 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     def init__(
         self,
         base_layer: QKVParallelLinear,
-        lora_rank: int,
-        scaling: float,
         lora_backend: BaseLoRABackend,
     ) -> None:
-        super().__init__(base_layer, lora_rank, scaling, lora_backend)
+        super().__init__(base_layer, lora_backend)
 
     def set_lora_info(
         self,
@@ -230,7 +218,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         )
 
     def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
-        backend_kwargs = {"base_output": base_output, "scaling": self.scaling}
+        backend_kwargs = {"base_output": base_output}
         if self.lora_backend.fuse_stacked_lora_b:
             backend_kwargs["output_offset"] = self.output_offset
             backend_kwargs["max_qkv_out_dim"] = self.max_qkv_out_dim
@@ -243,8 +231,8 @@
         )
         return (
             lora_output
-            if self.lora_backend.fuse_output_scaling_add
-            else base_output + lora_output * self.scaling
+            if self.lora_backend.fuse_output_add
+            else base_output + lora_output
         )
 
     def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
@@ -273,11 +261,9 @@ class RowParallelLinearWithLoRA(BaseLayerWithLoRA):
     def __init__(
         self,
         base_layer: RowParallelLinear,
-        lora_rank: int,
-        scaling: float,
         lora_backend: BaseLoRABackend,
     ) -> None:
-        super().__init__(base_layer, lora_rank, scaling, lora_backend)
+        super().__init__(base_layer, lora_backend)
 
     def set_lora_info(self, A_buffer: torch.Tensor, B_buffer: torch.Tensor):
         self.set_lora = True
@@ -285,7 +271,7 @@ class RowParallelLinearWithLoRA(BaseLayerWithLoRA):
         self.B_buffer = B_buffer
 
     def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
-        backend_kwargs = {"base_output": base_output, "scaling": self.scaling}
+        backend_kwargs = {"base_output": base_output}
         lora_a_output = self.lora_backend.run_lora_a_sgemm(x, self.A_buffer)
         lora_output = self.lora_backend.run_lora_b_sgemm(
             lora_a_output,
@@ -294,8 +280,8 @@
         )
         return (
             lora_output
-            if self.lora_backend.fuse_output_scaling_add
-            else base_output + lora_output * self.scaling
+            if self.lora_backend.fuse_output_add
+            else base_output + lora_output
         )
 
     def forward(self, input_: torch.Tensor):
@@ -344,7 +330,7 @@ class RowParallelLinearWithLoRA(BaseLayerWithLoRA):
 
 
 def get_lora_layer(
-    layer: nn.Module, lora_rank: int, scaling: int, lora_backend: BaseLoRABackend
+    layer: nn.Module, lora_backend: BaseLoRABackend
 ) -> BaseLayerWithLoRA:
     supported_layer_types = {
         # the order matters
@@ -356,6 +342,6 @@ def get_lora_layer(
     }
     for src_layer_type, lora_layer_type in supported_layer_types.items():
         if isinstance(layer, src_layer_type):  # pylint: disable=unidiomatic-typecheck
-            ret = lora_layer_type(layer, lora_rank, scaling, lora_backend)
+            ret = lora_layer_type(layer, lora_backend)
             return ret
     raise Exception(f"No corresponding LoRA layer supported for {type(layer)}.")
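Note: since the layers no longer carry lora_rank or scaling, the unfused path reduces to base_output + lora_output, with the backend expected to have applied each adapter's scaling already. A plain-PyTorch reference of what one apply_lora call computes for a single adapter; shapes and the function name are illustrative:

import torch

def apply_lora_reference(
    base_output: torch.Tensor,  # (s, out_dim), result of the frozen base layer
    x: torch.Tensor,            # (s, in_dim)
    lora_a: torch.Tensor,       # (r, in_dim)
    lora_b: torch.Tensor,       # (out_dim, r)
    scaling: float,             # alpha / r for this adapter
) -> torch.Tensor:
    lora_a_output = x @ lora_a.t()            # (s, r)
    lora_output = lora_a_output @ lora_b.t()  # (s, out_dim)
    # In sglang the backend now applies `scaling` before returning lora_output;
    # the reference keeps it explicit so the full math is visible.
    return base_output + lora_output * scaling

s, in_dim, out_dim, r = 4, 64, 128, 16
out = apply_lora_reference(
    torch.randn(s, out_dim), torch.randn(s, in_dim),
    torch.randn(r, in_dim), torch.randn(out_dim, r), scaling=2.0,
)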
sglang/srt/lora/lora_manager.py CHANGED
@@ -103,11 +103,14 @@ class LoRAManager:
             self.loras[name] = lora_adapter
 
         # misc lora configs
-        # FIXME remove the restrictions after implementing unified paging
         self.max_lora_dim: int = max([x.hf_config["r"] for x in self.configs.values()])
-        self.scaling: float = list(self.loras.values())[0].scaling
-        assert all(x.hf_config["r"] == self.max_lora_dim for x in self.configs.values())
-        assert all(x.scaling == self.scaling for x in self.loras.values())
+
+        if self.lora_backend == "flashinfer":
+            # FIXME remove the restrictions after supporting multi-rank for flashinfer backend
+            max_lora_dim = max([x.hf_config["r"] for x in self.configs.values()])
+            scaling = list(self.loras.values())[0].scaling
+            assert all(x.hf_config["r"] == max_lora_dim for x in self.configs.values())
+            assert all(x.scaling == scaling for x in self.loras.values())
 
         # Convert original model layers to layers with LoRA
         self.convert_to_lora_layers()
@@ -148,8 +151,18 @@ class LoRAManager:
         seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
         max_len = int(torch.max(seg_lens))
         weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
+
+        lora_ranks = torch.empty(
+            (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
+        )
+        scalings = torch.empty(
+            (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
+        )
         for i, lora_path in enumerate(forward_batch.lora_paths):
             weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
+            lora = self.loras[lora_path]
+            lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
+            scalings[weight_indices[i]] = lora.scaling
 
         batch_info = LoRABatchInfo(
             bs=bs,
@@ -157,6 +170,8 @@
             seg_indptr=seg_indptr,
             max_len=max_len,
             weight_indices=weight_indices,
+            lora_ranks=lora_ranks,
+            scalings=scalings,
         )
         self.lora_backend.set_batch_info(batch_info)
 
@@ -189,9 +204,7 @@
         )
 
     def set_lora_module(self, module_name, module):
-        lora_module = get_lora_layer(
-            module, self.max_lora_dim, self.scaling, self.lora_backend
-        )
+        lora_module = get_lora_layer(module, self.lora_backend)
         replace_submodule(self.base_model, module_name, lora_module)
         return lora_module
 
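Note: the new lora_ranks and scalings tensors are indexed by memory-pool buffer slot rather than by request position, so a kernel can resolve each request's rank and scaling through its weight index. A small illustration of that indexing with toy values (not the real pool):

import torch

max_loras_per_batch = 4
# Slot -> adapter metadata, filled while walking the batch's lora_paths.
lora_ranks = torch.zeros(max_loras_per_batch, dtype=torch.int64)
scalings = torch.zeros(max_loras_per_batch, dtype=torch.float)

# Suppose request 0 uses the adapter in slot 2 (rank 8, alpha 16)
# and request 1 uses the adapter in slot 0 (rank 16, alpha 32).
weight_indices = torch.tensor([2, 0], dtype=torch.int64)
lora_ranks[2], scalings[2] = 8, 16 / 8
lora_ranks[0], scalings[0] = 16, 32 / 16

# Inside a kernel, each request looks up its metadata through its slot:
for i in range(weight_indices.numel()):
    slot = weight_indices[i]
    print(int(lora_ranks[slot]), float(scalings[slot]))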
sglang/srt/lora/mem_pool.py CHANGED
@@ -163,10 +163,11 @@ class LoRAMemoryPool:
         if uid is None:
             for i in range(self.num_layer):
                 for k in self.A_buffer.keys():
-                    self.A_buffer[k][i][buffer_id] *= 0
+                    self.A_buffer[k][i][buffer_id] = 0
             return
 
         assert lora_adapter is not None
+        lora_rank = lora_adapter.config.hf_config["r"]
         for layer_id in range(self.num_layer):
             layer_weights = lora_adapter.layers[layer_id].weights
             temp_A_buffer: Dict[str, torch.Tensor] = {}
@@ -208,17 +209,22 @@
             )
 
             for name, weights in temp_A_buffer.items():
-                self.A_buffer[name][layer_id][buffer_id].copy_(weights)
+                c = get_stacked_multiply(name)
+                self.A_buffer[name][layer_id][buffer_id][: lora_rank * c, :].copy_(
+                    weights
+                )
 
             for name, weights in temp_B_buffer.items():
                 c = get_stacked_multiply(name)
                 if c > 1:
                     for stacked_id in range(c):
-                        self.B_buffer[name][layer_id][stacked_id][buffer_id].copy_(
-                            weights[stacked_id]
-                        )
+                        self.B_buffer[name][layer_id][stacked_id][buffer_id][
+                            :, :lora_rank
+                        ].copy_(weights[stacked_id])
                 else:
-                    self.B_buffer[name][layer_id][0][buffer_id].copy_(weights)
+                    self.B_buffer[name][layer_id][0][buffer_id][:, :lora_rank].copy_(
+                        weights
+                    )
 
     def get_tensor(
         self, weight_name: str, layer_id: int, lora_type: LoRAType
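Note: the pool now copies an adapter's weights into only the leading lora_rank rows/columns of each fixed-size slot, which is what allows adapters of different ranks to share one pool; the rank-aware kernels clamp K to the adapter's rank so the unused tail is never read. A toy version of that sliced copy (buffer sizes are made up):

import torch

max_rank, in_dim = 32, 64
# One pre-allocated A-buffer slot sized for the largest supported rank.
a_slot = torch.zeros(max_rank, in_dim)

# An adapter with a smaller rank only fills the top rows of the slot.
lora_rank = 8
adapter_a = torch.randn(lora_rank, in_dim)
a_slot[:lora_rank, :].copy_(adapter_a)

# Rows lora_rank..max_rank stay zero; kernels that clamp K to lora_rank
# never include these padding rows in the GEMM.
assert torch.equal(a_slot[lora_rank:], torch.zeros(max_rank - lora_rank, in_dim))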
sglang/srt/lora/triton_ops/gate_up_lora_b.py CHANGED
@@ -22,17 +22,18 @@ def _gate_up_lora_b_kernel(
     w_stride_2,
     output_stride_0,
     output_stride_1,
-    # Information on sequence lengths and weight id
+    # Information on sequence lengths,ranks and weight id
     seg_lens,
     seg_indptr,
     weight_indices,
+    lora_ranks,
     # Meta parameters
     BLOCK_S: tl.constexpr,
     BLOCK_N: tl.constexpr,
     BLOCK_K: tl.constexpr,
     # For fused output scaling and adding
     fuse_scaling_add,
-    scaling,
+    scalings,
 ):
     # This kernel packs 2 sgemms (gate/up) into a single kernel.
 
@@ -51,6 +52,11 @@ def _gate_up_lora_b_kernel(
     w_index = tl.load(weight_indices + batch_id)
     seg_start = tl.load(seg_indptr + batch_id)
     n_start = gate_up_id * output_dim  # offset on output dim
+    rank = tl.load(lora_ranks + w_index)
+    scaling = tl.load(scalings + w_index)
+
+    # Adjust K (rank) according to the specific LoRA adapter
+    K = tl.minimum(K, rank)
 
     # The tile in output matrix will have (pid_s, pid_n) as id
     num_pid_n = tl.cdiv(output_dim, BLOCK_N)
@@ -109,7 +115,6 @@ def gate_up_lora_b_fwd(
     batch_info: LoRABatchInfo,
     output_dim: int,
     base_output: torch.Tensor = None,
-    scaling: float = 1.0,
 ) -> torch.Tensor:
 
     # x: (s, 2 * r)
@@ -160,11 +165,12 @@
         batch_info.seg_lens,
         batch_info.seg_indptr,
         batch_info.weight_indices,
+        batch_info.lora_ranks,
         BLOCK_S,
         BLOCK_OUT,
         BLOCK_R,
         fuse_scaling_add,
-        scaling,
+        batch_info.scalings,
     )
 
     return output
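Note: both Triton kernels now load the per-adapter rank and scaling by weight index and clamp the reduction length K to that rank. In plain PyTorch, the per-segment computation looks roughly like this; it is a reference for the semantics, not the tiled kernel:

import torch

def lora_b_segment_reference(
    lora_a_output: torch.Tensor,  # (seg_len, max_rank), zero-padded past `rank`
    lora_b: torch.Tensor,         # (output_dim, max_rank) slot for this adapter
    rank: int,                    # adapter-specific rank from lora_ranks[w_index]
    scaling: float,               # adapter-specific scaling from scalings[w_index]
    base_output: torch.Tensor,    # (seg_len, output_dim)
) -> torch.Tensor:
    # Limiting the reduction dimension to `rank` mirrors K = tl.minimum(K, rank).
    contribution = lora_a_output[:, :rank] @ lora_b[:, :rank].t()
    return base_output + scaling * contribution

seg_len, max_rank, output_dim, rank = 5, 32, 128, 8
out = lora_b_segment_reference(
    torch.randn(seg_len, max_rank), torch.randn(output_dim, max_rank),
    rank, 2.0, torch.randn(seg_len, output_dim),
)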
sglang/srt/lora/triton_ops/qkv_lora_b.py CHANGED
@@ -26,6 +26,7 @@ def _qkv_lora_b_kernel(
     seg_lens,
     seg_indptr,
     weight_indices,
+    lora_ranks,
     # Offsets of q/k/v slice on output dimension
     n_offs,
     # Meta parameters
@@ -34,7 +35,7 @@ def _qkv_lora_b_kernel(
     BLOCK_K: tl.constexpr,
     # For fused output scaling and adding
     fuse_scaling_add,
-    scaling,
+    scalings,
 ):
     # This kernel packs 3 sgemms (q/k/v) into a single kernel.
 
@@ -54,6 +55,10 @@
     seg_start = tl.load(seg_indptr + batch_id)
     n_start = tl.load(n_offs + qkv_id)
     n_size = tl.load(n_offs + qkv_id + 1) - n_start
+    rank = tl.load(lora_ranks + w_index)
+    scaling = tl.load(scalings + w_index)
+    # Adjust K (rank) according to the specific LoRA adapter
+    K = tl.minimum(K, rank)
 
     # The tile in output matrix will have (pid_s, pid_n) as id
     num_pid_n = tl.cdiv(max_qkv_out_dim, BLOCK_N)
@@ -112,7 +117,6 @@ def qkv_lora_b_fwd(
     output_offset: torch.Tensor,
     max_qkv_out_dim: int,
     base_output: torch.Tensor = None,
-    scaling: float = 1.0,
 ) -> torch.Tensor:
 
     # x: (s, 3 * r)
@@ -171,12 +175,13 @@
         batch_info.seg_lens,
         batch_info.seg_indptr,
         batch_info.weight_indices,
+        batch_info.lora_ranks,
         output_offset,
         BLOCK_S,
         BLOCK_OUT,
         BLOCK_R,
         fuse_scaling_add,
-        scaling,
+        batch_info.scalings,
     )
 
     return output