sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/sampler.py
CHANGED
@@ -5,7 +5,7 @@ import torch
 import torch.distributed as dist
 from torch import nn
 
-from sglang.srt.distributed import
+from sglang.srt.distributed import get_tp_group
 from sglang.srt.layers.dp_attention import get_attention_tp_group
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -30,7 +30,7 @@ class Sampler(nn.Module):
     def __init__(self):
         super().__init__()
         self.use_nan_detection = global_server_args_dict["enable_nan_detection"]
-        self.tp_sync_group =
+        self.tp_sync_group = get_tp_group().device_group
 
         if global_server_args_dict["enable_dp_attention"]:
             self.tp_sync_group = get_attention_tp_group().device_group
@@ -59,7 +59,7 @@ class Sampler(nn.Module):
 
         # Apply the custom logit processors if registered in the sampling info.
         if sampling_info.has_custom_logit_processor:
-
+            apply_custom_logit_processor(logits, sampling_info)
 
         if self.use_nan_detection and torch.any(torch.isnan(logits)):
             logger.warning("Detected errors during sampling! NaN in the logits.")
@@ -81,54 +81,39 @@ class Sampler(nn.Module):
             probs = logits
             del logits
 
-            if
-                if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                # Check Nan will throw exception, only check when crash_on_warnings is True
-                check_nan = self.use_nan_detection and crash_on_warnings()
-                batch_next_token_ids = top_k_top_p_sampling_from_probs(
+            if True:  # Keep this redundant check to simplify some internal code sync
+                if global_server_args_dict["sampling_backend"] == "flashinfer":
+                    if sampling_info.need_min_p_sampling:
+                        probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+                        probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+                        batch_next_token_ids = min_p_sampling_from_probs(
+                            probs, sampling_info.min_ps
+                        )
+                    else:
+                        batch_next_token_ids = top_k_top_p_sampling_from_probs(
+                            probs,
+                            sampling_info.top_ks,
+                            sampling_info.top_ps,
+                            filter_apply_order="joint",
+                            check_nan=self.use_nan_detection,
+                        )
+                elif global_server_args_dict["sampling_backend"] == "pytorch":
+                    # A slower fallback implementation with torch native operations.
+                    batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
                         probs,
                         sampling_info.top_ks,
                         sampling_info.top_ps,
-
-
+                        sampling_info.min_ps,
+                        sampling_info.need_min_p_sampling,
+                    )
+                else:
+                    raise ValueError(
+                        f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
                     )
 
-
-                #
-
-                    probs,
-                    sampling_info.top_ks,
-                    sampling_info.top_ps,
-                    sampling_info.min_ps,
-                    sampling_info.need_min_p_sampling,
-                )
-
-                if return_logprob:
-                    # clamp to avoid -inf
-                    logprobs = torch.log(
-                        top_p_normalize_probs_torch(probs, sampling_info.top_ps)
-                    ).clamp(min=torch.finfo(probs.dtype).min)
-            else:
-                raise ValueError(
-                    f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
-                )
+                if return_logprob:
+                    # clamp to avoid -inf
+                    logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)
 
         # Attach logprobs to logits_output (in-place modification)
         if return_logprob:
@@ -165,39 +150,6 @@ class Sampler(nn.Module):
 
         return batch_next_token_ids
 
-    def _apply_custom_logit_processor(
-        self, logits: torch.Tensor, sampling_batch_info: SamplingBatchInfo
-    ):
-        """Apply custom logit processors to the logits.
-        This function will modify the logits in-place."""
-
-        assert logits.shape[0] == len(sampling_batch_info), (
-            f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
-            f"sampling_batch_info ({len(sampling_batch_info)})"
-        )
-
-        for _, (
-            processor,
-            batch_mask,
-        ) in sampling_batch_info.custom_logit_processor.items():
-            # Get the batch indices that need to be processed
-            batch_indices = batch_mask.nonzero(as_tuple=True)[0]
-
-            assert batch_mask.shape[0] == len(sampling_batch_info), (
-                f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
-                f"sampling_batch_info ({len(sampling_batch_info)})"
-            )
-
-            # Apply the processor to the logits
-            logits[batch_mask] = processor(
-                logits[batch_mask],
-                [sampling_batch_info.custom_params[i] for i in batch_indices],
-            )
-
-            logger.debug(
-                f"Custom logit processor {processor.__class__.__name__} is applied."
-            )
-
 
 def top_k_top_p_min_p_sampling_from_probs_torch(
     probs: torch.Tensor,
@@ -226,6 +178,14 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     return batch_next_token_ids
 
 
+def sampling_from_probs_torch(probs: torch.Tensor):
+    """A sampling implementation with native pytorch operations, without
+    top-k, top-p, or min-p filtering."""
+    sampled_index = torch.multinomial(probs, num_samples=1)
+    batch_next_token_ids = sampled_index.view(-1).to(torch.int32)
+    return batch_next_token_ids
+
+
 def top_p_normalize_probs_torch(
     probs: torch.Tensor,
     top_ps: torch.Tensor,
@@ -264,3 +224,44 @@ def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List
         output_token_ids_logprobs_idx.append([])
 
     return output_token_ids_logprobs_val, output_token_ids_logprobs_idx
+
+
+def apply_custom_logit_processor(
+    logits: torch.Tensor,
+    sampling_batch_info: SamplingBatchInfo,
+    num_tokens_in_batch: int = 1,
+):
+    """Apply custom logit processors to the logits.
+    This function will modify the logits in-place.
+    num_tokens_in_batch is needed to support spec decoding, where each batch can contain multiple
+    tokens. By default, we assume each batch contains only 1 token.
+    """
+
+    assert logits.shape[0] == len(sampling_batch_info) * num_tokens_in_batch, (
+        f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
+        f"sampling_batch_info ({len(sampling_batch_info)}) x num_tokens_in_batch "
+        f"({num_tokens_in_batch})"
+    )
+
+    for _, (
+        processor,
+        batch_mask,
+    ) in sampling_batch_info.custom_logit_processor.items():
+        # Get the batch indices that need to be processed
+        batch_indices = batch_mask.nonzero(as_tuple=True)[0]
+
+        assert batch_mask.shape[0] == len(sampling_batch_info), (
+            f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
+            f"sampling_batch_info ({len(sampling_batch_info)})"
+        )
+        batch_mask = torch.repeat_interleave(batch_mask, num_tokens_in_batch)
+
+        # Apply the processor to the logits
+        logits[batch_mask] = processor(
+            logits[batch_mask],
+            [sampling_batch_info.custom_params[i] for i in batch_indices],
+        )
+
+        logger.debug(
+            f"Custom logit processor {processor.__class__.__name__} is applied."
+        )
sglang/srt/layers/utils.py
CHANGED
@@ -33,3 +33,9 @@ class PPMissingLayer(torch.nn.Identity):
         """
         input = args[0] if args else next(iter(kwargs.values()))
         return (input,) if self.return_tuple else input
+
+
+def is_sm100_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 10) and (
+        torch.version.cuda >= "12.8"
+    )
sglang/srt/lora/layers.py
CHANGED
@@ -137,7 +137,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
        self.A_buffer_gate_up = A_buffer
        if self.lora_backend.fuse_stacked_lora_b:
            # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
-            if
+            if getattr(self, "B_buffer_gate_up", None) is None:
                self.B_buffer_gate_up = torch.empty(
                    (
                        B_buffer[0].shape[0],
@@ -202,7 +202,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
        output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2]
 
        # B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r)
-        if
+        if getattr(self, "B_buffer_qkv", None) is None:
            self.B_buffer_qkv = torch.empty(
                (
                    B_buffer_q[0].shape[0],
@@ -221,20 +221,17 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
            )
 
        # Offsets of q/k/v in output dimension
-        if
-            self.output_offset = torch.
-
+        if getattr(self, "output_offset", None) is None:
+            self.output_offset = torch.tensor(
+                [
+                    0,
+                    output_dim_q,
+                    output_dim_q + output_dim_kv,
+                    output_dim_q + 2 * output_dim_kv,
+                ],
+                dtype=torch.int32,
+                device=B_buffer_q.device,
            )
-        self.output_offset[:4] = torch.tensor(
-            [
-                0,
-                output_dim_q,
-                output_dim_q + output_dim_kv,
-                output_dim_q + 2 * output_dim_kv,
-            ],
-            dtype=torch.int32,
-            device=B_buffer_q.device,
-        )
        # For computing number of launched blocks
        self.max_qkv_out_dim = max(output_dim_q, output_dim_kv)
    else:
sglang/srt/lora/lora.py
CHANGED
@@ -92,11 +92,12 @@ class LoRAAdapter(nn.Module):
         for i in range(self.base_hf_config.num_hidden_layers):
             layer = self.layers[i]
             weight_names = [name for name, _ in layer.weights.items()]
-            self.
-            self.
-
-    def stack_qkv_proj(self, weight_names: List[str], weights: Dict[str, torch.Tensor]):
+            self.normalize_qkv_proj(weight_names, layer.weights)
+            self.normalize_gate_up_proj(weight_names, layer.weights)
 
+    def normalize_qkv_proj(
+        self, weight_names: List[str], weights: Dict[str, torch.Tensor]
+    ):
         # Collect target q/k/v modules. This process is necessary since there might be no lora attached to k_proj
         target_module = set()
         for weight_name in weight_names:
@@ -106,6 +107,8 @@ class LoRAAdapter(nn.Module):
                 target_module.add("q_proj")
             if "v_proj" in weight_name:
                 target_module.add("v_proj")
+            if "qkv_proj" in weight_name:
+                target_module.add("qkv_proj")
         if len(target_module) == 0:
             return
 
@@ -148,8 +151,35 @@ class LoRAAdapter(nn.Module):
                 if "k_proj" in target_module:
                     weights.pop(k_name)
                     weights.pop(v_name)
+            elif "qkv_proj" in weight_name:
+                # If qkv_proj is already stacked, we normalize it following the SGL convention.
+                qkv_name = weight_name
+                q_name = weight_name.replace("qkv_proj", "q_proj")
+                k_name = weight_name.replace("qkv_proj", "k_proj")
+                v_name = weight_name.replace("qkv_proj", "v_proj")
+                kv_name = weight_name.replace("qkv_proj", "kv_proj")
+                if "lora_A" in weight_name:
+                    weights[qkv_name] = weights[qkv_name].repeat(3, 1)
+                else:
+                    head_size = (
+                        self.base_hf_config.hidden_size
+                        // self.base_hf_config.num_attention_heads
+                    )
+                    weights[q_name], k_proj_weight, v_proj_weight = torch.split(
+                        weights[qkv_name],
+                        [
+                            head_size * self.base_hf_config.num_attention_heads,
+                            head_size * self.base_hf_config.num_key_value_heads,
+                            head_size * self.base_hf_config.num_key_value_heads,
+                        ],
+                        dim=0,
+                    )
+                    weights[kv_name] = torch.stack(
+                        [k_proj_weight, v_proj_weight],
+                        dim=0,
+                    )
 
-    def
+    def normalize_gate_up_proj(
         self, weight_names: List[str], weights: Dict[str, torch.Tensor]
     ):
         for weight_name in weight_names:
@@ -179,3 +209,17 @@ class LoRAAdapter(nn.Module):
                     weights.pop(weight_name)
                 if up_name in weights:
                     weights.pop(up_name)
+            elif "gate_up_proj" in weight_name:
+                # If gate_up_proj is already stacked, we normalize it following the SGL convention
+                gate_up_name = weight_name
+                if "lora_A" in weight_name:
+                    weights[gate_up_name] = weights[gate_up_name].repeat(2, 1)
+                else:
+                    output_dim = weights[gate_up_name].shape[0] // 2
+                    weights[gate_up_name] = torch.stack(
+                        [
+                            weights[gate_up_name][:output_dim, :],
+                            weights[gate_up_name][output_dim:, :],
+                        ],
+                        dim=0,
+                    )
sglang/srt/lora/lora_manager.py
CHANGED
@@ -32,7 +32,7 @@ from sglang.srt.lora.utils import (
|
|
32
32
|
LoRAType,
|
33
33
|
get_customized_names_from_hf_names,
|
34
34
|
get_layer_id,
|
35
|
-
|
35
|
+
get_normalized_lora_weight_names,
|
36
36
|
get_weight_name,
|
37
37
|
)
|
38
38
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
@@ -81,7 +81,7 @@ class LoRAManager:
|
|
81
81
|
seg_indptr=torch.zeros(
|
82
82
|
self.max_bs_in_cuda_graph + 1, dtype=torch.int32
|
83
83
|
),
|
84
|
-
max_len=
|
84
|
+
max_len=1,
|
85
85
|
weight_indices=torch.zeros(
|
86
86
|
self.max_bs_in_cuda_graph, dtype=torch.int32
|
87
87
|
),
|
@@ -89,6 +89,17 @@ class LoRAManager:
|
|
89
89
|
scalings=torch.zeros(self.max_loras_per_batch, dtype=torch.float),
|
90
90
|
)
|
91
91
|
|
92
|
+
# Initialize seg_lens and seg_indptr for CUDA graph as they remain constant
|
93
|
+
# across batches.
|
94
|
+
self.cuda_graph_batch_info.seg_lens[: self.max_bs_in_cuda_graph].fill_(1)
|
95
|
+
torch.cumsum(
|
96
|
+
self.cuda_graph_batch_info.seg_lens[: self.max_bs_in_cuda_graph],
|
97
|
+
dim=0,
|
98
|
+
out=self.cuda_graph_batch_info.seg_indptr[
|
99
|
+
1 : self.max_bs_in_cuda_graph + 1
|
100
|
+
],
|
101
|
+
)
|
102
|
+
|
92
103
|
def init_loras(self):
|
93
104
|
# Config of each LoRA adapter
|
94
105
|
self.configs: Dict[str, LoRAConfig] = {}
|
@@ -101,10 +112,13 @@ class LoRAManager:
|
|
101
112
|
self.hf_target_names.update(self.configs[name].target_modules)
|
102
113
|
|
103
114
|
# Target lora weight names for lora_a and lora_b modules respectively.
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
115
|
+
weights_A: List[str] = []
|
116
|
+
weights_B: List[str] = []
|
117
|
+
for module in self.hf_target_names:
|
118
|
+
lora_A, lora_B = get_normalized_lora_weight_names(module)
|
119
|
+
weights_A += lora_A
|
120
|
+
weights_B += lora_B
|
121
|
+
self.lora_weight_names: Tuple[Set[str]] = set(weights_A), set(weights_B)
|
108
122
|
|
109
123
|
# load all weights to cpu
|
110
124
|
self.loras: Dict[str, LoRAAdapter] = {}
|
@@ -156,6 +170,45 @@ class LoRAManager:
         # set up batch info shared by all lora modules
         bs = forward_batch.batch_size

+        def transfer_adapter_info(
+            weight_indices_out: torch.Tensor,
+            lora_ranks_out: torch.Tensor,
+            scalings_out: torch.Tensor,
+        ):
+            """
+            Transfer adapter metadata (weight indices, LoRA rank, scalings) from host
+            to device (CUDA) asynchronously.
+            """
+            weight_indices = [0] * len(forward_batch.lora_paths)
+            lora_ranks = [0] * self.max_loras_per_batch
+            scalings = [0] * self.max_loras_per_batch
+            for i, lora_path in enumerate(forward_batch.lora_paths):
+                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
+                if lora_path is not None:
+                    lora = self.loras[lora_path]
+                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
+                    scalings[weight_indices[i]] = lora.scaling
+
+            # Use pinned memory to avoid synchronizations during host-to-device transfer
+            weight_indices_tensor = torch.tensor(
+                weight_indices, dtype=torch.int32, pin_memory=True, device="cpu"
+            )
+            lora_ranks_tensor = torch.tensor(
+                lora_ranks, dtype=torch.int32, pin_memory=True, device="cpu"
+            )
+            scalings_tensor = torch.tensor(
+                scalings, dtype=torch.float, pin_memory=True, device="cpu"
+            )
+
+            # Copy to device tensors asynchronously
+            weight_indices_out[:bs].copy_(weight_indices_tensor, non_blocking=True)
+            lora_ranks_out[: self.max_loras_per_batch].copy_(
+                lora_ranks_tensor, non_blocking=True
+            )
+            scalings_out[: self.max_loras_per_batch].copy_(
+                scalings_tensor, non_blocking=True
+            )
+
         if (
             hasattr(self, "max_bs_in_cuda_graph")
             and bs <= self.max_bs_in_cuda_graph
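Note: the pinned-memory pattern in transfer_adapter_info is what makes copy_(..., non_blocking=True) worthwhile; a host-to-device copy from pageable memory generally cannot overlap with other GPU work. A minimal sketch of the idea on its own, independent of the surrounding class (function and buffer names are assumed):

import torch

def async_h2d(values, out: torch.Tensor) -> None:
    # Stage the host data in pinned (page-locked) memory so the H2D copy can
    # proceed without forcing a host synchronization.
    staging = torch.tensor(values, dtype=out.dtype, pin_memory=True, device="cpu")
    out[: len(values)].copy_(staging, non_blocking=True)

if torch.cuda.is_available():
    weight_indices = torch.empty(8, dtype=torch.int32, device="cuda")
    async_h2d([0, 1, 1, 0], weight_indices)
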
@@ -163,51 +216,46 @@ class LoRAManager:
         ):
             # Do in-place updates when CUDA graph is enabled and the batch forward mode
             # could use CUDA graph.
-
-
-
-            self.cuda_graph_batch_info.
-
-                out=self.cuda_graph_batch_info.seg_indptr[1 : bs + 1],
+
+            transfer_adapter_info(
+                self.cuda_graph_batch_info.weight_indices,
+                self.cuda_graph_batch_info.lora_ranks,
+                self.cuda_graph_batch_info.scalings,
             )
-            self.cuda_graph_batch_info.max_len = 1

-
-
-                    self.memory_pool.get_buffer_id(lora_path)
-                )
-                if lora_path is not None:
-                    lora = self.loras[lora_path]
-                    self.cuda_graph_batch_info.lora_ranks[
-                        self.cuda_graph_batch_info.weight_indices[i]
-                    ] = lora.config.hf_config["r"]
-                    self.cuda_graph_batch_info.scalings[
-                        self.cuda_graph_batch_info.weight_indices[i]
-                    ] = lora.scaling
+            self.cuda_graph_batch_info.bs = bs
+            self.cuda_graph_batch_info.max_len = 1
             batch_info = self.cuda_graph_batch_info
         else:
+            weight_indices = torch.empty((bs,), dtype=torch.int32, device=self.device)
+            lora_ranks = torch.zeros(
+                (self.max_loras_per_batch,), dtype=torch.int64, device=self.device
+            )
+            scalings = torch.zeros(
+                (self.max_loras_per_batch,), dtype=torch.float, device=self.device
+            )
+            transfer_adapter_info(
+                weight_indices,
+                lora_ranks,
+                scalings,
+            )
+
             seg_lens = (
                 forward_batch.extend_seq_lens
                 if forward_batch.forward_mode.is_extend()
                 else torch.ones(bs, device=self.device)
            )
+
+            max_len = (
+                # Calculate max_len from the CPU copy to avoid D2H transfer.
+                max(forward_batch.extend_seq_lens_cpu)
+                if forward_batch.forward_mode.is_extend()
+                else 1
+            )
+
             seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
             seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
-            max_len = int(torch.max(seg_lens))
-            weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)

-            lora_ranks = torch.zeros(
-                (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
-            )
-            scalings = torch.zeros(
-                (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
-            )
-            for i, lora_path in enumerate(forward_batch.lora_paths):
-                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
-                if lora_path is not None:
-                    lora = self.loras[lora_path]
-                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
-                    scalings[weight_indices[i]] = lora.scaling
             batch_info = LoRABatchInfo(
                 bs=bs,
                 seg_lens=seg_lens,
@@ -263,7 +311,18 @@ class LoRAManager:
         self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = {
             i: [] for i in range(self.base_hf_config.num_hidden_layers)
         }
+
         for module_name, module in self.base_model.named_modules():
+            # TODO (lifuhuang): in the future, we should consider generalizing the
+            # should_apply_lora function to support mapping by full module name instead
+            # of just the last part (e.g., "qkv_proj") to support scenarios with multiple
+            # attention stacks (e.g., multimodal models).
+            # See: https://github.com/sgl-project/sglang/issues/6608
+            if getattr(
+                self.base_model, "should_apply_lora", None
+            ) and not self.base_model.should_apply_lora(module_name):
+                continue
+
             # The module should be converted if it is included in target_names
             if module_name.split(".")[-1] in customized_target_names:
                 layer_id = get_layer_id(module_name)
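Note: the should_apply_lora check above treats the method as an optional hook resolved via getattr; models that define it can opt individual modules out of LoRA injection, and models that do not are unaffected. A small self-contained sketch of that pattern (the classes and module names here are illustrative):

class BaseModelWithFilter:
    # Hypothetical model that excludes its vision tower from LoRA injection.
    def should_apply_lora(self, module_name: str) -> bool:
        return not module_name.startswith("vision_tower.")

class BaseModelWithoutFilter:
    pass

def lora_target_modules(model, module_names):
    targets = []
    for name in module_names:
        # Optional hook: only consulted when the model defines it.
        if getattr(model, "should_apply_lora", None) and not model.should_apply_lora(name):
            continue
        targets.append(name)
    return targets

names = ["model.layers.0.self_attn.qkv_proj", "vision_tower.blocks.0.attn.qkv"]
print(lora_target_modules(BaseModelWithFilter(), names))     # only the LLM module
print(lora_target_modules(BaseModelWithoutFilter(), names))  # both modules
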
sglang/srt/lora/mem_pool.py
CHANGED
@@ -91,18 +91,16 @@ class LoRAMemoryPool:

     def init_buffers(
         self,
-        lora_weight_names: Set[
+        lora_weight_names: Tuple[Set[str]],
         base_model: torch.nn.Module,
     ):

         # lora_weight_names is a set of name pairs indicating each pair of lora modules to load
         # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
-        self.lora_weight_names: Set[
+        self.lora_weight_names: Tuple[Set[str]] = lora_weight_names
         device = next(base_model.parameters()).device
-        lora_module_A_names = set([name[0] for name in lora_weight_names])
-        lora_module_B_names = set([name[1] for name in lora_weight_names])
         # Init A tensor, column_major=False
-        for module_A in
+        for module_A in lora_weight_names[0]:
             lora_A_shape = self.get_lora_A_shape(module_A, base_model)
             self.A_buffer[module_A] = [
                 torch.empty(
@@ -110,10 +108,10 @@ class LoRAMemoryPool:
                     dtype=self.dtype,
                     device=device,
                 )
-                for
+                for _ in range(self.num_layer)
             ]
         # Init B tensor, column_major=True
-        for module_B in
+        for module_B in lora_weight_names[1]:
             lora_B_shape = self.get_lora_B_shape(module_B, base_model)
             self.B_buffer[module_B] = [
                 torch.empty(
@@ -134,12 +132,13 @@ class LoRAMemoryPool:
            for buffer_id in range(self.max_loras_per_batch):
                # Prioritize empty slots
                if self.buffer_id_to_uid[buffer_id] == "":
-                    return buffer_id
+                    return buffer_id

            for buffer_id in range(self.max_loras_per_batch):
                # Evict unneeded lora
                if self.buffer_id_to_uid[buffer_id] not in cur_uids:
-
+                    self.uid_to_buffer_id.pop(self.buffer_id_to_uid[buffer_id])
+                    return buffer_id

            raise ValueError(
                "No available buffer slots found. Please ensure the number of active loras is less than max_loras_per_batch."
@@ -147,9 +146,7 @@ class LoRAMemoryPool:

        for uid in cur_uids:
            if uid not in self.uid_to_buffer_id:
-                buffer_id
-                if evicted_lora_uid != "":
-                    self.uid_to_buffer_id.pop(evicted_lora_uid)
+                buffer_id = get_available_buffer_slot()
                 self.load_lora_weight_to_buffer(
                     uid, buffer_id, lora_adapters.get(uid, None)
                 )
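Note: get_available_buffer_slot implements a two-pass policy: prefer a never-used slot, otherwise evict the first loaded adapter the current batch does not need, and fail if every slot is still in use. A minimal standalone sketch of the same policy outside the memory pool class (dictionary names are assumed):

from typing import Dict, Set

def pick_slot(
    buffer_id_to_uid: Dict[int, str],
    uid_to_buffer_id: Dict[str, int],
    cur_uids: Set[str],
) -> int:
    # Pass 1: prefer a slot that has never been filled.
    for buffer_id, uid in buffer_id_to_uid.items():
        if uid == "":
            return buffer_id
    # Pass 2: evict the first adapter the current batch does not need.
    for buffer_id, uid in buffer_id_to_uid.items():
        if uid not in cur_uids:
            uid_to_buffer_id.pop(uid)
            return buffer_id
    raise ValueError("No available buffer slots; all loaded adapters are still in use.")

slots = {0: "adapter-a", 1: "adapter-b", 2: ""}
uids = {"adapter-a": 0, "adapter-b": 1}
print(pick_slot(slots, uids, cur_uids={"adapter-a"}))  # 2 (empty slot wins)
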
@@ -159,6 +156,10 @@ class LoRAMemoryPool:
     def load_lora_weight_to_buffer(
         self, uid: str, buffer_id: int, lora_adapter: LoRAAdapter = None
     ):
+        def check_lora_weight_shape(buffer_view: torch.Tensor, weight: torch.Tensor):
+            assert (
+                buffer_view.shape == weight.shape
+            ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."

         if uid is None:
             for i in range(self.num_layer):
@@ -210,21 +211,27 @@ class LoRAMemoryPool:

             for name, weights in temp_A_buffer.items():
                 c = get_stacked_multiply(name)
-                self.A_buffer[name][layer_id][buffer_id][
-
-
+                buffer_view = self.A_buffer[name][layer_id][buffer_id][
+                    : lora_rank * c, :
+                ]
+                check_lora_weight_shape(buffer_view, weights)
+                buffer_view.copy_(weights)

             for name, weights in temp_B_buffer.items():
                 c = get_stacked_multiply(name)
                 if c > 1:
                     for stacked_id in range(c):
-                        self.B_buffer[name][layer_id][stacked_id][
-
-                        ]
+                        buffer_view = self.B_buffer[name][layer_id][stacked_id][
+                            buffer_id
+                        ][:, :lora_rank]
+                        check_lora_weight_shape(buffer_view, weights[stacked_id])
+                        buffer_view.copy_(weights[stacked_id])
                 else:
-                    self.B_buffer[name][layer_id][0][buffer_id][
-
-
+                    buffer_view = self.B_buffer[name][layer_id][0][buffer_id][
+                        :, :lora_rank
+                    ]
+                    check_lora_weight_shape(buffer_view, weights)
+                    buffer_view.copy_(weights)

     def get_tensor(
         self, weight_name: str, layer_id: int, lora_type: LoRAType
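Note: the copy path above always materializes a sliced buffer_view first, asserts that its shape matches the incoming weight, and only then writes in place with copy_. A minimal sketch of the same guard outside the class (buffer sizes and rank are assumed):

import torch

def copy_into_buffer(buffer: torch.Tensor, weight: torch.Tensor, lora_rank: int) -> None:
    # Slice the preallocated (max_rank, hidden) buffer down to the adapter's rank,
    # check shapes before writing, then copy in place.
    buffer_view = buffer[:lora_rank, :]
    assert (
        buffer_view.shape == weight.shape
    ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."
    buffer_view.copy_(weight)

max_rank, hidden = 64, 128
buffer = torch.zeros(max_rank, hidden)
copy_into_buffer(buffer, torch.randn(16, hidden), lora_rank=16)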