PyPI - vllm-cpu - Versions diffs - 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu might be problematic. Click here for more details.

Files changed (1103) hide show

vllm/lora/ops/triton_ops/lora_kernel_metadata.py ADDED Viewed

@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+LoRA kernels metadata preparation utilities.
+"""
+from dataclasses import dataclass
+from typing import Tuple, Union
+import torch
+@dataclass
+class LoRAKernelMeta:
+    """
+    Metadata tensors handed to the lora_shrink / lora_expand operations.
+    The buffers are allocated by `make`, refilled in place by
+    `prepare_tensors` for each forward pass, and returned in
+    kernel-argument order by `meta_args`.
+    """
+    # Per-token LoRA id. A value of -1 means the token uses no LoRA.
+    token_lora_mapping: torch.Tensor
+    # Token indices obtained by a stable sort of token_lora_mapping, so that
+    # tokens sharing a LoRA id sit next to each other.
+    token_indices_sorted_by_lora_ids: torch.Tensor
+    # Distinct ids found in token_lora_mapping; unused slots are reset to -1.
+    active_lora_ids: torch.Tensor
+    # num_tokens_per_lora[i] is the number of tokens for active_lora_ids[i].
+    num_tokens_per_lora: torch.Tensor
+    # A leading 0 followed by the running sum of num_tokens_per_lora.
+    lora_token_start_loc: torch.Tensor
+    # The V1 architecture uses the traced torch.compile graphs to execute
+    # a forward pass. Things to note about this process,
+    # 1. The tracing infers all python scalar datatype objects into a constant
+    # value.
+    # 2. The tracing cannot handle dynamic control flow. (dynamic control flow
+    # is an experimental feature in pytorch)
+    # 3. The internals of torch.ops functions are not traced.
+    # We disguise the "no_lora" flag as a cpu tensor and leverage point number 3
+    # to early exit from inside the lora_expand / lora_shrink torch operation.
+    no_lora_flag_cpu: torch.Tensor
+    @staticmethod
+    def make(max_loras: int, max_num_tokens: int,
+             device: Union[torch.device, str]) -> "LoRAKernelMeta":
+        """
+        Allocate the metadata buffers for the given capacity.
+        Args:
+            max_loras (int): Maximum number of LoRAs. The per-LoRA buffers get
+            extra slots, as explained in the comments below.
+            max_num_tokens (int): Capacity of the per-token buffers.
+            device (Union[torch.device, str]): Device for every buffer except
+            no_lora_flag_cpu, which is always created on the CPU.
+        """
+        token_lora_mapping = torch.empty(max_num_tokens,
+                                         dtype=torch.int32,
+                                         device=device)
+        token_indices_sorted_by_lora_ids = torch.empty(max_num_tokens,
+                                                       dtype=torch.int32,
+                                                       device=device)
+        # +1 because "no-lora" is also a possibility
+        # example: let max_loras be 3, active_lora_ids of [-1, 0, 2, 1]
+        # is a possibility.
+        active_lora_ids = torch.empty(max_loras + 1,
+                                      dtype=torch.int32,
+                                      device=device)
+        # using running example, [3, 10, 5, 2] is a possibility.
+        num_tokens_per_lora = torch.zeros(max_loras + 1,
+                                          dtype=torch.int32,
+                                          device=device)
+        # +2 for this because, the first index is always 0.
+        # using running example, lora_token_start_loc
+        # is [0, 3, 13, 18, 20].
+        lora_token_start_loc = torch.zeros(max_loras + 2,
+                                           dtype=torch.int32,
+                                           device=device)
+        no_lora_flag_cpu = torch.tensor([False],
+                                        dtype=torch.bool,
+                                        device='cpu')
+        return LoRAKernelMeta(
+            token_lora_mapping=token_lora_mapping,
+            token_indices_sorted_by_lora_ids=token_indices_sorted_by_lora_ids,
+            active_lora_ids=active_lora_ids,
+            num_tokens_per_lora=num_tokens_per_lora,
+            lora_token_start_loc=lora_token_start_loc,
+            no_lora_flag_cpu=no_lora_flag_cpu)
+    def _reset(self):
+        """
+        Restore the per-LoRA buffers and the no-lora flag to their neutral
+        values (-1 ids, zero counts/offsets, flag False). The two per-token
+        buffers are not touched here.
+        """
+        self.active_lora_ids.fill_(-1)
+        self.num_tokens_per_lora.fill_(0)
+        self.lora_token_start_loc.fill_(0)
+        self.no_lora_flag_cpu.fill_(False)
+    def prepare_tensors(self, token_lora_mapping: torch.Tensor) -> None:
+        """
+        Prepare kernel metadata tensors for the current forward pass.
+        Args:
+            token_lora_mapping (torch.Tensor): Tensor containing lora indices
+            for each input token.
+        """
+        self._reset()
+        # Check and record no-lora case.
+        no_lora = torch.all(token_lora_mapping == -1)
+        self.no_lora_flag_cpu[0] = no_lora
+        if no_lora:
+            # Early exit. LoRA kernels will not be run.
+            return
+        num_tokens = token_lora_mapping.size(0)
+        # copy token lora mapping
+        self.token_lora_mapping[:num_tokens].copy_(token_lora_mapping,
+                                                   non_blocking=True)
+        # token_indices_sorted_by_lora_ids
+        # stable=True keeps the original token order within each LoRA id.
+        _, token_indices_sorted_by_lora_ids = torch.sort(token_lora_mapping,
+                                                         stable=True)
+        # start gpu transfer
+        self.token_indices_sorted_by_lora_ids[:num_tokens].copy_(
+            token_indices_sorted_by_lora_ids, non_blocking=True)
+        # active_lora_ids, num_tokens_per_lora
+        lora_ids, num_tokens_per_lora = torch.unique(token_lora_mapping,
+                                                     sorted=True,
+                                                     return_counts=True)
+        # Only the leading slots are overwritten; the rest keep the values
+        # written by _reset (-1 ids, zero counts).
+        self.active_lora_ids[:lora_ids.size(0)].copy_(lora_ids,
+                                                      non_blocking=True)
+        self.num_tokens_per_lora[:num_tokens_per_lora.size(0)].copy_(
+            num_tokens_per_lora, non_blocking=True)
+        # lora_token_start_loc
+        # Index 0 stays 0 from _reset; the running sums are stored from
+        # index 1 onwards.
+        lora_token_start_loc = torch.cumsum(num_tokens_per_lora, dim=0)
+        self.lora_token_start_loc[1:1 + lora_token_start_loc.size(0)].copy_(
+            lora_token_start_loc, non_blocking=True)
+    def meta_args(
+        self, token_nums: int
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
+               torch.Tensor, torch.Tensor]:
+        """
+        This function returns the kernel metadata required for the current
+        forward pass execution of the kernel. The function returns all the
+        metadata required by the kernel, in order, as a tuple, so it can be
+        unpacked directly during the lora_shrink/lora_expand function call.
+        Args:
+            token_nums (int): Number of input tokens in the current forward
+            pass.
+        Returns:
+            The tuple (token_lora_mapping, token_indices_sorted_by_lora_ids,
+            num_tokens_per_lora, lora_token_start_loc, active_lora_ids,
+            no_lora_flag_cpu). The two per-token tensors are sliced to
+            token_nums; the others are returned whole.
+        """
+        return (
+            self.token_lora_mapping[:token_nums],
+            self.token_indices_sorted_by_lora_ids[:token_nums],
+            self.num_tokens_per_lora,
+            self.lora_token_start_loc,
+            self.active_lora_ids,
+            self.no_lora_flag_cpu,
+        )

vllm/lora/ops/triton_ops/lora_shrink.py ADDED Viewed

@@ -0,0 +1,247 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Based on:
+Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
+Punica: Multi-Tenant LoRA Serving.
+https://arxiv.org/abs/2310.18547
+"""
+from typing import List
+import torch
+import triton
+import triton.language as tl
+from vllm.lora.ops.triton_ops.kernel_utils import do_shrink_kernel
+from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr
+from vllm.utils import direct_register_custom_op
+@triton.jit
+def _lora_shrink_kernel(input_ptr, lora_ptr, out_ptr, M, N, K,
+                        token_indices_sorted_by_lora_ids, num_tokens_per_lora,
+                        lora_token_start_loc, lora_ids, scaling,
+                        input_d0_stride, input_d1_stride, lora_d0_stride,
+                        lora_d1_stride, lora_d2_stride, output_d0_stride,
+                        output_d1_stride, output_d2_stride,
+                        BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
+                        BLOCK_K: tl.constexpr, EVEN_K: tl.constexpr,
+                        SPLIT_K: tl.constexpr, SLICE_NUM: tl.constexpr):
+    # Triton kernel for the LoRA shrink step. Each program works out which
+    # (split-k part, M tile, N tile, slice, LoRA) it owns, gathers the input
+    # row indices assigned to that LoRA and delegates the actual computation
+    # to do_shrink_kernel.
+    #
+    # Grid axis 0 is a flattened (split-k, M tile, N tile) index with split-k
+    # varying fastest; axis 1 selects the slice; axis 2 selects an entry of
+    # lora_ids.
+    cta_n_num = tl.cdiv(N, BLOCK_N)
+    cta_m_num = tl.cdiv(M, BLOCK_M)
+    pid_sk_m_n = tl.program_id(axis=0)
+    pid_sk = pid_sk_m_n % SPLIT_K
+    pid_m = (pid_sk_m_n // SPLIT_K) % cta_m_num
+    pid_n = pid_sk_m_n // (SPLIT_K * cta_m_num) % cta_n_num
+    slice_id = tl.program_id(axis=1)
+    lora_idx = tl.program_id(axis=2)
+    lora_id = tl.load(lora_ids + lora_idx)
+    if lora_id == -1:
+        # Early exit for the no-lora case.
+        return
+    # Number of tokens assigned to this LoRA.
+    lora_m_size = tl.load(num_tokens_per_lora + lora_idx)
+    cta_m_offset = pid_m * BLOCK_M
+    if cta_m_offset >= lora_m_size:
+        # Early exit CTA.
+        return
+    # num rows this CTA should process.
+    cta_m_len = min(BLOCK_M, lora_m_size - cta_m_offset)
+    # Identify all rows that this CTA should process.
+    lora_m_indices_start = tl.load(lora_token_start_loc + lora_idx)
+    cta_lora_seq_indices = (token_indices_sorted_by_lora_ids +
+                            lora_m_indices_start + cta_m_offset)
+    # Load all relevant row indices.
+    # The modulo wraps lanes beyond cta_m_len back onto valid entries, so the
+    # load never reads past this CTA's rows.
+    # NOTE(review): presumably do_shrink_kernel masks the wrapped duplicates
+    # using cta_m_len - confirm in kernel_utils.
+    offset_m = tl.arange(0, BLOCK_M) % cta_m_len
+    ram = tl.load(cta_lora_seq_indices + offset_m)
+    do_shrink_kernel(
+        pid_n,
+        pid_sk,
+        slice_id,
+        lora_id,
+        input_ptr,
+        lora_ptr,
+        out_ptr,
+        N,
+        K,
+        cta_m_len,
+        ram,  # array identifying the rows of Input ptr to operate on
+        # input strides
+        input_d0_stride,
+        input_d1_stride,
+        # lora strides
+        lora_d0_stride,
+        lora_d1_stride,
+        lora_d2_stride,
+        # output strides
+        output_d0_stride,
+        output_d1_stride,
+        output_d2_stride,
+        scaling,
+        BLOCK_M,
+        BLOCK_N,
+        BLOCK_K,
+        EVEN_K,
+        SPLIT_K,
+        SLICE_NUM)
+@torch.inference_mode()
+def _lora_shrink(
+    inputs: torch.Tensor,  #  shape [num_tokens, hidden_size]
+    lora_a_weights: List[
+        torch.Tensor],  # shape [num_loras, lora_rank, hidden_size]
+    output_tensor: torch.Tensor,  # shape [num_slices, num_tokens, lora_rank]
+    token_lora_mapping: torch.Tensor,  # shape [num_tokens]
+    token_indices_sorted_by_lora_ids: torch.Tensor,  # shape [num_tokens]
+    num_tokens_per_lora: torch.Tensor,  # shape [max-loras + 1]
+    lora_token_start_loc: torch.Tensor,  # shape [max-loras + 2]
+    lora_ids: torch.Tensor,  # shape [max-loras + 1]
+    no_lora_flag_cpu: torch.Tensor,  # shape [1]
+    scaling: float,
+) -> None:
+    """
+    Validate the arguments and launch the LoRA shrink Triton kernel, which
+    writes its result into output_tensor.
+    Args:
+        inputs (torch.Tensor): Input tensor
+        lora_a_weights (List[torch.Tensor]): LoRA weights
+        output_tensor (torch.Tensor): output tensor
+        token_lora_mapping (torch.Tensor): A tensor mapping each input token
+            to the lora-id related to that token. A value of -1 indicates that
+            LoRA doesn't apply to that token.
+        token_indices_sorted_by_lora_ids (torch.Tensor): Row/Token indices from
+            the A matrix grouped by LoRA IDs.
+        num_tokens_per_lora (torch.Tensor): num_tokens_per_lora[i] is the number
+            of tokens that are to be processed by LoRA ID lora_ids[i]
+        lora_token_start_loc (torch.Tensor): A cumulative sum of
+            num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
+            lora_token_start_loc[i], along with num_tokens_per_lora[i]
+            identifies the region in token_indices_sorted_by_lora_ids that
+            LoRA lora_ids[i] should process.
+        lora_ids (torch.Tensor): LoRA ids to process.
+        no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1 that is True
+            when none of the inputs require LoRA. In that case the function
+            returns immediately without launching the kernel.
+        scaling (float): Scaling factor.
+    """
+    assert no_lora_flag_cpu.numel() == 1
+    if no_lora_flag_cpu.item():
+        # None of the inputs require LoRA.
+        return
+    assert inputs.dtype == lora_a_weights[0].dtype
+    assert inputs.dtype in [torch.float16, torch.bfloat16]
+    for weight in lora_a_weights:
+        assert weight.dtype in [torch.float16, torch.bfloat16]
+    assert inputs.size(1) == lora_a_weights[0].size(-1)
+    assert inputs.is_contiguous()
+    assert output_tensor.is_contiguous()
+    # metadata sanity check
+    M = inputs.size(0)
+    assert token_lora_mapping.size(0) == M
+    assert token_lora_mapping.size(0) == token_indices_sorted_by_lora_ids.size(
+        0)
+    assert lora_ids.size(0) == num_tokens_per_lora.size(0)
+    assert lora_token_start_loc.size(0) == lora_ids.size(0) + 1
+    # Pointer tensor and strides for the LoRA A weights (cached lookup).
+    (lora_ptr_tensor, lora_strides_d0, lora_strides_d1,
+     lora_strides_d2) = _get_lora_a_ptr(lora_a_weights, inputs.device)
+    N, K = lora_a_weights[0].shape[-2:]  # K=hidden_size,N=rank
+    NUM_SLICES = len(lora_a_weights)
+    # One grid entry along axis 2 for every slot of lora_ids.
+    MAX_LORAS = lora_ids.size(0)
+    # Triton kernel configs
+    BLOCK_M = 32
+    BLOCK_N = 16
+    # Fewer than 128 tokens: larger K blocks and more K splits.
+    BLOCK_K = 256 if M < 128 else 32
+    SPLIT_K = 64 if M < 128 else 8
+    NUM_WARPS = 4
+    NUM_CTAS = 1
+    NUM_STAGES = 2
+    MAX_NREG = None
+    EVEN_K = K % (BLOCK_K * SPLIT_K) == 0  # type: ignore
+    # TODO (varun): This grid formulation maximizes parallelization at the
+    # cost of wasteful thread block launch when only few of the input tokens
+    # require LoRA. This might not be the best in all cases.
+    grid = (
+        SPLIT_K * triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N),
+        NUM_SLICES,
+        # Each LoRA receives its own set of thread blocks for output
+        # computation. If some LoRA doesn't have any tokens to process, its
+        # thread blocks exit early.
+        MAX_LORAS,
+    )
+    # Positional arguments follow the parameter order of _lora_shrink_kernel.
+    _lora_shrink_kernel[grid](
+        inputs,
+        lora_ptr_tensor,
+        output_tensor,
+        M,
+        N,
+        K,
+        token_indices_sorted_by_lora_ids,
+        num_tokens_per_lora,
+        lora_token_start_loc,
+        lora_ids,
+        scaling,
+        inputs.stride(0),
+        inputs.stride(1),
+        lora_strides_d0,
+        lora_strides_d1,
+        lora_strides_d2,
+        output_tensor.stride(0),
+        output_tensor.stride(1),
+        output_tensor.stride(2),
+        BLOCK_M,
+        BLOCK_N,
+        BLOCK_K,
+        EVEN_K,
+        SPLIT_K,
+        NUM_SLICES,
+        num_warps=NUM_WARPS,
+        num_ctas=NUM_CTAS,
+        num_stages=NUM_STAGES,
+        maxnreg=MAX_NREG,
+    )
+    return
+def _lora_shrink_fake(
+    inputs: torch.Tensor,
+    lora_a_weights: List[torch.Tensor],
+    output_tensor: torch.Tensor,
+    token_lora_mapping: torch.Tensor,
+    token_indices_sorted_by_lora_ids: torch.Tensor,
+    num_tokens_per_lora: torch.Tensor,
+    lora_token_start_loc: torch.Tensor,
+    lora_ids: torch.Tensor,
+    no_lora_flag_cpu: torch.Tensor,
+    scaling: float,
+) -> None:
+    """
+    No-op stand-in with the same signature as _lora_shrink. It is passed as
+    the fake_impl when the lora_shrink custom op is registered.
+    """
+    return
+# Register _lora_shrink as the custom op `lora_shrink` and expose it through
+# torch.ops.vllm. output_tensor is declared as the only mutated argument.
+try:
+    direct_register_custom_op(
+        op_name="lora_shrink",
+        op_func=_lora_shrink,
+        mutates_args=["output_tensor"],
+        fake_impl=_lora_shrink_fake,
+    )
+    lora_shrink = torch.ops.vllm.lora_shrink
+except AttributeError:
+    # Fall back to calling the Python function directly.
+    # NOTE(review): presumably raised when the custom-op machinery or the
+    # torch.ops.vllm namespace is unavailable - confirm which case.
+    lora_shrink = _lora_shrink

vllm/lora/ops/triton_ops/utils.py ADDED Viewed

@@ -0,0 +1,121 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Dict, List, Tuple
+import torch
+# Lookup tables filled by _get_lora_a_ptr / _get_lora_b_ptr, keyed by the
+# data pointers of the LoRA weight tensors. The stored tuples start with
+# tensor-like entries but also hold plain Python ints/bools (strides, sizes,
+# flags). `torch.Tensor` is the type; `torch.tensor` is only a factory
+# function and is not valid as an annotation.
+_LORA_A_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.Tensor, ...]] = {}
+_LORA_B_PTR_DICT: Dict[Tuple[int, ...], Tuple[torch.Tensor, ...]] = {}
+def _get_lora_a_ptr(lora_a_weights: List[torch.Tensor], device: torch.device):
+    """
+    Return (pointer tensor, stride d0, stride d1, stride d2) for the given
+    LoRA A weights, memoised in `_LORA_A_PTR_DICT`.
+    `_LORA_A_PTR_DICT` collects the required information during `profile_run`,
+    After this, it remains constant and subsequent usage is through LUT.
+    Refer to:
+    https://github.com/triton-lang/triton/blob/release/3.1.x/python/tutorials/08-grouped-gemm.py
+    """
+    cache_key = tuple(weight.data_ptr() for weight in lora_a_weights)
+    cached = _LORA_A_PTR_DICT.get(cache_key)
+    if cached:
+        return cached
+    ptrs = []
+    # One (d0, d1, d2) stride triple per weight.
+    stride_triples = []
+    for weight in lora_a_weights:
+        if weight.ndim == 4:
+            # 4-D weights must have a singleton second dim, dropped here.
+            assert weight.size(1) == 1
+            weight = weight.squeeze(dim=1)
+        else:
+            assert weight.ndim == 3
+        assert weight.is_contiguous()
+        ptrs.append(weight.data_ptr())
+        stride_triples.append(
+            (weight.stride(0), weight.stride(1), weight.stride(2)))
+    # Several slices: pass a tensor of raw pointers. A single slice: pass the
+    # weight tensor itself.
+    if len(lora_a_weights) > 1:
+        lora_ptr_tensor = torch.tensor(ptrs, device=device)
+    else:
+        lora_ptr_tensor = lora_a_weights[0]
+    # Two triples differ exactly when at least one of the strides differs.
+    if len(set(stride_triples)) > 1:
+        raise ValueError("All LoRA weights must have the same stride.")
+    stride_d0, stride_d1, stride_d2 = stride_triples[0]
+    entry = (lora_ptr_tensor, stride_d0, stride_d1, stride_d2)
+    _LORA_A_PTR_DICT[cache_key] = entry
+    return entry
+def _get_lora_b_ptr(lora_weights: List[torch.Tensor], offset_start: int,
+                    device: torch.device):
+    """
+    Return the pointer/stride/offset metadata for the given LoRA B weights,
+    memoised in `_LORA_B_PTR_DICT`.
+    `_LORA_B_PTR_DICT` collects the required information during `profile_run`,
+    After this, it remains constant and subsequent usage is through LUT.
+    Refer to:
+    https://github.com/triton-lang/triton/blob/release/3.1.x/python/tutorials/08-grouped-gemm.py
+    Args:
+        lora_weights (List[torch.Tensor]): One LoRA B weight per slice, each
+            3-D, or 4-D with a singleton second dimension.
+        offset_start (int): Offset assigned to the first slice; every later
+            slice starts where the previous one ends.
+        device (torch.device): Device for the metadata tensors that are
+            created here.
+    Returns:
+        The tuple (slice_start, lora_ptr, stride_d0, stride_d1, stride_d2,
+        hidden_sizes, same_stride, MAX_N). The stride and hidden-size entries
+        are plain ints when same_stride is True and tensors otherwise.
+    """
+    # The cached slice starts depend on offset_start, so it has to be part of
+    # the key. Otherwise a later call with the same weights but a different
+    # offset would get stale slice starts back.
+    key = (offset_start,
+           *(lora_weight.data_ptr() for lora_weight in lora_weights))
+    if values := _LORA_B_PTR_DICT.get(key):
+        return values
+    slice_offset_lst = []
+    tensor_ptrs = []
+    lora_strides_d0 = []
+    lora_strides_d1 = []
+    lora_strides_d2 = []
+    hidden_sizes = []
+    slice_offset = offset_start
+    for lora_b_weight in lora_weights:
+        if lora_b_weight.ndim == 4:  # shape:(lora_num,1,size,rank)
+            assert lora_b_weight.size(1) == 1
+            lora_b_weight = lora_b_weight.squeeze(dim=1)
+        else:
+            assert lora_b_weight.ndim == 3  # shape:(lora_num,size,rank)
+        assert lora_b_weight.is_contiguous()
+        tensor_ptrs.append(lora_b_weight.data_ptr())
+        lora_strides_d0.append(lora_b_weight.stride(0))
+        lora_strides_d1.append(lora_b_weight.stride(1))
+        lora_strides_d2.append(lora_b_weight.stride(2))
+        slice_offset_lst.append(slice_offset)
+        slice_offset += lora_b_weight.size(1)
+        hidden_sizes.append(lora_b_weight.size(1))
+    if len(lora_weights) > 1:
+        # note these are device tensors
+        lora_ptr_tensor = torch.tensor(tensor_ptrs, device=device)
+        slice_start_tensor = torch.tensor(slice_offset_lst, device=device)
+    else:
+        slice_start_tensor = slice_offset_lst[0]
+        # NOTE(review): this indexes the loop variable (the only weight), not
+        # lora_weights[0] as the A-side helper does. The view starts at the
+        # same data pointer, so it is kept unchanged.
+        lora_ptr_tensor = lora_b_weight[0]
+    # If each lora has the same stride, there's no need to use a
+    # tensor for storage.
+    if (len(set(lora_strides_d0)) == 1 and len(set(lora_strides_d1)) == 1 and
+            len(set(lora_strides_d2)) == 1) and len(set(hidden_sizes)) == 1:
+        lora_strides_d0_tensor = lora_strides_d0[0]
+        lora_strides_d1_tensor = lora_strides_d1[0]
+        lora_strides_d2_tensor = lora_strides_d2[0]
+        hidden_sizes_tensor = hidden_sizes[0]
+        same_stride = True
+    else:
+        lora_strides_d0_tensor = torch.tensor(lora_strides_d0, device=device)
+        lora_strides_d1_tensor = torch.tensor(lora_strides_d1, device=device)
+        lora_strides_d2_tensor = torch.tensor(lora_strides_d2, device=device)
+        hidden_sizes_tensor = torch.tensor(hidden_sizes, device=device)
+        same_stride = False
+    # MAX_N is the maximum hidden size among all the lora_b weights
+    MAX_N = max(hidden_sizes)
+    _LORA_B_PTR_DICT[key] = (slice_start_tensor, lora_ptr_tensor,
+                             lora_strides_d0_tensor, lora_strides_d1_tensor,
+                             lora_strides_d2_tensor, hidden_sizes_tensor,
+                             same_stride, MAX_N)
+    return _LORA_B_PTR_DICT.get(key)

vllm/lora/peft_helper.py ADDED Viewed

@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py
+import json
+import math
+import os
+from dataclasses import MISSING, dataclass, field, fields
+from typing import List, Literal, Optional, Union
+from vllm.config import LoRAConfig
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+@dataclass
+class PEFTHelper:
+    """
+    A helper class for PEFT configurations, specifically designed for LoRA.
+    This class handles configuration validation, compatibility checks for
+    various LoRA implementations.
+    """
+    # Required fields
+    r: int
+    lora_alpha: int
+    target_modules: Union[list[str], str]
+    bias: Literal["none", "all", "lora_only"] = field(default="none")
+    modules_to_save: Optional[list[str]] = field(default=None)
+    # True to use Rank-Stabilized LoRA (rsLoRA, see: https://arxiv.org/abs/2312.03732)
+    use_rslora: bool = field(default=False)
+    # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353)
+    use_dora: bool = field(default=False)
+    # long context lora field
+    context_length: int = field(default=0)
+    # Extra vllm field, start with 'vllm_' to avoid conflict
+    # vllm_lora_scaling_factor is always recomputed in __post_init__.
+    vllm_lora_scaling_factor: float = field(default=1.0)
+    # Defaults to None ("not provided") to match the Optional[int] annotation;
+    # the previous default of False led to a division by zero in
+    # __post_init__ whenever context_length was set.
+    vllm_max_position_embeddings: Optional[int] = field(default=None)
+    vllm_long_context_scaling_factor: Optional[float] = field(default=None)
+    def _validate_features(self) -> List[str]:
+        """
+        Check if there are any unsupported LoRA features.
+        Returns:
+            A list of error messages; empty when nothing unsupported is used.
+        """
+        error_msg = []
+        if self.modules_to_save:
+            error_msg.append("vLLM only supports modules_to_save being None.")
+        if self.use_dora:
+            error_msg.append("vLLM does not yet support DoRA.")
+        return error_msg
+    def __post_init__(self):
+        """
+        Derive the vLLM-specific fields: the LoRA scaling factor and, when
+        context_length is set, the long-context scaling factor.
+        """
+        if self.use_rslora:
+            logger.info_once("Loading LoRA weights trained with rsLoRA.")
+            self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r)
+        else:
+            self.vllm_lora_scaling_factor = self.lora_alpha / self.r
+        if self.context_length:
+            # Any falsy value (None, 0, False) means "not provided". Fall back
+            # to context_length so the division below cannot hit zero.
+            if not self.vllm_max_position_embeddings:
+                self.vllm_max_position_embeddings = self.context_length
+            self.vllm_long_context_scaling_factor = float(
+                math.ceil(self.context_length /
+                          self.vllm_max_position_embeddings))
+    @classmethod
+    def from_dict(cls, config_dict: dict) -> "PEFTHelper":
+        """
+        Build a PEFTHelper from a configuration dictionary, ignoring keys that
+        are not fields of this class.
+        Raises:
+            ValueError: If a field without a default is missing.
+        """
+        # Get all field information from the class
+        class_fields = {f.name: f for f in fields(cls)}
+        # Check for required fields
+        required_fields = {
+            name
+            for name, f in class_fields.items()
+            if f.default is MISSING and f.default_factory is MISSING
+        }
+        # Identify any missing required fields
+        missing_fields = required_fields - set(config_dict.keys())
+        if missing_fields:
+            raise ValueError(
+                f"Missing required configuration fields: {missing_fields}")
+        # Filter out fields that aren't defined in the class
+        filtered_dict = {
+            k: v
+            for k, v in config_dict.items() if k in class_fields
+        }
+        return cls(**filtered_dict)
+    @classmethod
+    def from_local_dir(cls, lora_path: str,
+                       max_position_embeddings: Optional[int]) -> "PEFTHelper":
+        """
+        Load `adapter_config.json` from lora_path and build a PEFTHelper,
+        storing max_position_embeddings as vllm_max_position_embeddings.
+        """
+        lora_config_path = os.path.join(lora_path, "adapter_config.json")
+        # JSON is UTF-8 by specification; do not depend on the locale default.
+        with open(lora_config_path, encoding="utf-8") as f:
+            config = json.load(f)
+        config["vllm_max_position_embeddings"] = max_position_embeddings
+        return cls.from_dict(config)
+    def validate_legal(self, lora_config: LoRAConfig) -> None:
+        """
+        Validates the LoRA configuration settings against application
+        constraints and requirements.
+        Raises:
+            ValueError: With all collected messages joined by spaces, if any
+            check fails.
+        """
+        error_msg = self._validate_features()
+        if self.r > lora_config.max_lora_rank:
+            error_msg.append(
+                f"LoRA rank {self.r} is greater than max_lora_rank"
+                f" {lora_config.max_lora_rank}.")
+        if self.bias != "none" and not lora_config.bias_enabled:
+            error_msg.append(
+                "Adapter bias cannot be used without bias_enabled.")
+        if error_msg:
+            raise ValueError(" ".join(error_msg))

vllm/lora/punica_wrapper/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
+from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper
+# Names re-exported from the submodules imported above.
+__all__ = [
+    "PunicaWrapperBase",
+    "get_punica_wrapper",
+]