PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/logger.py ADDED Viewed

@@ -0,0 +1,229 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Logging configuration for vLLM."""
+import datetime
+import json
+import logging
+import os
+import sys
+from collections.abc import Hashable
+from functools import lru_cache, partial
+from logging import Logger
+from logging.config import dictConfig
+from os import path
+from types import MethodType
+from typing import Any, Optional, cast
+import vllm.envs as envs
+# Snapshot of the logging-related settings read from `vllm.envs` when this
+# module is imported; the rest of the module uses these module-level copies.
+VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING
+VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH
+VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL
+VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX
+VLLM_LOGGING_STREAM = envs.VLLM_LOGGING_STREAM
+# Record layout. `%(fileinfo)s` is not a standard LogRecord attribute; it is
+# set on each record by `vllm.logging_utils.NewLineFormatter.format`.
+_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s "
+           "[%(fileinfo)s:%(lineno)d] %(message)s")
+_DATE_FORMAT = "%m-%d %H:%M:%S"
+# `logging.config.dictConfig` schema used when VLLM_CONFIGURE_LOGGING is
+# truthy and no custom config file replaces it.
+DEFAULT_LOGGING_CONFIG = {
+    "formatters": {
+        "vllm": {
+            "class": "vllm.logging_utils.NewLineFormatter",
+            "datefmt": _DATE_FORMAT,
+            "format": _FORMAT,
+        },
+    },
+    "handlers": {
+        "vllm": {
+            "class": "logging.StreamHandler",
+            "formatter": "vllm",
+            # The effective level filter is applied here, on the handler.
+            "level": VLLM_LOGGING_LEVEL,
+            "stream": VLLM_LOGGING_STREAM,
+        },
+    },
+    "loggers": {
+        "vllm": {
+            "handlers": ["vllm"],
+            # The logger itself lets everything through; see handler level.
+            "level": "DEBUG",
+            # Records are not forwarded to ancestor loggers' handlers.
+            "propagate": False,
+        },
+    },
+    "version": 1,
+    "disable_existing_loggers": False
+}
+@lru_cache
+def _print_debug_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    """Log `msg` at DEBUG level, at most once per cached argument tuple.
+    `lru_cache` keys on `(logger, msg, *args)`, so a repeated call with the
+    same arguments is a cache hit and logs nothing. This is also why every
+    argument must be hashable.
+    """
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.debug(msg, *args, stacklevel=2)
+@lru_cache
+def _print_info_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    """Log `msg` at INFO level, at most once per cached argument tuple.
+    `lru_cache` keys on `(logger, msg, *args)`, so a repeated call with the
+    same arguments is a cache hit and logs nothing. This is also why every
+    argument must be hashable.
+    """
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.info(msg, *args, stacklevel=2)
+@lru_cache
+def _print_warning_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    """Log `msg` at WARNING level, at most once per cached argument tuple.
+    `lru_cache` keys on `(logger, msg, *args)`, so a repeated call with the
+    same arguments is a cache hit and logs nothing. This is also why every
+    argument must be hashable.
+    """
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.warning(msg, *args, stacklevel=2)
+class _VllmLogger(Logger):
+    """
+    Note:
+        This class is just to provide type information.
+        We actually patch the methods directly on the [`logging.Logger`][]
+        instance to avoid conflicting with other libraries such as
+        `intel_extension_for_pytorch.utils._logger`.
+        Each `*_once` method delegates to a module-level helper wrapped in
+        `lru_cache`; deduplication is therefore keyed on the logger, the
+        message and the arguments together.
+    """
+    def debug_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`debug`][logging.Logger.debug], but subsequent calls with
+        the same message and arguments are silently dropped.
+        """
+        _print_debug_once(self, msg, *args)
+    def info_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`info`][logging.Logger.info], but subsequent calls with
+        the same message and arguments are silently dropped.
+        """
+        _print_info_once(self, msg, *args)
+    def warning_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`warning`][logging.Logger.warning], but subsequent calls with
+        the same message and arguments are silently dropped.
+        """
+        _print_warning_once(self, msg, *args)
+# Method name -> module-level function. `init_logger` binds each function to
+# a logger instance under the given name. Built once at import so the
+# mapping is not recreated on every `init_logger` call.
+_METHODS_TO_PATCH = {
+    "debug_once": _print_debug_once,
+    "info_once": _print_info_once,
+    "warning_once": _print_warning_once,
+}
+def _configure_vllm_root_logger() -> None:
+    logging_config = dict[str, Any]()
+    if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
+        raise RuntimeError(
+            "VLLM_CONFIGURE_LOGGING evaluated to false, but "
+            "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH "
+            "implies VLLM_CONFIGURE_LOGGING. Please enable "
+            "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH.")
+    if VLLM_CONFIGURE_LOGGING:
+        logging_config = DEFAULT_LOGGING_CONFIG
+    if VLLM_LOGGING_CONFIG_PATH:
+        if not path.exists(VLLM_LOGGING_CONFIG_PATH):
+            raise RuntimeError(
+                "Could not load logging config. File does not exist: %s",
+                VLLM_LOGGING_CONFIG_PATH)
+        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
+            custom_config = json.loads(file.read())
+        if not isinstance(custom_config, dict):
+            raise ValueError("Invalid logging config. Expected dict, got %s.",
+                             type(custom_config).__name__)
+        logging_config = custom_config
+    for formatter in logging_config.get("formatters", {}).values():
+        # This provides backwards compatibility after #10134.
+        if formatter.get("class") == "vllm.logging.NewLineFormatter":
+            formatter["class"] = "vllm.logging_utils.NewLineFormatter"
+    if logging_config:
+        dictConfig(logging_config)
+def init_logger(name: str) -> _VllmLogger:
+    """The main purpose of this function is to ensure that loggers are
+    retrieved in such a way that we can be sure the root vllm logger has
+    already been configured."""
+    logger = logging.getLogger(name)
+    for method_name, method in _METHODS_TO_PATCH.items():
+        setattr(logger, method_name, MethodType(method, logger))
+    return cast(_VllmLogger, logger)
+# The root logger is initialized when the module is imported.
+# Module top-level code runs only once per process under normal imports
+# (the import system serializes concurrent imports of the same module),
+# so this configuration step is not repeated.
+_configure_vllm_root_logger()
+logger = init_logger(__name__)
+def _trace_calls(log_path, root_dir, frame, event, arg=None):
+    if event in ['call', 'return']:
+        # Extract the filename, line number, function name, and the code object
+        filename = frame.f_code.co_filename
+        lineno = frame.f_lineno
+        func_name = frame.f_code.co_name
+        if not filename.startswith(root_dir):
+            # only log the functions in the vllm root_dir
+            return
+        # Log every function call or return
+        try:
+            last_frame = frame.f_back
+            if last_frame is not None:
+                last_filename = last_frame.f_code.co_filename
+                last_lineno = last_frame.f_lineno
+                last_func_name = last_frame.f_code.co_name
+            else:
+                # initial frame
+                last_filename = ""
+                last_lineno = 0
+                last_func_name = ""
+            with open(log_path, 'a') as f:
+                ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
+                if event == 'call':
+                    f.write(f"{ts} Call to"
+                            f" {func_name} in {filename}:{lineno}"
+                            f" from {last_func_name} in {last_filename}:"
+                            f"{last_lineno}\n")
+                else:
+                    f.write(f"{ts} Return from"
+                            f" {func_name} in {filename}:{lineno}"
+                            f" to {last_func_name} in {last_filename}:"
+                            f"{last_lineno}\n")
+        except NameError:
+            # modules are deleted during shutdown
+            pass
+    return partial(_trace_calls, log_path, root_dir)
+def enable_trace_function_call(log_file_path: str,
+                               root_dir: Optional[str] = None):
+    """
+    Enable tracing of every function call in code under `root_dir`.
+    This is useful for debugging hangs or crashes.
+    `log_file_path` is the path to the log file.
+    `root_dir` is the root directory of the code to trace. If None, it is the
+    vllm root directory.
+    Note that this call is thread-level, any threads calling this function
+    will have the trace enabled. Other threads will not be affected.
+    """
+    logger.warning(
+        "VLLM_TRACE_FUNCTION is enabled. It will record every"
+        " function executed by Python. This will slow down the code. It "
+        "is suggested to be used for debugging hang or crashes only.")
+    logger.info("Trace frame log is saved to %s", log_file_path)
+    if root_dir is None:
+        # by default, this is the vllm root directory
+        root_dir = os.path.dirname(os.path.dirname(__file__))
+    sys.settrace(partial(_trace_calls, log_file_path, root_dir))

vllm/logging_utils/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.logging_utils.formatter import NewLineFormatter
+from vllm.logging_utils.log_time import logtime
+__all__ = [
+    "NewLineFormatter",
+    "logtime",
+]

vllm/logging_utils/dump_input.py ADDED Viewed

@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
+import enum
+import json
+from typing import Optional
+import torch
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.metrics.stats import SchedulerStats
+from vllm.version import __version__ as VLLM_VERSION
+logger = init_logger(__name__)
+def prepare_object_to_dump(obj) -> str:
+    if isinstance(obj, str):
+        return f"'{obj}'"  # Double quotes
+    elif isinstance(obj, dict):
+        dict_str = ', '.join({f'{str(k)}: {prepare_object_to_dump(v)}' \
+            for k, v in obj.items()})
+        return f'{{{dict_str}}}'
+    elif isinstance(obj, list):
+        return f"[{', '.join([prepare_object_to_dump(v) for v in obj])}]"
+    elif isinstance(obj, set):
+        return f"[{', '.join([prepare_object_to_dump(v) for v in list(obj)])}]"
+        # return [prepare_object_to_dump(v) for v in list(obj)]
+    elif isinstance(obj, tuple):
+        return f"[{', '.join([prepare_object_to_dump(v) for v in obj])}]"
+    elif isinstance(obj, enum.Enum):
+        return repr(obj)
+    elif isinstance(obj, torch.Tensor):
+        # We only print the 'draft' of the tensor to not expose sensitive data
+        # and to get some metadata in case of CUDA runtime crashed
+        return (f"Tensor(shape={obj.shape}, "
+                f"device={obj.device},"
+                f"dtype={obj.dtype})")
+    elif hasattr(obj, 'anon_repr'):
+        return obj.anon_repr()
+    elif hasattr(obj, '__dict__'):
+        items = obj.__dict__.items()
+        dict_str = ', '.join([f'{str(k)}={prepare_object_to_dump(v)}' \
+            for k, v in items])
+        return f"{type(obj).__name__}({dict_str})"
+    else:
+        # Hacky way to make sure we can serialize the object in JSON format
+        try:
+            return json.dumps(obj)
+        except (TypeError, OverflowError):
+            return repr(obj)
+def dump_engine_exception(config: VllmConfig,
+                          scheduler_output: SchedulerOutput,
+                          scheduler_stats: Optional[SchedulerStats]):
+    """Best-effort dump of engine state to the log.
+    Delegates to `_dump_engine_exception` and suppresses any `Exception`
+    it raises, so calling this never raises one itself.
+    """
+    # NOTE: the dump is purely extra diagnostic output, so a failure while
+    # producing it must not surface as a new, unrelated error.
+    with contextlib.suppress(Exception):
+        _dump_engine_exception(config, scheduler_output, scheduler_stats)
+def _dump_engine_exception(config: VllmConfig,
+                           scheduler_output: SchedulerOutput,
+                           scheduler_stats: Optional[SchedulerStats]):
+    """Log the engine config, scheduler output and scheduler stats.
+    Everything is emitted at ERROR level. The scheduler output is rendered
+    with `prepare_object_to_dump`; the config and the stats are passed to
+    the logger as-is.
+    """
+    logger.error(
+        "Dumping input data for V1 LLM engine (v%s) with config: %s, ",
+        VLLM_VERSION,
+        config,
+    )
+    try:
+        dump_obj = prepare_object_to_dump(scheduler_output)
+        logger.error("Dumping scheduler output for model execution: %s",
+                     dump_obj)
+        # Stats are optional; a falsy value is skipped.
+        if scheduler_stats:
+            logger.error("Dumping scheduler stats: %s", scheduler_stats)
+    except Exception:
+        # Record the failure (with traceback) instead of propagating it.
+        logger.exception("Error preparing object to dump")

vllm/logging_utils/formatter.py ADDED Viewed

@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+from pathlib import Path
+from vllm import envs
+class NewLineFormatter(logging.Formatter):
+    """Adds logging prefix to newlines to align multi-line messages."""
+    def __init__(self, fmt, datefmt=None, style="%"):
+        super().__init__(fmt, datefmt, style)
+        self.use_relpath = envs.VLLM_LOGGING_LEVEL == "DEBUG"
+        if self.use_relpath:
+            self.root_dir = Path(__file__).resolve().parent.parent.parent
+    def format(self, record):
+        def shrink_path(relpath: Path) -> str:
+            """
+            Shortens a file path for logging display:
+            - Removes leading 'vllm' folder if present.
+            - If path starts with 'v1',
+            keeps the first two and last two levels,
+            collapsing the middle as '...'.
+            - Otherwise, keeps the first and last two levels,
+            collapsing the middle as '...'.
+            - If the path is short, returns it as-is.
+            - Examples:
+            vllm/model_executor/layers/quantization/utils/fp8_utils.py ->
+            model_executor/.../quantization/utils/fp8_utils.py
+            vllm/model_executor/layers/quantization/awq.py ->
+            model_executor/layers/quantization/awq.py
+            vllm/v1/attention/backends/mla/common.py ->
+            v1/attention/backends/mla/common.py
+            Args:
+                relpath (Path): The relative path to be shortened.
+            Returns:
+                str: The shortened path string for display.
+            """
+            parts = list(relpath.parts)
+            new_parts = []
+            if parts and parts[0] == "vllm":
+                parts = parts[1:]
+            if parts and parts[0] == "v1":
+                new_parts += parts[:2]
+                parts = parts[2:]
+            elif parts:
+                new_parts += parts[:1]
+                parts = parts[1:]
+            if len(parts) > 2:
+                new_parts += ["..."] + parts[-2:]
+            else:
+                new_parts += parts
+            return "/".join(new_parts)
+        if self.use_relpath:
+            abs_path = getattr(record, "pathname", None)
+            if abs_path:
+                try:
+                    relpath = Path(abs_path).resolve().relative_to(
+                        self.root_dir)
+                except Exception:
+                    relpath = Path(record.filename)
+            else:
+                relpath = Path(record.filename)
+            record.fileinfo = shrink_path(relpath)
+        else:
+            record.fileinfo = record.filename
+        msg = super().format(record)
+        if record.message != "":
+            parts = msg.split(record.message)
+            msg = msg.replace("\n", "\r\n" + parts[0])
+        return msg

vllm/logging_utils/log_time.py ADDED Viewed

@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Provides a timeslice logging decorator
+"""
+import functools
+import time
+def logtime(logger, msg=None):
+    """
+    Logs the execution time of the decorated function.
+    Always place it beneath other decorators.
+    """
+    def _inner(func):
+        @functools.wraps(func)
+        def _wrapper(*args, **kwargs):
+            start = time.perf_counter()
+            result = func(*args, **kwargs)
+            elapsed = time.perf_counter() - start
+            prefix = f"Function '{func.__module__}.{func.__qualname__}'" \
+                if msg is None else msg
+            logger.debug("%s: Elapsed time %.7f secs", prefix, elapsed)
+            return result
+        return _wrapper
+    return _inner

vllm/logits_process.py ADDED Viewed

@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Sequence
+from typing import Callable, Union
+import torch
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+# Two accepted call shapes:
+#   (generated_token_ids, logits) -> logits
+#   (prompt_token_ids, generated_token_ids, logits) -> logits
+LogitsProcessor = Union[
+    Callable[[list[int], torch.Tensor], torch.Tensor],
+    Callable[[list[int], list[int], torch.Tensor], torch.Tensor],
+]
+"""LogitsProcessor is a function that takes a list
+of previously generated tokens, the logits tensor
+for the next token and, optionally, prompt tokens as a
+first argument, and returns a modified tensor of logits
+to sample from."""
+def get_bad_words_logits_processors(
+        bad_words: list[str],
+        tokenizer: AnyTokenizer) -> list[LogitsProcessor]:
+    bad_words_ids: list[list[int]] = list()
+    for bad_word in bad_words:
+        # To prohibit words both at the beginning
+        # and in the middle of text
+        # (related to add_prefix_space tokenizer parameter)
+        for add_prefix_space in [False, True]:
+            prefix = " " if add_prefix_space else ""
+            prompt = prefix + bad_word.lstrip()
+            prompt_token_ids = tokenizer.encode(text=prompt,
+                                                add_special_tokens=False)
+            # If no space at the beginning
+            # or if prefix space produces a new word token
+            if (not add_prefix_space) or (
+                    add_prefix_space
+                    and prompt_token_ids[0] != bad_words_ids[-1][0]
+                    and len(prompt_token_ids) == len(bad_words_ids[-1])):
+                bad_words_ids.append(prompt_token_ids)
+    return [NoBadWordsLogitsProcessor(bad_words_ids=bad_words_ids)]
+class NoBadWordsLogitsProcessor:
+    _SMALLEST_LOGIT = float("-inf")
+    _NEUTRAL_LOGIT = 0.0
+    def __init__(self, bad_words_ids: list[list[int]]):
+        self.bad_words_ids = bad_words_ids
+        self.word_bias: torch.FloatTensor = None
+    def __call__(
+        self,
+        past_tokens_ids: Sequence[int],
+        logits: torch.FloatTensor,
+    ) -> torch.Tensor:
+        if self.word_bias is None:
+            self._init_word_bias(logits=logits)
+        last_token_bias = torch.zeros_like(logits)
+        for bad_word_ids in self.bad_words_ids:
+            if len(bad_word_ids) == 1:  # 1-token words already processed
+                continue
+            if len(bad_word_ids) > len(past_tokens_ids) + 1:
+                continue
+            prefix_length = len(bad_word_ids) - 1
+            last_token_id = bad_word_ids[-1]
+            actual_prefix = past_tokens_ids[-prefix_length:]
+            expected_prefix = bad_word_ids[:prefix_length]
+            assert len(actual_prefix) == len(expected_prefix)
+            is_match = tuple(actual_prefix) == tuple(expected_prefix)
+            last_token_bias[last_token_id] += (self._SMALLEST_LOGIT if is_match
+                                               else self._NEUTRAL_LOGIT)
+        logits = logits + self.word_bias + last_token_bias
+        return logits
+    def _init_word_bias(self, logits: torch.FloatTensor) -> None:
+        # Code based on NoBadWordsLogitsProcessor and SequenceBiasLogitsProcessor  # noqa: E501
+        # from https://github.com/huggingface/transformers/blob/main/src/transformers/generation/logits_process.py
+        vocab_size = logits.shape[-1]
+        self._check_token_ids_bounds(vocab_size=vocab_size)
+        self.word_bias = torch.zeros((vocab_size, ),
+                                     dtype=torch.float,
+                                     device=logits.device)
+        for bad_word_ids in self.bad_words_ids:
+            if len(bad_word_ids) == 1:
+                bad_word_id = bad_word_ids[-1]
+                self.word_bias[bad_word_id] = self._SMALLEST_LOGIT
+    def _check_token_ids_bounds(self, vocab_size: int) -> None:
+        invalid_token_ids = []
+        for bad_word_ids in self.bad_words_ids:
+            for token_id in bad_word_ids:
+                if token_id < 0 or token_id >= vocab_size:
+                    invalid_token_ids.append(token_id)
+        if len(invalid_token_ids) > 0:
+            raise ValueError(
+                f"The model vocabulary size is {vocab_size},"
+                f" but the following tokens"
+                f" were specified as bad: {invalid_token_ids}."
+                f" All token id values should be integers satisfying:"
+                f" 0 <= token_id < {vocab_size}.")

vllm/logprobs.py ADDED Viewed

@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+from typing import Optional
+# We use dataclass for now because it is used for
+# openai server output, and msgspec is not serializable.
+# TODO(sang): Fix it.
+@dataclass
+class Logprob:
+    """Infos for supporting OpenAI compatible logprobs and token ranks.
+    Attributes:
+        logprob: The logprob of chosen token
+        rank: The vocab rank of chosen token (>=1); None when not set
+        decoded_token: The chosen token decoded to a string (not a token
+            index); None when not set
+    """
+    logprob: float
+    rank: Optional[int] = None
+    decoded_token: Optional[str] = None
+# One entry per position: a {token_id -> Logprob} mapping for each sequence
+# group, or None if the corresponding sequence group doesn't require prompt
+# logprobs.
+PromptLogprobs = list[Optional[dict[int, Logprob]]]
+# One {token_id -> Logprob} mapping for each sequence group.
+SampleLogprobs = list[dict[int, Logprob]]

vllm/lora/__init__.py ADDED Viewed

File without changes

vllm/lora/layers/__init__.py ADDED Viewed

@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.lora.layers.base import BaseLayerWithLoRA
+from vllm.lora.layers.column_parallel_linear import (
+    ColumnParallelLinearWithLoRA, ColumnParallelLinearWithShardedLoRA,
+    MergedColumnParallelLinearWithLoRA,
+    MergedColumnParallelLinearWithShardedLoRA, MergedQKVParallelLinearWithLoRA,
+    MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithLoRA,
+    QKVParallelLinearWithShardedLoRA)
+from vllm.lora.layers.logits_processor import LogitsProcessorWithLoRA
+from vllm.lora.layers.replicated_linear import ReplicatedLinearWithLoRA
+from vllm.lora.layers.row_parallel_linear import (
+    RowParallelLinearWithLoRA, RowParallelLinearWithShardedLoRA)
+from vllm.lora.layers.utils import LoRAMapping
+from vllm.lora.layers.vocal_parallel_embedding import (
+    VocabParallelEmbeddingWithLoRA)
+__all__ = [
+    "BaseLayerWithLoRA",
+    "VocabParallelEmbeddingWithLoRA",
+    "LogitsProcessorWithLoRA",
+    "ColumnParallelLinearWithLoRA",
+    "ColumnParallelLinearWithShardedLoRA",
+    "MergedColumnParallelLinearWithLoRA",
+    "MergedColumnParallelLinearWithShardedLoRA",
+    "MergedQKVParallelLinearWithLoRA",
+    "MergedQKVParallelLinearWithShardedLoRA",
+    "QKVParallelLinearWithLoRA",
+    "QKVParallelLinearWithShardedLoRA",
+    "RowParallelLinearWithLoRA",
+    "RowParallelLinearWithShardedLoRA",
+    "ReplicatedLinearWithLoRA",
+    "LoRAMapping",
+]

vllm/lora/layers/base.py ADDED Viewed

@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING, Optional, Union
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+from vllm.config.lora import LoRAConfig
+if TYPE_CHECKING:
+    from vllm.lora.punica_wrapper import PunicaWrapperBase
+class BaseLayerWithLoRA(nn.Module):
+    """Common interface of the LoRA-enabled layer wrappers.
+    Most methods here are placeholders whose body is `...` (they return
+    None as written); `can_replace_layer` raises NotImplementedError.
+    Presumably subclasses provide the real implementations.
+    """
+    def slice_lora_a(
+        self, lora_a: Union[torch.Tensor, list[Union[torch.Tensor, None]]]
+    ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]:
+        """Slice lora a if splitting for tensor parallelism."""
+        ...
+    def slice_lora_b(
+        self, lora_b: Union[torch.Tensor, list[Union[torch.Tensor, None]]]
+    ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]:
+        """Slice lora b if splitting with tensor parallelism."""
+        ...
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: Optional[PretrainedConfig] = None,
+    ) -> None:
+        """Initializes lora matrices."""
+        ...
+    def reset_lora(self, index: int):
+        """Resets the lora weights at index back to 0."""
+        ...
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor] = None,
+    ):
+        """Overwrites lora tensors at index."""
+        ...
+    def set_mapping(
+        self,
+        punica_wrapper,
+    ):
+        """Stores `punica_wrapper` on the layer as `self.punica_wrapper`."""
+        # PunicaWrapperBase is imported under TYPE_CHECKING only; it is used
+        # here purely as a type annotation.
+        self.punica_wrapper: PunicaWrapperBase = punica_wrapper
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        """Returns True if the layer can be replaced by this LoRA layer.
+        Raises:
+            NotImplementedError: always, in this base class.
+        """
+        raise NotImplementedError