PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1197) hide show

vllm/worker/neuron_model_runner.py ADDED Viewed

@@ -0,0 +1,460 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
+import torch
+from torch import nn
+from vllm.config import DeviceConfig, VllmConfig
+from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.model_executor.model_loader.neuron import get_neuron_model
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
+                             MultiModalKwargs)
+from vllm.platforms import current_platform
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
+from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionBackend
+logger = init_logger(__name__)
+@dataclass(frozen=True)
+class ModelInputForNeuron(ModelRunnerInputBase):
+    """
+    Used by the NeuronModelRunner.
+    """
+    input_tokens: Optional[torch.Tensor] = None
+    input_positions: Optional[torch.Tensor] = None
+    input_block_ids: Optional[torch.Tensor] = None
+    sampling_metadata: SamplingMetadata = None
+    multi_modal_kwargs: BatchedTensorInputs = None
+    adapter_ids: Optional[str] = None
+    def as_broadcastable_tensor_dict(
+            self) -> Dict[str, Union[int, torch.Tensor]]:
+        return {
+            "input_tokens": self.input_tokens,
+            "input_positions": self.input_positions,
+            "input_block_ids": self.input_block_ids,
+            "sampling_metadata": self.sampling_metadata,
+            "multi_modal_kwargs": self.multi_modal_kwargs,
+        }
+    @classmethod
+    def from_broadcasted_tensor_dict(
+        cls,
+        tensor_dict: Dict[str, Any],
+        attn_backend: Optional["AttentionBackend"] = None,
+    ) -> "ModelInputForNeuron":
+        return ModelInputForNeuron(
+            input_tokens=tensor_dict["input_tokens"],
+            input_positions=tensor_dict["input_positions"],
+            input_block_ids=tensor_dict["input_block_ids"],
+            sampling_metadata=tensor_dict["sampling_metadata"],
+            multi_modal_kwargs=tensor_dict["multi_modal_kwargs"],
+        )
+class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
+    """A model runner for AWS Neuron hardware"""
+    # NEURON has an upper limit on the top_k
+    _MAX_NEURON_SAMPLING_TOP_K = 256
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+    ):
+        ModelRunnerBase.__init__(self, vllm_config)
+        if (self.model_config is not None
+                and self.model_config.get_sliding_window()):
+            logger.warning("Sliding window is not supported on Neuron. "
+                           "The model will run without sliding window.")
+        self.device_config = (self.device_config if self.device_config
+                              is not None else DeviceConfig())
+        self.lora_config = vllm_config.lora_config
+        self.device = self.device_config.device
+        self.pin_memory = is_pin_memory_available()
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
+        # Lazy initialization.
+        self.model: nn.Module  # initialize after load_model.
+        # Once NEURON_ON_DEVICE_SAMPLING_DISABLED is set to a non-zero value,
+        # turn off on-device sampling.
+        self._on_device_sampling_disabled = int(
+            os.getenv("NEURON_ON_DEVICE_SAMPLING_DISABLED", "0"))
+        # NEURON needs to update sampling parameters when request IDs change
+        # across batches. This variable stores the previous batch's request IDs
+        # to determine if an update is needed.
+        self._previous_batch_request_ids: List[str] = []
+        if not self._on_device_sampling_disabled:
+            self._init_neuron_sampling()
+    def _init_neuron_sampling(self) -> None:
+        if current_platform.use_transformers_neuronx():
+            from transformers_neuronx.config import GenerationConfig
+        else:
+            from transformers import GenerationConfig
+        logger.warning(
+            "On-device sampling is turned on in Neuron by default, only "
+            "top_k, top_p, and temperature are current supported sampling "
+            "parameters. To turn off the on-device sampling, please set "
+            "the environment variable NEURON_ON_DEVICE_SAMPLING_DISABLED=1.")
+        self.model_config.neuron_sampling_params = GenerationConfig(
+            max_length=self.scheduler_config.max_model_len,
+            do_sample=True,
+            per_batch_line=True,
+            top_k=[self._MAX_NEURON_SAMPLING_TOP_K] \
+                  * self.scheduler_config.max_num_seqs,
+            top_p=[1.0] * self.scheduler_config.max_num_seqs,
+            temperature=[1.0] * self.scheduler_config.max_num_seqs,
+            dynamic=True,
+            global_top_k=self._MAX_NEURON_SAMPLING_TOP_K)
+    def load_model(self) -> None:
+        self.model = get_neuron_model(self.model_config,
+                                      parallel_config=self.parallel_config,
+                                      scheduler_config=self.scheduler_config)
+    def get_model(self) -> nn.Module:
+        return self.model
+    def _prepare_prompt(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int],
+               BatchedTensorInputs]:
+        assert len(seq_group_metadata_list) > 0
+        input_tokens: List[List[int]] = []
+        input_positions: List[List[int]] = []
+        input_block_ids: List[int] = []
+        seq_lens: List[int] = []
+        multi_modal_kwargs_list: List[MultiModalKwargs] = []
+        for seq_group_metadata in seq_group_metadata_list:
+            assert seq_group_metadata.is_prompt
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            assert len(seq_ids) == 1
+            seq_id = seq_ids[0]
+            seq_data = seq_group_metadata.seq_data[seq_id]
+            prompt_tokens = seq_data.get_token_ids()
+            seq_len = len(prompt_tokens)
+            seq_lens.append(seq_len)
+            input_tokens.append(prompt_tokens)
+            input_positions.append(list(range(seq_len)))
+            assert seq_group_metadata.block_tables is not None
+            block_table = seq_group_metadata.block_tables[seq_id]
+            assert len(block_table) == 1
+            input_block_ids.append(block_table[0])
+            mm_kwargs = seq_group_metadata.multi_modal_data
+            if mm_kwargs:
+                mm_kwargs = self.process_multi_modal_data_neuron(mm_kwargs)
+                multi_modal_kwargs_list.append(mm_kwargs)
+        max_seq_len = max(seq_lens)
+        assert max_seq_len > 0
+        input_tokens = make_tensor_with_pad(input_tokens,
+                                            pad=0,
+                                            max_len=max_seq_len,
+                                            dtype=torch.long,
+                                            device=self.device)
+        input_positions = make_tensor_with_pad(input_positions,
+                                               pad=0,
+                                               max_len=max_seq_len,
+                                               dtype=torch.long,
+                                               device=self.device)
+        input_block_ids = torch.tensor(input_block_ids,
+                                       dtype=torch.long,
+                                       device=self.device)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
+        return (input_tokens, input_positions, input_block_ids, seq_lens,
+                multi_modal_kwargs)
+    def _prepare_decode(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        assert len(seq_group_metadata_list) > 0
+        input_tokens: List[List[int]] = []
+        input_positions: List[List[int]] = []
+        input_block_ids: List[int] = []
+        context_lens: List[int] = []
+        for seq_group_metadata in seq_group_metadata_list:
+            assert not seq_group_metadata.is_prompt
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            for seq_id in seq_ids:
+                seq_data = seq_group_metadata.seq_data[seq_id]
+                generation_token = seq_data.get_last_token_id()
+                input_tokens.append([generation_token])
+                seq_len = seq_data.get_len()
+                position = seq_len - 1
+                input_positions.append([position])
+                context_lens.append(seq_len)
+                assert seq_group_metadata.block_tables is not None
+                block_table = seq_group_metadata.block_tables[seq_id]
+                assert len(block_table) == 1
+                input_block_ids.append(block_table[0])
+        input_tokens = make_tensor_with_pad(input_tokens,
+                                            pad=0,
+                                            max_len=1,
+                                            dtype=torch.long,
+                                            device=self.device)
+        input_positions = make_tensor_with_pad(input_positions,
+                                               pad=0,
+                                               max_len=1,
+                                               dtype=torch.long,
+                                               device=self.device)
+        context_lens = torch.tensor(context_lens,
+                                    dtype=torch.int,
+                                    device=self.device)
+        input_block_ids = torch.tensor(input_block_ids,
+                                       dtype=torch.long,
+                                       device=self.device)
+        return input_tokens, input_positions, input_block_ids
+    def make_model_input_from_broadcasted_tensor_dict(
+            self, tensor_dict: Dict[str, Any]) -> ModelInputForNeuron:
+        return ModelInputForNeuron.from_broadcasted_tensor_dict(tensor_dict)
+    def prepare_model_input(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None
+    ) -> ModelInputForNeuron:
+        multi_modal_kwargs = None
+        # NOTE: We assume that all sequences in the group are all prompts or
+        # all decodes.
+        is_prompt = seq_group_metadata_list[0].is_prompt
+        # Prepare input tensors.
+        if is_prompt:
+            (input_tokens, input_positions, input_block_ids, seq_lens,
+             multi_modal_kwargs
+             ) = self._prepare_prompt(seq_group_metadata_list)
+        else:
+            (input_tokens, input_positions,
+             input_block_ids) = self._prepare_decode(seq_group_metadata_list)
+            seq_lens = None
+        if not self._on_device_sampling_disabled:
+            for seq_group_metadata in seq_group_metadata_list:
+                sampling_params = seq_group_metadata.sampling_params
+                top_k, top_p, temperature = (
+                    self._convert_to_neuron_sampling_params(sampling_params))
+                sampling_params.top_k = top_k
+                sampling_params.top_p = top_p
+                sampling_params.temperature = temperature
+        # we need multi_modal_data for later tokens as well
+        multi_modal_kwargs_list: List[MultiModalKwargs] = []
+        for seq_group_metadata in seq_group_metadata_list:
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                multi_modal_kwargs_list.append(mm_data)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
+        sampling_metadata = SamplingMetadata.prepare(
+            seq_group_metadata_list,
+            seq_lens,
+            # query_lens is not needed if chunked prefill is not
+            # supported. Since neuron worker doesn't support chunked prefill
+            # just use seq_lens instead.
+            seq_lens,
+            self.device,
+            self.pin_memory,
+            generators=self.get_generators(finished_requests_ids))
+        if current_platform.use_transformers_neuronx(
+        ) and not self._on_device_sampling_disabled:
+            # Once the request IDs are changed in current iteration, we will
+            # update the on-device sampling parameters.
+            current_batch_request_ids = [
+                seq_group_meta_data.request_id
+                for seq_group_meta_data in seq_group_metadata_list
+            ]
+            if current_batch_request_ids != self._previous_batch_request_ids:
+                self._update_neuron_sampling_params(seq_group_metadata_list)
+                self._previous_batch_request_ids = current_batch_request_ids
+        return ModelInputForNeuron(input_tokens=input_tokens,
+                                   input_positions=input_positions,
+                                   input_block_ids=input_block_ids,
+                                   sampling_metadata=sampling_metadata,
+                                   multi_modal_kwargs=multi_modal_kwargs)
+    def _update_neuron_sampling_params(
+            self, seq_group_metadata_list: List[SequenceGroupMetadata]):
+        # Update Neuron sampling parameters (GenerationConfig in Neuron)
+        current_sampling_params = self.model_config.neuron_sampling_params
+        assert current_sampling_params is not None, (
+            f"Failed to update sampling_params, "
+            f"current sampling params is {current_sampling_params}")
+        is_update_needed = False
+        top_k = current_sampling_params.top_k
+        top_p = current_sampling_params.top_p
+        temperature = current_sampling_params.temperature
+        # The index of a sequence's sampling parameters in neuron is equal to
+        # its index in `input_block_ids`.
+        for seq_group_metadata in seq_group_metadata_list:
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            sampling_params = seq_group_metadata.sampling_params
+            seq_group_top_k = sampling_params.top_k
+            seq_group_top_p = sampling_params.top_p
+            seq_group_temperature = sampling_params.temperature
+            for seq_id in seq_ids:
+                index = seq_group_metadata.block_tables[seq_id][0]
+                if (top_k[index] != seq_group_top_k
+                        or top_p[index] != seq_group_top_p
+                        or temperature[index] != seq_group_temperature):
+                    is_update_needed = True
+                top_k[index] = seq_group_top_k
+                top_p[index] = seq_group_top_p
+                temperature[index] = seq_group_temperature
+        # update_generation_config is only available in transformers-neuronx
+        if is_update_needed and current_platform.use_transformers_neuronx():
+            self.model.model.update_generation_config(current_sampling_params)
+    def _convert_to_neuron_sampling_params(
+            self, sampling_params: SamplingParams) -> Tuple[int, float, float]:
+        # Returns the top_k, top_p and temperature parameters for neuron.
+        top_k = sampling_params.top_k
+        top_p = sampling_params.top_p
+        temperature = sampling_params.temperature
+        if temperature == 0.0:
+            # Enable greedy sampling on zero temperature
+            return (1, 1.0, 1.0)
+        if top_k < 1 or top_k > self._MAX_NEURON_SAMPLING_TOP_K:
+            top_k = self._MAX_NEURON_SAMPLING_TOP_K
+        return (top_k, top_p, temperature)
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        model_input: ModelInputForNeuron,
+        kv_caches: Optional[List[torch.Tensor]] = None,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        num_steps: int = 1,
+    ) -> Optional[List[SamplerOutput]]:
+        if num_steps > 1:
+            raise ValueError(
+                "NeuronModelRunner does not support multi-step execution.")
+        # extract top_k, top_p and temperature from model_input for neuron
+        # forward call
+        sampling_params = (torch.tensor([[
+            seq_group.sampling_params.top_k, seq_group.sampling_params.top_p,
+            seq_group.sampling_params.temperature
+        ] for seq_group in model_input.sampling_metadata.seq_groups]))
+        if current_platform.use_neuronx_distributed():
+            hidden_states = self.model(
+                input_ids=model_input.input_tokens,
+                positions=model_input.input_positions,
+                input_block_ids=model_input.input_block_ids,
+                sampling_params=sampling_params,
+                adapter_ids=model_input.adapter_ids,
+                **MultiModalKwargs.as_kwargs(
+                    model_input.multi_modal_kwargs or {},
+                    device=self.device,
+                ),
+            )
+        elif current_platform.use_transformers_neuronx():
+            # [TODO] validate on-device sampling
+            # The model signature may need change for on-device sampling
+            hidden_states = self.model(
+                input_ids=model_input.input_tokens,
+                positions=model_input.input_positions,
+                input_block_ids=model_input.input_block_ids,
+                **MultiModalKwargs.as_kwargs(
+                    model_input.multi_modal_kwargs or {},
+                    device=self.device,
+                ),
+            )
+        # Compute the logits only if the on-device sampling is turned off as
+        # on-device sampling outputs the token ids.
+        if self._on_device_sampling_disabled:
+            logits = self.model.compute_logits(hidden_states,
+                                               model_input.sampling_metadata)
+        else:
+            logits = hidden_states
+        # Sample the next token.
+        output = self.model.sample(
+            logits=logits,
+            sampling_metadata=model_input.sampling_metadata,
+        )
+        return [output]
+    @property
+    def vocab_size(self) -> int:
+        return self.model_config.get_vocab_size()
+    def process_multi_modal_data_neuron(self, mm_data):
+        # this is a no-op for NeuronModelRunner
+        return mm_data
+    def remove_all_loras(self):
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+    def set_active_loras(self, lora_requests: Set[LoRARequest],
+                         lora_mapping: LoRAMapping) -> None:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+    def add_lora(self, lora_request: LoRARequest):
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")
+    def list_loras(self) -> Set[int]:
+        raise NotImplementedError(
+            "LoRAs are not supported for Transformers NeuronX framework")

vllm/worker/neuron_worker.py ADDED Viewed

@@ -0,0 +1,193 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""A Neuron worker class."""
+import os
+from typing import List, Optional, Set, Tuple
+import torch.distributed
+from vllm.config import VllmConfig
+from vllm.distributed import (ensure_model_parallel_initialized,
+                              init_distributed_environment)
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.model_executor import set_random_seed
+from vllm.platforms import current_platform
+from vllm.platforms.neuron import NeuronFramework
+from vllm.sequence import ExecuteModelRequest
+from vllm.worker.neuron_model_runner import NeuronModelRunner
+from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
+                                     WorkerInput)
+logger = init_logger(__name__)
+class NeuronWorker(LocalOrDistributedWorkerBase):
+    """A worker class that executes the model on a group of neuron cores.
+    """
+    model_runner: NeuronModelRunner
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 local_rank: int,
+                 rank: int,
+                 distributed_init_method: str,
+                 is_driver_worker: bool = False) -> None:
+        WorkerBase.__init__(self, vllm_config=vllm_config)
+        self.local_rank = local_rank
+        self.rank = rank
+        self.distributed_init_method = distributed_init_method
+        self.is_driver_worker = is_driver_worker
+        self.lora_config = vllm_config.lora_config
+        if self.model_config.trust_remote_code:
+            # note: lazy import to avoid importing torch before initializing
+            from vllm.utils import init_cached_hf_modules
+            init_cached_hf_modules()
+        neuron_framework = current_platform.get_neuron_framework_to_use()
+        if neuron_framework == NeuronFramework.TRANSFORMERS_NEURONX:
+            self.model_runner = self.get_tnx_model_runner(vllm_config)
+        elif neuron_framework == NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE:
+            self.model_runner = self.get_neuronx_distributed_model_runner(
+                vllm_config)
+        else:
+            raise NotImplementedError(
+                "Specified framework" +
+                f" {os.environ.get('VLLM_NEURON_FRAMEWORK')}" +
+                " is either not installed or not supported." +
+                " Supported frameworks: " +
+                "[transformers-neuronx, neuronx-distributed-inference]")
+    def get_tnx_model_runner(self, vllm_config):
+        assert (self.lora_config
+                is None), ("LoRA is not supported for TransformersNeuronX "
+                           "framework.")
+        from vllm.worker.multi_step_neuron_model_runner import (
+            MultiStepNeuronModelRunner)
+        if self.speculative_config is not None:
+            return MultiStepNeuronModelRunner(vllm_config=vllm_config)
+        else:
+            return NeuronModelRunner(vllm_config=vllm_config)
+    def get_neuronx_distributed_model_runner(self, vllm_config):
+        from vllm.worker.multi_step_neuronx_distributed_model_runner import (
+            MultiStepNeuronxDistributedModelRunner)
+        from vllm.worker.neuronx_distributed_model_runner import (
+            NeuronxDistributedModelRunner)
+        if self.speculative_config is not None:
+            assert (self.lora_config
+                    is None), "LoRA is not supported for Speculative Decoding"
+            return MultiStepNeuronxDistributedModelRunner(
+                vllm_config=vllm_config)
+        else:
+            return NeuronxDistributedModelRunner(vllm_config=vllm_config)
+    def init_device(self) -> None:
+        self.init_distributed_environment()
+        # Set random seed.
+        set_random_seed(self.model_config.seed)
+    def load_model(self):
+        self.model_runner.load_model()
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks.
+        Swapping is not yet supported, so always return num_cpu_blocks=0.
+        We configure num_gpu_blocks to be equal to max_num_seqs.
+        """
+        # Set the number of GPU blocks to be the same as the maximum number of
+        # sequences that can be processed in a single batch. This is equivalent
+        # to schedule without PagedAttention.
+        num_gpu_blocks = self.scheduler_config.max_num_seqs + 1
+        # Swap not yet supported with Neuron backend.
+        num_cpu_blocks = 0
+        return num_gpu_blocks, num_cpu_blocks
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Initialize the KV cache.
+        """
+        # Different values are not tested.
+        assert num_cpu_blocks == 0
+        assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+    @property
+    def do_metadata_broadcast(self) -> bool:
+        return False
+    @property
+    def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
+        return None
+    @torch.inference_mode()
+    def prepare_worker_input(
+            self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
+        return WorkerInput(num_seq_groups=len(
+            execute_model_req.seq_group_metadata_list), )
+    def execute_worker(self, worker_input: WorkerInput) -> None:
+        pass
+    def get_cache_block_size_bytes(self) -> int:
+        """Determine the size in bytes of a cache block.
+        This is required for speculative decoding; it is not yet implemented.
+        """
+        raise NotImplementedError
+    def init_distributed_environment(self):
+        """Neuron uses transformers-neuronx for tensor parallelism.
+        vLLM still needs the environment initialized when TP/PP > 1
+        """
+        init_distributed_environment(
+            world_size=1,
+            rank=self.rank,
+            local_rank=self.local_rank,
+            distributed_init_method=self.distributed_init_method,
+            backend="gloo",
+        )
+        ensure_model_parallel_initialized(
+            1,
+            1,
+        )
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.add_lora(lora_request)
+    def remove_lora(self, lora_id: int) -> bool:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.remove_lora(lora_id)
+    def pin_lora(self, lora_id: int) -> bool:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.pin_lora(lora_id)
+    def list_loras(self) -> Set[int]:
+        if current_platform.use_transformers_neuronx():
+            raise NotImplementedError(
+                f"{type(self)} does not support LoRA with Neuron Framework "
+                f"Transformers NeuronX")
+        return self.model_runner.list_loras()