vllm_ascend-0.10.0rc1-cp310-cp310-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllm_ascend/__init__.py +27 -0
- vllm_ascend/_build_info.py +3 -0
- vllm_ascend/_version.py +21 -0
- vllm_ascend/ascend_config.py +183 -0
- vllm_ascend/ascend_forward_context.py +114 -0
- vllm_ascend/attention/__init__.py +0 -0
- vllm_ascend/attention/attention_mask.py +104 -0
- vllm_ascend/attention/attention_v1.py +477 -0
- vllm_ascend/attention/attention_v1_torchair.py +496 -0
- vllm_ascend/attention/mla_v1.py +1279 -0
- vllm_ascend/compilation/__init__.py +0 -0
- vllm_ascend/compilation/piecewise_backend.py +225 -0
- vllm_ascend/core/__init__.py +0 -0
- vllm_ascend/core/schedule_config.py +74 -0
- vllm_ascend/core/scheduler.py +487 -0
- vllm_ascend/device_allocator/__init__.py +0 -0
- vllm_ascend/device_allocator/camem.py +278 -0
- vllm_ascend/distributed/__init__.py +24 -0
- vllm_ascend/distributed/communication_op.py +25 -0
- vllm_ascend/distributed/communicator.py +96 -0
- vllm_ascend/distributed/device_communicators/__init__.py +0 -0
- vllm_ascend/distributed/device_communicators/pyhccl.py +165 -0
- vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py +253 -0
- vllm_ascend/distributed/llmdatadist_c_mgr_connector.py +894 -0
- vllm_ascend/distributed/parallel_state.py +48 -0
- vllm_ascend/distributed/tensor_parallel.py +248 -0
- vllm_ascend/envs.py +175 -0
- vllm_ascend/libvllm_ascend_kernels.so +0 -0
- vllm_ascend/lora/__init__.py +0 -0
- vllm_ascend/lora/punica_wrapper/__init__.py +0 -0
- vllm_ascend/lora/punica_wrapper/lora_ops.py +112 -0
- vllm_ascend/lora/punica_wrapper/punica_npu.py +364 -0
- vllm_ascend/models/__init__.py +61 -0
- vllm_ascend/models/deepseek_dbo.py +1046 -0
- vllm_ascend/models/deepseek_mtp.py +218 -0
- vllm_ascend/models/deepseek_v2.py +990 -0
- vllm_ascend/models/deepseek_v3.py +27 -0
- vllm_ascend/models/pangu_moe.py +1117 -0
- vllm_ascend/models/qwen2_5_vl.py +499 -0
- vllm_ascend/models/qwen2_5_vl_without_padding.py +377 -0
- vllm_ascend/models/qwen2_vl.py +352 -0
- vllm_ascend/models/qwen3.py +156 -0
- vllm_ascend/models/qwen3_moe.py +388 -0
- vllm_ascend/multistream/__init__.py +0 -0
- vllm_ascend/multistream/base.py +29 -0
- vllm_ascend/multistream/context.py +67 -0
- vllm_ascend/multistream/decorator.py +22 -0
- vllm_ascend/multistream/layers.py +61 -0
- vllm_ascend/multistream/metadata.py +182 -0
- vllm_ascend/multistream/ms_split.py +247 -0
- vllm_ascend/ops/__init__.py +49 -0
- vllm_ascend/ops/activation.py +42 -0
- vllm_ascend/ops/attention.py +309 -0
- vllm_ascend/ops/cache.py +35 -0
- vllm_ascend/ops/comm_utils.py +62 -0
- vllm_ascend/ops/common_fused_moe.py +115 -0
- vllm_ascend/ops/expert_load_balancer.py +99 -0
- vllm_ascend/ops/fused_moe.py +1557 -0
- vllm_ascend/ops/layernorm.py +86 -0
- vllm_ascend/ops/moe_dispatcher/__init__.py +0 -0
- vllm_ascend/ops/moe_dispatcher/token_dispatcher.py +453 -0
- vllm_ascend/ops/rotary_embedding.py +292 -0
- vllm_ascend/ops/sequence_parallel.py +120 -0
- vllm_ascend/ops/vocab_parallel_embedding.py +74 -0
- vllm_ascend/patch/__init__.py +104 -0
- vllm_ascend/patch/platform/__init__.py +25 -0
- vllm_ascend/patch/platform/patch_0_10_0/__init__.py +16 -0
- vllm_ascend/patch/platform/patch_common/__init__.py +18 -0
- vllm_ascend/patch/platform/patch_common/patch_distributed.py +115 -0
- vllm_ascend/patch/platform/patch_main/__init__.py +16 -0
- vllm_ascend/patch/worker/__init__.py +26 -0
- vllm_ascend/patch/worker/patch_0_10_0/__init__.py +18 -0
- vllm_ascend/patch/worker/patch_0_10_0/patch_sampler_gather_logprobs.py +87 -0
- vllm_ascend/patch/worker/patch_common/__init__.py +20 -0
- vllm_ascend/patch/worker/patch_common/patch_distributed.py +49 -0
- vllm_ascend/patch/worker/patch_common/patch_linear.py +145 -0
- vllm_ascend/patch/worker/patch_common/patch_minicpm.py +36 -0
- vllm_ascend/patch/worker/patch_main/__init__.py +16 -0
- vllm_ascend/platform.py +288 -0
- vllm_ascend/quantization/__init__.py +0 -0
- vllm_ascend/quantization/func_wrapper.py +184 -0
- vllm_ascend/quantization/quant_config.py +354 -0
- vllm_ascend/quantization/quantizer.py +311 -0
- vllm_ascend/quantization/w4a8_dynamic.py +396 -0
- vllm_ascend/quantization/w8a8.py +767 -0
- vllm_ascend/quantization/w8a8_dynamic.py +1033 -0
- vllm_ascend/sample/__init__.py +0 -0
- vllm_ascend/sample/rejection_sampler.py +453 -0
- vllm_ascend/sample/sampler.py +65 -0
- vllm_ascend/torchair/__init__.py +0 -0
- vllm_ascend/torchair/torchair_model_runner.py +29 -0
- vllm_ascend/torchair/torchair_worker.py +61 -0
- vllm_ascend/torchair/utils.py +98 -0
- vllm_ascend/utils.py +507 -0
- vllm_ascend/vllm_ascend_C.cpython-310-aarch64-linux-gnu.so +0 -0
- vllm_ascend/worker/__init__.py +0 -0
- vllm_ascend/worker/eagle_proposer_v1.py +384 -0
- vllm_ascend/worker/model_runner_v1.py +2791 -0
- vllm_ascend/worker/mtp_proposer_v1.py +400 -0
- vllm_ascend/worker/npu_input_batch.py +758 -0
- vllm_ascend/worker/worker_v1.py +355 -0
- vllm_ascend-0.10.0rc1.dist-info/LICENSE +201 -0
- vllm_ascend-0.10.0rc1.dist-info/METADATA +130 -0
- vllm_ascend-0.10.0rc1.dist-info/RECORD +107 -0
- vllm_ascend-0.10.0rc1.dist-info/WHEEL +5 -0
- vllm_ascend-0.10.0rc1.dist-info/entry_points.txt +5 -0
- vllm_ascend-0.10.0rc1.dist-info/top_level.txt +1 -0
vllm_ascend/__init__.py
ADDED
@@ -0,0 +1,27 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+
+def register():
+    """Register the NPU platform."""
+
+    return "vllm_ascend.platform.NPUPlatform"
+
+
+def register_model():
+    from .models import register_model
+    register_model()
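Reading note: `register()` does not import anything heavy; it only returns the dotted path of the platform class, which the plugin loader (presumably wired up through this wheel's entry_points.txt) resolves lazily. A minimal, self-contained sketch of that resolution step, not vLLM's actual loader; a stdlib path is used so the snippet runs without the Ascend toolchain:

```python
from importlib import import_module


def resolve_dotted_path(dotted_path: str):
    """Import 'pkg.module.AttrName' and return the attribute object."""
    module_name, _, attr_name = dotted_path.rpartition(".")
    return getattr(import_module(module_name), attr_name)


# With vllm-ascend installed, the string returned by register() resolves the same way:
#   resolve_dotted_path("vllm_ascend.platform.NPUPlatform")
print(resolve_dotted_path("collections.OrderedDict"))  # <class 'collections.OrderedDict'>
```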
vllm_ascend/_version.py
ADDED
@@ -0,0 +1,21 @@
+# file generated by setuptools-scm
+# don't change, don't track in version control
+
+__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple
+    from typing import Union
+
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+else:
+    VERSION_TUPLE = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+
+__version__ = version = '0.10.0rc1'
+__version_tuple__ = version_tuple = (0, 10, 0, 'rc1')
vllm_ascend/ascend_config.py
ADDED
@@ -0,0 +1,183 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+
+from vllm.logger import logger
+
+TORCHAIR_MODEL_LIST = ["deepseek", "pangu", "kimi_k2"]
+
+
+def _check_torchair_supported(model_type: str):
+    for supported_model in TORCHAIR_MODEL_LIST:
+        if supported_model in model_type.lower():
+            return True
+    return False
+
+
+class AscendConfig:
+    """
+    Configuration Object for additional_config from vllm.configs.
+    """
+
+    def __init__(self, vllm_config):
+        additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {}
+
+        torchair_graph_config = additional_config.get("torchair_graph_config",
+                                                      {})
+        self.torchair_graph_config = TorchairGraphConfig(torchair_graph_config)
+
+        ascend_scheduler_config = additional_config.get(
+            "ascend_scheduler_config", {})
+        self.ascend_scheduler_config = AscendSchedulerConfig(
+            ascend_scheduler_config)
+
+        self.expert_map_path = additional_config.get("expert_map_path", None)
+        self.chunked_prefill_for_mla = additional_config.get(
+            "chunked_prefill_for_mla", False)
+
+
+class TorchairGraphConfig:
+    """
+    Configuration Object for torchair_graph_config from additional_config
+    """
+
+    def __init__(self, torchair_graph_config):
+        self.enabled = torchair_graph_config.get("enabled", False)
+        self.use_cached_graph = torchair_graph_config.get(
+            "use_cached_graph", False)
+        self.graph_batch_sizes = torchair_graph_config.get(
+            "graph_batch_sizes", [])
+        self.graph_batch_sizes_init = torchair_graph_config.get(
+            "graph_batch_sizes_init", False)
+        self.enable_multistream_mla = torchair_graph_config.get(
+            "enable_multistream_mla", False)
+        self.enable_multistream_moe = torchair_graph_config.get(
+            "enable_multistream_moe", False)
+        self.enable_view_optimize = torchair_graph_config.get(
+            "enable_view_optimize", True)
+        self.enable_kv_nz = torchair_graph_config.get("enable_kv_nz", False)
+
+        if not isinstance(self.graph_batch_sizes, list):
+            raise TypeError("graph_batch_sizes must be list[int]")
+        if self.graph_batch_sizes_init and len(self.graph_batch_sizes) > 0:
+            raise ValueError(
+                "graph_batch_sizes_init is only valid when graph_batch_sizes is empty"
+            )
+        if not self.enabled:
+            if self.use_cached_graph:
+                raise RuntimeError(
+                    "use_cached_graph is valid only when Torchair graph mode is enabled"
+                )
+            if self.graph_batch_sizes:
+                raise RuntimeError(
+                    "graph_batch_sizes is valid only when Torchair graph mode is enabled"
+                )
+            if self.graph_batch_sizes_init:
+                raise RuntimeError(
+                    "graph_batch_sizes_init is valid only when Torchair graph mode is enabled"
+                )
+            if self.enable_multistream_mla:
+                raise RuntimeError(
+                    "enable_multistream_mla is valid only when Torchair graph mode is enabled"
+                )
+            if self.enable_multistream_moe:
+                raise RuntimeError(
+                    "enable_multistream_moe is valid only when Torchair graph mode is enabled"
+                )
+            if self.enable_kv_nz:
+                raise RuntimeError(
+                    "enable_kv_nz is valid only when Torchair graph mode is enabled"
+                )
+
+
+class AscendSchedulerConfig:
+    """
+    Configuration Object for ascend_scheduler_config from additional_config
+    """
+
+    def __init__(self, ascend_scheduler_config: dict):
+        self.enabled = ascend_scheduler_config.get("enabled", False)
+        # Ascend scheduler is based on vllm v0 scheduler, so we should support
+        # all vllm v0 scheduler configs as well.
+        for k, v in ascend_scheduler_config.items():
+            if not hasattr(self, k):
+                setattr(self, k, v)
+
+
+_ASCEND_CONFIG: Optional[AscendConfig] = None
+
+
+def init_ascend_config(vllm_config):
+    additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {}
+    refresh = additional_config.get("refresh",
+                                    False) if additional_config else False
+    global _ASCEND_CONFIG
+    if _ASCEND_CONFIG is not None and not refresh:
+        return _ASCEND_CONFIG
+    _ASCEND_CONFIG = AscendConfig(vllm_config)
+    return _ASCEND_CONFIG
+
+
+def clear_ascend_config():
+    global _ASCEND_CONFIG
+    _ASCEND_CONFIG = None
+
+
+def get_ascend_config():
+    global _ASCEND_CONFIG
+    if _ASCEND_CONFIG is None:
+        raise RuntimeError(
+            "Ascend config is not initialized. Please call init_ascend_config first."
+        )
+    return _ASCEND_CONFIG
+
+
+def check_ascend_config(vllm_config, enforce_eager):
+    ascend_config = get_ascend_config()
+
+    # for eager mode
+    if enforce_eager:
+        # torchair_graph cannot be enabled with eager mode.
+        if ascend_config.torchair_graph_config.enabled:
+            raise RuntimeError(
+                "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
+            )
+    # for graph mode
+    else:
+        # torchair_graph case
+        if ascend_config.torchair_graph_config.enabled:
+            # torchair_graph is supported for deepseek/pangu model only.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if not _check_torchair_supported(model_type):
+                    raise NotImplementedError(
+                        "Torchair graph mode only works with following model types:"
+                        f"{TORCHAIR_MODEL_LIST}.")
+        # aclgraph case
+        else:
+            # aclgraph doesn't work with deepseek model and only qwen model is well tested.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if "deepseek" in model_type:
+                    raise NotImplementedError(
+                        "ACL Graph does not support deepseek. Please "
+                        "try torchair graph mode to serve deepseek models on vllm-ascend."
+                        " Or set `enforce_eager=True` to use eager mode.")
+                if "qwen" not in model_type:
+                    logger.warning(
+                        "ACL Graph is currently experimental. Please "
+                        "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
+                        " if you encourage any Error")
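Reading note: the keys consumed by AscendConfig above come from vLLM's `additional_config` dict, and TorchairGraphConfig rejects graph-only options whenever graph mode itself is off. The following is a minimal, self-contained sketch that mirrors those validation rules with a hypothetical helper (not the shipped class), so it runs without the Ascend stack:

```python
# Hypothetical stand-in mirroring TorchairGraphConfig's checks shown in the diff above.
GRAPH_ONLY_KEYS = ("use_cached_graph", "graph_batch_sizes", "graph_batch_sizes_init",
                   "enable_multistream_mla", "enable_multistream_moe", "enable_kv_nz")


def validate_torchair_graph_config(cfg: dict) -> None:
    if not isinstance(cfg.get("graph_batch_sizes", []), list):
        raise TypeError("graph_batch_sizes must be list[int]")
    if cfg.get("graph_batch_sizes_init") and cfg.get("graph_batch_sizes"):
        raise ValueError(
            "graph_batch_sizes_init is only valid when graph_batch_sizes is empty")
    if not cfg.get("enabled", False):
        for key in GRAPH_ONLY_KEYS:
            if cfg.get(key):
                raise RuntimeError(
                    f"{key} is valid only when Torchair graph mode is enabled")


# An additional_config fragment using only keys read by AscendConfig above; this passes.
validate_torchair_graph_config({"enabled": True, "graph_batch_sizes": [1, 2, 4, 8]})
```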
vllm_ascend/ascend_forward_context.py
ADDED
@@ -0,0 +1,114 @@
+import math
+from contextlib import contextmanager
+from enum import Enum
+from typing import Any, Optional
+
+import torch
+from vllm.config import VllmConfig
+from vllm.distributed import get_dp_group, get_ep_group, get_tp_group
+from vllm.forward_context import get_forward_context, set_forward_context
+
+import vllm_ascend.envs as envs
+from vllm_ascend.platform import NPUPlatform
+
+
+class FusedMoEState(Enum):
+    AllGather = 0
+    All2All = 1
+    MC2 = 2
+    AllGatherEP = 3
+    NaiveMulticast = 4
+    All2AllSeq = 5
+
+
+# TODO(zzzzwwjj): add soc_version to choose branch
+def _get_fused_moe_state(ep_size: int, with_prefill: bool,
+                         is_deepseek_v3_r1: bool):
+    # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
+    # only supports deepseek v3/r1
+    if (envs.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
+            and is_deepseek_v3_r1):
+        return FusedMoEState.AllGatherEP
+    elif ep_size == 1:
+        if with_prefill:
+            return FusedMoEState.NaiveMulticast
+        else:
+            return FusedMoEState.AllGather
+    elif envs.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ:
+        # MC2 Dispatch/Combine performs better than alltoall_seq in decoding stage.
+        return (FusedMoEState.All2AllSeq if
+                (ep_size < 16 or with_prefill) else FusedMoEState.MC2)
+    # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph.
+    elif ep_size < 16 or with_prefill:
+        return FusedMoEState.All2All
+    else:
+        return FusedMoEState.MC2
+
+
+@contextmanager
+def set_ascend_forward_context(
+        attn_metadata: Any,
+        vllm_config: VllmConfig,
+        virtual_engine: int = 0,
+        num_tokens: Optional[int] = None,
+        num_tokens_across_dp: Optional[torch.Tensor] = None,
+        with_prefill: bool = True,
+        in_profile_run: bool = False,
+        num_actual_tokens: Optional[int] = None,
+):
+    """A context manager that stores the current forward context,
+    can be attention metadata, etc.
+    We add some additional param into forward_context.
+    """
+    with set_forward_context(attn_metadata,
+                             vllm_config,
+                             virtual_engine=virtual_engine,
+                             num_tokens=num_tokens,
+                             num_tokens_across_dp=num_tokens_across_dp):
+        forward_context = get_forward_context()
+        forward_context.with_prefill = with_prefill
+        ep_size = (get_ep_group().world_size if
+                   vllm_config.parallel_config.enable_expert_parallel else 1)
+
+        is_deepseek_v3_r1 = hasattr(
+            vllm_config.model_config.hf_config, 'n_routed_experts'
+        ) and vllm_config.model_config.hf_config.n_routed_experts == 256
+        fused_moe_state = _get_fused_moe_state(ep_size, with_prefill,
+                                               is_deepseek_v3_r1)
+        forward_context.fused_moe_state = fused_moe_state
+        forward_context.in_profile_run = in_profile_run
+
+        # NOTE: This cannot be set using set_forward_context
+        # due to multiple warmups before actual capturing
+        forward_context.capturing = False
+
+        if num_tokens is None and attn_metadata is not None:
+            num_tokens = attn_metadata.num_actual_tokens
+
+        dp_world_size = get_dp_group().world_size
+        if dp_world_size > 1 and forward_context.dp_metadata is not None:
+            max_tokens_across_dp = forward_context.dp_metadata.max_tokens_across_dp_cpu.item(
+            )
+        else:
+            max_tokens_across_dp = num_tokens
+
+        forward_context.max_tokens_across_dp = max_tokens_across_dp
+
+        if num_tokens is not None:
+            if num_actual_tokens is None:
+                num_actual_tokens = num_tokens
+            tp_world_size = get_tp_group().world_size
+            # NOTE: token num which need to pad to when mc2
+            forward_context.padded_num_tokens = math.ceil(
+                max_tokens_across_dp / tp_world_size) * tp_world_size
+
+            mc2_mask = torch.zeros(forward_context.padded_num_tokens,
+                                   dtype=torch.bool,
+                                   device=NPUPlatform.device_type)
+            mc2_mask[:num_actual_tokens] = True
+            forward_context.mc2_mask = mc2_mask
+
+        try:
+            yield
+        finally:
+            pass
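Reading note: the context manager above pads the token count up to a multiple of the TP world size and builds `mc2_mask` so that only the first `num_actual_tokens` slots are marked as real tokens. A small, self-contained illustration of that padding arithmetic, using plain Python lists instead of NPU tensors (purely illustrative):

```python
import math


def mc2_padding(max_tokens_across_dp: int, tp_world_size: int, num_actual_tokens: int):
    # Pad up to the next multiple of the TP world size, as in set_ascend_forward_context.
    padded_num_tokens = math.ceil(max_tokens_across_dp / tp_world_size) * tp_world_size
    # mc2_mask analogue: True for real tokens, False for padding slots.
    mc2_mask = [i < num_actual_tokens for i in range(padded_num_tokens)]
    return padded_num_tokens, mc2_mask


padded, mask = mc2_padding(max_tokens_across_dp=37, tp_world_size=8, num_actual_tokens=30)
print(padded)     # 40
print(sum(mask))  # 30 real-token slots, 10 padding slots
```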
vllm_ascend/attention/__init__.py
ADDED
File without changes
vllm_ascend/attention/attention_mask.py
ADDED
@@ -0,0 +1,104 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+
+
+def _generate_attn_mask(max_seq_len, dtype):
+    # Construct lower triangle matrix.
+    mask_flag = torch.tril(
+        torch.ones((max_seq_len, max_seq_len),
+                   dtype=torch.bool)).view(max_seq_len, max_seq_len)
+    # Create upper triangle matrix used to mark mask positions.
+    mask_flag = ~mask_flag
+    # Currently for fp16 dtype, the mask value should be set to -inf.
+    # TODO: Eliminate this part in the future.
+    if dtype == torch.float16:
+        mask_value = torch.finfo(torch.float32).min
+    else:
+        mask_value = 1
+    attn_mask = torch.masked_fill(torch.zeros(size=(max_seq_len, max_seq_len)),
+                                  mask_flag, mask_value).to(dtype)
+    return attn_mask
+
+
+class AttentionMaskBuilder:
+
+    def __init__(
+        self,
+        max_seq_len: int,
+        dtype: torch.dtype,
+    ):
+        attn_mask = _generate_attn_mask(max_seq_len, dtype)
+
+        self._seq_len_cached = attn_mask.shape[0]
+        self.attn_mask_cache = attn_mask
+        self.splitfuse_mask_value = -10000
+
+    def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype,
+                      device: torch.device):
+        self._update_attn_cache(max_seq_len, dtype, device)
+        return self.attn_mask_cache[:max_seq_len, :max_seq_len].contiguous()
+
+    def get_splitfuse_attn_mask(
+        self,
+        seq_lens,
+        query_lens,
+        position,
+        dtype,
+        device,
+    ) -> torch.Tensor:
+        max_seq_len = max(seq_lens, default=0)
+        if max_seq_len <= self._seq_len_cached:
+            self._update_attn_cache(max_seq_len, dtype, device)
+            # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation
+            # is not the same. Fix this in the future when kernel is ready.
+            if self.attn_mask_cache.numel(
+            ) > 1 and self.attn_mask_cache[0][1] > 0:
+                attn_mask = self.get_attn_mask(  # type: ignore
+                    max_seq_len, dtype, device)
+                # Do not use in-place multiplication to avoid modifying `self.attn_mask_cache`!
+                attn_mask = attn_mask * -10000
+            else:
+                attn_mask = self.attn_mask_cache
+            return torch.index_select(attn_mask, dim=0,
+                                      index=position)[:, :max_seq_len]
+        total_q_len = sum(query_lens)
+        attn_mask = torch.zeros((total_q_len, max_seq_len),
+                                dtype=dtype,
+                                device="cpu")
+        current_row = 0
+        for i in range(len(query_lens)):
+            seq_len = seq_lens[i]
+            q_len = query_lens[i]
+            context_len = seq_len - q_len
+
+            assert context_len >= 0
+            attn_mask[current_row:current_row + q_len,
+                      context_len:] = self.splitfuse_mask_value
+            right_tensor = attn_mask[current_row:current_row + q_len,
+                                     context_len:seq_len]
+            right_tensor.masked_fill_(
+                right_tensor.tril() == self.splitfuse_mask_value, 0)
+            current_row += q_len
+
+        return attn_mask.to(device, non_blocking=True)
+
+    def _update_attn_cache(self, seqlen: int, dtype: torch.dtype,
+                           device: torch.device):
+        if seqlen > self._seq_len_cached:
+            self._seq_len_cached = seqlen
+            self.attn_mask_cache = _generate_attn_mask(seqlen, dtype)
+        if self.attn_mask_cache.device != device:
+            self.attn_mask_cache = self.attn_mask_cache.to(device)
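Reading note: `_generate_attn_mask` above builds a causal mask whose strict upper triangle holds the mask value (a large negative value for fp16, `1` otherwise; the split-fuse path later scales it to `-10000`). The same few torch ops, inlined for a tiny sequence length so the shape of the result is visible (illustrative snippet, runs with stock PyTorch on CPU):

```python
import torch

max_seq_len, dtype = 4, torch.bfloat16
# Strictly-upper-triangle positions are the ones to mask.
mask_flag = ~torch.tril(torch.ones((max_seq_len, max_seq_len), dtype=torch.bool))
mask_value = torch.finfo(torch.float32).min if dtype == torch.float16 else 1
attn_mask = torch.zeros((max_seq_len, max_seq_len)).masked_fill(mask_flag, mask_value).to(dtype)
print(attn_mask)
# tensor([[0., 1., 1., 1.],
#         [0., 0., 1., 1.],
#         [0., 0., 0., 1.],
#         [0., 0., 0., 0.]], dtype=torch.bfloat16)
```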