transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
--- a/transformers/models/bamba/modeling_bamba.py
+++ b/transformers/models/bamba/modeling_bamba.py
@@ -35,7 +35,7 @@ from transformers.activations import ACT2FN
 from ... import initialization as init
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
+from ...integrations import lazy_load_kernel, use_kernel_forward_from_hub, use_kernelized_func
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -44,22 +44,9 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
 from ...utils.generic import maybe_autocast
-from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
 from .configuration_bamba import BambaConfig


-if is_mamba_2_ssm_available():
-    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
-    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
-else:
-    selective_state_update = None
-
-if is_causal_conv1d_available():
-    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
-else:
-    causal_conv1d_update, causal_conv1d_fn = None, None
-
-
 logger = logging.get_logger(__name__)


@@ -212,7 +199,7 @@ class BambaRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -501,9 +488,6 @@ def apply_mask_to_padding_states(hidden_states, attention_mask):
     return hidden_states


-is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))
-
-
 # Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer
 class BambaMixer(nn.Module):
     """
@@ -575,6 +559,20 @@ class BambaMixer(nn.Module):

         self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)

+        global causal_conv1d_update, causal_conv1d_fn
+        causal_conv1d = lazy_load_kernel("causal-conv1d")
+        causal_conv1d_update = getattr(causal_conv1d, "causal_conv1d_update", None)
+        causal_conv1d_fn = getattr(causal_conv1d, "causal_conv1d_fn", None)
+
+        global selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
+        mamba_ssm = lazy_load_kernel("mamba-ssm")
+        selective_state_update = getattr(mamba_ssm, "selective_state_update", None)
+        mamba_chunk_scan_combined = getattr(mamba_ssm, "mamba_chunk_scan_combined", None)
+        mamba_split_conv1d_scan_combined = getattr(mamba_ssm, "mamba_split_conv1d_scan_combined", None)
+
+        global is_fast_path_available
+        is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))
+
         if not is_fast_path_available:
             logger.warning_once(
                 "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
@@ -1489,6 +1487,7 @@ class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin):
         cache_position=None,
         position_ids=None,
         use_cache=True,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
@@ -1521,7 +1520,7 @@ class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin):
             position_ids = position_ids[:, -input_ids.shape[1] :]

         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and
+        if inputs_embeds is not None and is_first_iteration:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
             model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
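The BambaMixer hunk above replaces import-time probing of optional CUDA kernels with lazy resolution the first time a mixer is constructed. A minimal sketch of that pattern, assuming only what the diff shows (`lazy_load_kernel` returns a module-like object when the kernel is installed and something falsy otherwise; the real helper in transformers/integrations/hub_kernels.py may behave differently, e.g. by fetching from the kernels hub):

    import importlib

    _kernel_cache = {}

    def lazy_load_kernel(name):
        # Hypothetical stand-in: import the backing package on first request
        # and cache the module (or None when it is not installed).
        if name not in _kernel_cache:
            try:
                _kernel_cache[name] = importlib.import_module(name.replace("-", "_"))
            except ImportError:
                _kernel_cache[name] = None
        return _kernel_cache[name]

    # Mirrors the BambaMixer.__init__ hunk: resolve the symbols, then derive the flag.
    mamba_ssm = lazy_load_kernel("mamba-ssm")
    selective_state_update = getattr(mamba_ssm, "selective_state_update", None)
    is_fast_path_available = selective_state_update is not None

The upshot is that importing the model module no longer pays for (or fails on) the optional dependencies; only instantiating a mixer does.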
--- a/transformers/models/bamba/modular_bamba.py
+++ b/transformers/models/bamba/modular_bamba.py
@@ -43,6 +43,7 @@ from transformers.models.mamba2.modeling_mamba2 import (
 )

 from ... import initialization as init
+from ...integrations import lazy_load_kernel
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_utils import PreTrainedModel
@@ -52,24 +53,9 @@ from ...utils import (
     can_return_tuple,
     logging,
 )
-from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
 from .configuration_bamba import BambaConfig


-if is_mamba_2_ssm_available():
-    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
-    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
-else:
-    selective_state_update = None
-
-if is_causal_conv1d_available():
-    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
-else:
-    causal_conv1d_update, causal_conv1d_fn = None, None
-
-is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))
-
-
 logger = logging.get_logger(__name__)


@@ -276,6 +262,20 @@ class BambaMixer(nn.Module):

         self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)

+        global causal_conv1d_update, causal_conv1d_fn
+        causal_conv1d = lazy_load_kernel("causal-conv1d")
+        causal_conv1d_update = getattr(causal_conv1d, "causal_conv1d_update", None)
+        causal_conv1d_fn = getattr(causal_conv1d, "causal_conv1d_fn", None)
+
+        global selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
+        mamba_ssm = lazy_load_kernel("mamba-ssm")
+        selective_state_update = getattr(mamba_ssm, "selective_state_update", None)
+        mamba_chunk_scan_combined = getattr(mamba_ssm, "mamba_chunk_scan_combined", None)
+        mamba_split_conv1d_scan_combined = getattr(mamba_ssm, "mamba_split_conv1d_scan_combined", None)
+
+        global is_fast_path_available
+        is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))
+
         if not is_fast_path_available:
             logger.warning_once(
                 "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
@@ -1151,6 +1151,7 @@ class BambaForCausalLM(LlamaForCausalLM):
         cache_position=None,
         position_ids=None,
         use_cache=True,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
@@ -1183,7 +1184,7 @@ class BambaForCausalLM(LlamaForCausalLM):
             position_ids = position_ids[:, -input_ids.shape[1] :]

         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and
+        if inputs_embeds is not None and is_first_iteration:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
             model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
--- a/transformers/models/bark/modeling_bark.py
+++ b/transformers/models/bark/modeling_bark.py
@@ -23,6 +23,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F

+from ... import initialization as init
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
 from ...generation.logits_process import (
@@ -349,6 +350,14 @@ class BarkPreTrainedModel(PreTrainedModel):

         return super().device

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, BarkSelfAttention):
+            if module.is_causal:
+                block_size = module.config.block_size
+                bias = torch.tril(torch.ones((block_size, block_size), dtype=bool)).view(1, 1, block_size, block_size)
+                init.copy_(module.bias, bias)
+

 # GPT2-like autoregressive model
 class BarkCausalModel(BarkPreTrainedModel, GenerationMixin):
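Bark's new `_init_weights` override rebuilds the causal attention bias as a lower-triangular boolean mask. What that buffer looks like, with an illustrative `block_size` of 4 (Bark reads the real value from its config):

    import torch

    block_size = 4
    bias = torch.tril(torch.ones((block_size, block_size), dtype=torch.bool)).view(1, 1, block_size, block_size)
    print(bias[0, 0].int())
    # tensor([[1, 0, 0, 0],
    #         [1, 1, 0, 0],
    #         [1, 1, 1, 0],
    #         [1, 1, 1, 1]])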
--- a/transformers/models/bart/modeling_bart.py
+++ b/transformers/models/bart/modeling_bart.py
@@ -23,6 +23,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -476,6 +477,11 @@ class BartPreTrainedModel(PreTrainedModel):

     _can_compile_fullgraph = True

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, BartForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
     @property
     def dummy_inputs(self):
         pad_token = self.config.pad_token_id
@@ -1463,6 +1469,7 @@ class BartDecoderWrapper(BartPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = BartDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
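This Bart hunk is the template for a pattern that recurs in the BigBirdPegasus, Blenderbot, and BlenderbotSmall hunks below: buffers such as `final_logits_bias` are now (re)initialized centrally in `_init_weights`, and thin wrapper modules call `post_init()` so the hook actually runs for them. A schematic sketch of why the `post_init()` call matters (this is not the real PreTrainedModel machinery, just the shape of it):

    import torch
    from torch import nn

    class TinyPreTrained(nn.Module):
        # Stand-in for PreTrainedModel: post_init() walks submodules and
        # applies _init_weights, which is why wrappers must call it.
        def post_init(self):
            self.apply(self._init_weights)

        def _init_weights(self, module):
            if isinstance(module, TinyForConditionalGeneration):
                nn.init.zeros_(module.final_logits_bias)

    class TinyForConditionalGeneration(TinyPreTrained):
        def __init__(self):
            super().__init__()
            self.register_buffer("final_logits_bias", torch.randn(1, 10))
            self.post_init()  # without this call, _init_weights never runs here

    print(TinyForConditionalGeneration().final_logits_bias.abs().sum())  # tensor(0.)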
--- a/transformers/models/beit/image_processing_beit_fast.py
+++ b/transformers/models/beit/image_processing_beit_fast.py
@@ -163,7 +163,6 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
            processed_images_grouped[shape] = stacked_images

        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-       processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
--- a/transformers/models/bert/modeling_bert.py
+++ b/transformers/models/bert/modeling_bert.py
@@ -569,6 +569,9 @@ class BertPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, BertLMPredictionHead):
             init.zeros_(module.bias)
+        elif isinstance(module, BertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)


 @dataclass
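The same embeddings branch is added for BertGeneration, BigBird, and BLIP below. For reference, the value it writes into `position_ids` is simply a row of absolute positions (shown here with a toy maximum length of 6; the real length comes from the model's `max_position_embeddings`):

    import torch

    position_ids = torch.arange(6).expand((1, -1))
    print(position_ids)  # tensor([[0, 1, 2, 3, 4, 5]])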
--- a/transformers/models/bert_generation/modeling_bert_generation.py
+++ b/transformers/models/bert_generation/modeling_bert_generation.py
@@ -463,6 +463,8 @@ class BertGenerationPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, BertGenerationOnlyLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, BertGenerationEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 @auto_docstring(
--- a/transformers/models/big_bird/modeling_big_bird.py
+++ b/transformers/models/big_bird/modeling_big_bird.py
@@ -1521,6 +1521,9 @@ class BigBirdPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, BigBirdLMPredictionHead):
             init.zeros_(module.bias)
+        elif isinstance(module, BigBirdEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)


 @dataclass
--- a/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
+++ b/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
@@ -23,6 +23,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -1536,6 +1537,11 @@ class BigBirdPegasusPreTrainedModel(PreTrainedModel):
     _skip_keys_device_placement = "past_key_values"
     _can_compile_fullgraph = True

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, BigBirdPegasusForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
     @property
     def dummy_inputs(self):
         pad_token = self.config.pad_token_id
@@ -2582,6 +2588,7 @@ class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = BigBirdPegasusDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
--- a/transformers/models/bit/modeling_bit.py
+++ b/transformers/models/bit/modeling_bit.py
@@ -84,7 +84,7 @@ class WeightStandardizedConv2d(nn.Conv2d):
     """Conv2d with Weight Standardization. Used for ViT Hybrid model.

     Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight
-    Standardization](https://huggingface.co/papers/1903.
+    Standardization](https://huggingface.co/papers/1903.10520)
     """

     def __init__(
@@ -643,6 +643,10 @@ class BitPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
             init.constant_(module.weight, 1)
             init.constant_(module.bias, 0)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)


 @auto_docstring
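The `getattr(module, "running_mean", None)` guard exists because the branch covers both norm types: `nn.BatchNorm2d` tracks running statistics while `nn.GroupNorm` has no such buffers, so the reset must only happen when they exist. A quick check:

    from torch import nn

    bn = nn.BatchNorm2d(8)
    gn = nn.GroupNorm(2, 8)
    print(bn.running_mean is not None)        # True  -> stats get reset
    print(getattr(gn, "running_mean", None))  # None  -> guard skips the reset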
--- a/transformers/models/bitnet/modeling_bitnet.py
+++ b/transformers/models/bitnet/modeling_bitnet.py
@@ -287,7 +287,7 @@ class BitNetRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
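Both rotary-embedding hunks (Bamba above and BitNet here) turn `original_inv_freq` from a plain tensor attribute into a non-persistent buffer. The practical difference, demonstrated outside transformers: buffers follow the module through device and dtype moves, and `persistent=False` keeps them out of the checkpoint:

    import torch
    from torch import nn

    class Demo(nn.Module):
        def __init__(self):
            super().__init__()
            inv_freq = torch.ones(4)
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    m = Demo().to(torch.float64)
    print(m.original_inv_freq.dtype)    # torch.float64 -- cast along with the module
    print(list(m.state_dict().keys()))  # [] -- non-persistent buffers are not saved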
--- a/transformers/models/blenderbot/modeling_blenderbot.py
+++ b/transformers/models/blenderbot/modeling_blenderbot.py
@@ -24,6 +24,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -437,6 +438,11 @@ class BlenderbotPreTrainedModel(PreTrainedModel):
     _supports_flex_attn = True
     _can_compile_fullgraph = True

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, BlenderbotForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
     @property
     def dummy_inputs(self):
         pad_token = self.config.pad_token_id
@@ -1156,6 +1162,7 @@ class BlenderbotDecoderWrapper(BlenderbotPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = BlenderbotDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
@@ -160,13 +160,6 @@ class BlenderbotTokenizer(TokenizersBackend):

         self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
         self._tokenizer.decoder = decoders.ByteLevel()
-        self._tokenizer.post_processor = processors.RobertaProcessing(
-            sep=(str(eos_token), self._vocab.get(str(eos_token), 2)),
-            cls=(str(bos_token), self._vocab.get(str(bos_token), 0)),
-            add_prefix_space=add_prefix_space,
-            trim_offsets=True,
-        )
-
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
@@ -178,6 +171,12 @@
             add_prefix_space=add_prefix_space,
             **kwargs,
         )
+        self._tokenizer.post_processor = processors.RobertaProcessing(
+            sep=(str(eos_token), self.eos_token_id),
+            cls=(str(bos_token), self.bos_token_id),
+            add_prefix_space=add_prefix_space,
+            trim_offsets=True,
+        )


 __all__ = ["BlenderbotTokenizer"]
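Note: moving the `RobertaProcessing` attachment after `super().__init__(...)` lets the code read the special-token ids from the fully constructed tokenizer (`self.eos_token_id`, `self.bos_token_id`) instead of probing a raw vocab dict with hard-coded fallbacks of 2 and 0, which would silently mislabel the ids for a vocab that orders them differently. A standalone sketch with the `tokenizers` library:

    from tokenizers import Tokenizer, models, pre_tokenizers, processors

    tok = Tokenizer(models.BPE())
    tok.add_special_tokens(["<s>", "</s>"])
    tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

    # Look the ids up on the tokenizer itself rather than assuming 0 and 2.
    tok.post_processor = processors.RobertaProcessing(
        sep=("</s>", tok.token_to_id("</s>")),
        cls=("<s>", tok.token_to_id("<s>")),
        add_prefix_space=True,
        trim_offsets=True,
    )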
@@ -22,6 +22,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -430,6 +431,11 @@ class BlenderbotSmallPreTrainedModel(PreTrainedModel):
     _supports_flex_attn = True
     _can_compile_fullgraph = True

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, BlenderbotSmallForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
     @property
     def dummy_inputs(self):
         pad_token = self.config.pad_token_id
@@ -1116,6 +1122,7 @@ class BlenderbotSmallDecoderWrapper(BlenderbotSmallPreTrainedModel):
    def __init__(self, config):
         super().__init__(config)
         self.decoder = BlenderbotSmallDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
@@ -430,6 +430,8 @@ class BlipPreTrainedModel(PreTrainedModel):
             std = self.config.vision_config.initializer_range
             init.trunc_normal_(module.position_embedding, mean=0.0, std=std)
             init.trunc_normal_(module.class_embedding, mean=0.0, std=std)
+        elif isinstance(module, BlipTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 class BlipEncoder(nn.Module):
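Note: `init.copy_(module.position_ids, torch.arange(...).expand((1, -1)))`, used here and in the BLIP text and BLIP-2 hunks below, rebuilds the canonical position-id buffer: a single row `[0, 1, ..., n-1]` with a leading batch dimension, where `expand` adds that dimension without copying memory. For example:

    import torch

    max_positions = 6
    source = torch.arange(max_positions).expand((1, -1))   # tensor([[0, 1, 2, 3, 4, 5]])

    position_ids = torch.zeros(1, max_positions, dtype=torch.long)
    position_ids.copy_(source)                             # shape stays (1, 6)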
@@ -21,6 +21,7 @@ import torch
 from torch import Tensor, device, nn
 from torch.nn import CrossEntropyLoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -504,6 +505,11 @@ class BlipTextPreTrainedModel(PreTrainedModel):
     base_model_prefix = "bert"
     _no_split_modules = []

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, BlipTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+

 # Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571
 class BlipTextModel(BlipTextPreTrainedModel):
@@ -740,6 +746,8 @@ class BlipTextLMHeadModel(BlipTextPreTrainedModel, GenerationMixin):
         self.cls = BlipTextOnlyMLMHead(config)
         self.label_smoothing = config.label_smoothing

+        self.post_init()
+
     def get_input_embeddings(self):
         return self.bert.get_input_embeddings()

@@ -428,6 +428,8 @@ class Blip2PreTrainedModel(PreTrainedModel):
             ),
         ):
             init.zeros_(module.query_tokens)
+        elif isinstance(module, Blip2TextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 # Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->Blip2
@@ -714,36 +714,21 @@ class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin):
         inputs_embeds=None,
         cache_position=None,
         use_cache=True,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten because of the fixed-shape attention mask creation

-
-
-
-
-
-
-
-
-
-
-            elif (
-                inputs_embeds is not None  # Exception 1
-                or cache_position[-1] >= input_ids.shape[1]  # Exception 3
-            ):
-                input_ids = input_ids[:, -cache_position.shape[0] :]
-            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
-                input_ids = input_ids[:, cache_position]
-
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]:
-            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
-        else:
-            # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the
-            # input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in
-            # the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture.
-            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
+        )

         # This part differs from other models because BLOOM needs a 2D mask to construct alibi tensor
         # The only difference is the usage of 2D instead of 4D mask, but the shape will be static
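Note: the rewrite drops the hand-rolled cache slicing and `inputs_embeds` bookkeeping and delegates to the shared `prepare_inputs_for_generation`, keeping only what is genuinely Bloom-specific (the 2D alibi mask patched in the next hunk). A toy sketch of the delegation shape, with illustrative classes rather than the real ones:

    class GenerationBase:
        def prepare_inputs_for_generation(self, input_ids, **kwargs):
            # Shared logic lives here: cache slicing, inputs_embeds handling,
            # forwarding of leftover kwargs, and so on.
            return {"input_ids": input_ids, **kwargs}

    class BloomLikeForCausalLM(GenerationBase):
        def prepare_inputs_for_generation(self, input_ids, **kwargs):
            # Delegate the generic work, then patch only the model-specific
            # entry (for Bloom, the fixed-shape 2D attention mask).
            model_inputs = super().prepare_inputs_for_generation(input_ids, **kwargs)
            return model_inputs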
@@ -753,24 +738,8 @@
             diff = target_length - seq_length

             new_attn_mask = torch.zeros(batch_size, diff, device=attention_mask.device, dtype=attention_mask.dtype)
-            attention_mask = torch.cat(
-                [attention_mask, new_attn_mask],
-                dim=-1,
-            )
-
-        model_inputs.update(
-            {
-                "cache_position": cache_position,
-                "past_key_values": past_key_values,
-                "use_cache": use_cache,
-                "attention_mask": attention_mask,
-            }
-        )
-
-        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
-        for key, value in kwargs.items():
-            if key not in model_inputs:
-                model_inputs[key] = value
+            attention_mask = torch.cat([attention_mask, new_attn_mask], dim=-1)
+            model_inputs["attention_mask"] = attention_mask

         return model_inputs
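Note: the simplified tail collapses a multi-line `torch.cat(...)` plus a manual `model_inputs.update(...)` and kwargs-forwarding loop (both now covered by the `super()` call above) into one concatenation. Concretely, right-padding a 2D mask to a fixed target length looks like this:

    import torch

    attention_mask = torch.ones(2, 5, dtype=torch.long)  # (batch_size, seq_length)
    target_length = 8
    diff = target_length - attention_mask.shape[1]

    # Zero-pad on the right so the mask shape stays static across decoding steps.
    new_attn_mask = torch.zeros(2, diff, device=attention_mask.device, dtype=attention_mask.dtype)
    attention_mask = torch.cat([attention_mask, new_attn_mask], dim=-1)
    assert attention_mask.shape == (2, 8)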