PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py CHANGED Viewed

@@ -229,7 +229,10 @@ class Wav2Vec2BertConfig(PreTrainedConfig):
         conformer_conv_dropout=0.1,
         **kwargs,
     ):
-        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.intermediate_size = intermediate_size

transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py CHANGED Viewed

@@ -267,7 +267,10 @@ class Wav2Vec2ConformerConfig(PreTrainedConfig):
         conformer_conv_dropout=0.1,
         **kwargs,
     ):
-        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.hidden_size = hidden_size
         self.feat_extract_norm = feat_extract_norm
         self.feat_extract_activation = feat_extract_activation

transformers/models/wavlm/configuration_wavlm.py CHANGED Viewed

@@ -248,7 +248,10 @@ class WavLMConfig(PreTrainedConfig):
         output_hidden_size=None,
         **kwargs,
     ):
-        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.hidden_size = hidden_size
         self.feat_extract_norm = feat_extract_norm
         self.feat_extract_activation = feat_extract_activation

transformers/models/wavlm/modeling_wavlm.py CHANGED Viewed

@@ -26,7 +26,7 @@ from ...modeling_outputs import (
     Wav2Vec2BaseModelOutput,
     XVectorOutput,
 )
-from ...modeling_utils import PreTrainedModel
+from ...modeling_utils import PreTrainedModel, get_torch_context_manager_or_global_device
 from ...utils import auto_docstring, is_peft_available, logging
 from .configuration_wavlm import WavLMConfig
@@ -1141,6 +1141,9 @@ class WavLMForCTC(WavLMPreTrainedModel):
         This method is **not** supposed to be called by the user and is prone to be changed in the future.
         """
+        if get_torch_context_manager_or_global_device() == torch.device("meta"):
+            return
         # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to
         # correctly load adapter layers for WavLM so that we do not have to introduce a new API to
         # [`PreTrainedModel`]. While slightly hacky, WavLM never has to tie input and output embeddings, so that it is

transformers/models/whisper/configuration_whisper.py CHANGED Viewed

@@ -224,6 +224,7 @@ class WhisperConfig(PreTrainedConfig):
         mask_feature_length=10,
         mask_feature_min_masks=0,
         median_filter_width=7,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -262,13 +263,14 @@ class WhisperConfig(PreTrainedConfig):
         self.mask_feature_min_masks = mask_feature_min_masks
         self.median_filter_width = median_filter_width
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
             is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
             suppress_tokens=suppress_tokens,
             begin_suppress_tokens=begin_suppress_tokens,
             **kwargs,

transformers/models/whisper/generation_whisper.py CHANGED Viewed

@@ -647,7 +647,6 @@ class WhisperGenerationMixin(GenerationMixin):
         """
         # 1. prepare generation config
-        generation_config = self.generation_config if generation_config is None else generation_config
         generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs)
         # 2. set global generate variables

transformers/models/whisper/modeling_whisper.py CHANGED Viewed

@@ -338,9 +338,9 @@ class WhisperAttention(nn.Module):
                     key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                 )
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers/models/x_clip/configuration_x_clip.py CHANGED Viewed

@@ -95,7 +95,10 @@ class XCLIPTextConfig(PreTrainedConfig):
         eos_token_id=2,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size

transformers/models/x_clip/modeling_x_clip.py CHANGED Viewed

@@ -27,14 +27,16 @@ from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepa
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
 from ...utils import (
     ModelOutput,
+    TransformersKwargs,
     auto_docstring,
     can_return_tuple,
-    filter_out_non_signature_kwargs,
     logging,
     torch_int,
 )
+from ...utils.generic import is_flash_attention_requested
 from .configuration_x_clip import XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig
@@ -289,7 +291,7 @@ class XCLIPAttention(nn.Module):
         values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
         # CLIP text model uses both `causal_attention_mask` and `attention_mask`
         # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
-        if self.config._attn_implementation != "flash_attention_2":
+        if not is_flash_attention_requested(self.config):
             if attention_mask is not None and causal_attention_mask is not None:
                 attention_mask = attention_mask + causal_attention_mask
             elif causal_attention_mask is not None:
@@ -297,9 +299,9 @@ class XCLIPAttention(nn.Module):
         else:
             self.is_causal = causal_attention_mask is not None
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -1201,19 +1203,16 @@ class XCLIPModel(XCLIPPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
-    @filter_out_non_signature_kwargs()
+    @can_return_tuple
     @auto_docstring
     def get_text_features(
         self,
         input_ids: torch.Tensor | None = None,
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
-    ) -> torch.FloatTensor:
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
         r"""
-        Returns:
-            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
-            applying the projection layer to the pooled output of [`XCLIPTextModel`].
         Examples:
         ```python
@@ -1231,22 +1230,22 @@ class XCLIPModel(XCLIPPreTrainedModel):
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
+            return_dict=True,
+            **kwargs,
         )
-        text_features = self.text_projection(text_outputs.pooler_output)
-        return text_features
+        pooled_output = text_outputs.pooler_output
+        text_outputs.pooler_output = self.text_projection(pooled_output)
+        return text_outputs
-    @filter_out_non_signature_kwargs()
+    @can_return_tuple
     @auto_docstring
     def get_video_features(
         self,
         pixel_values: torch.Tensor,
-    ) -> torch.FloatTensor:
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
         r"""
-        Returns:
-            video_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The video embeddings obtained by
-            applying the projection layer to the pooled output of [`XCLIPVisionModel`] and
-            [`XCLIPMultiframeIntegrationTransformer`].
         Examples:
         ```python
@@ -1319,17 +1318,17 @@ class XCLIPModel(XCLIPPreTrainedModel):
         batch_size, num_frames, num_channels, height, width = pixel_values.shape
         pixel_values = pixel_values.reshape(-1, num_channels, height, width)
-        vision_outputs: BaseModelOutputWithPooling = self.vision_model(pixel_values=pixel_values)
-        video_embeds = vision_outputs.pooler_output
+        video_outputs: BaseModelOutputWithPooling = self.vision_model(
+            pixel_values=pixel_values, return_dict=True, **kwargs
+        )
+        video_embeds = video_outputs.pooler_output
         video_embeds = self.visual_projection(video_embeds)
         cls_features = video_embeds.view(batch_size, num_frames, -1)
+        mit_outputs: BaseModelOutputWithPooling = self.mit(cls_features, return_dict=True, **kwargs)
+        video_outputs.pooler_output = mit_outputs.pooler_output
-        mit_outputs: BaseModelOutputWithPooling = self.mit(cls_features)
-        video_embeds = mit_outputs.pooler_output
-        return video_embeds
+        return video_outputs
     @auto_docstring
     def forward(

transformers/models/xglm/configuration_xglm.py CHANGED Viewed

@@ -109,8 +109,11 @@ class XGLMConfig(PreTrainedConfig):
         pad_token_id=1,
         bos_token_id=0,
         eos_token_id=2,
+        add_cross_attention=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
+        self.add_cross_attention = add_cross_attention
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.d_model = d_model
@@ -125,14 +128,13 @@ class XGLMConfig(PreTrainedConfig):
         self.init_std = init_std
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
         self.use_cache = use_cache
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        self.decoder_start_token_id = decoder_start_token_id
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            decoder_start_token_id=decoder_start_token_id,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 __all__ = ["XGLMConfig"]

transformers/models/xlm/configuration_xlm.py CHANGED Viewed

@@ -138,6 +138,9 @@ class XLMConfig(PreTrainedConfig):
         "num_attention_heads": "n_heads",
         "num_hidden_layers": "n_layers",
         "n_words": "vocab_size",  # For backward compatibility
+        "bos_index": "bos_token_id",
+        "eos_index": "eos_token_id",
+        "pad_index": "pad_token_id",
     }
     def __init__(
@@ -158,9 +161,6 @@ class XLMConfig(PreTrainedConfig):
         embed_init_std=2048**-0.5,
         layer_norm_eps=1e-12,
         init_std=0.02,
-        bos_index=0,
-        eos_index=1,
-        pad_index=2,
         unk_index=3,
         mask_index=5,
         is_encoder=True,
@@ -175,6 +175,8 @@ class XLMConfig(PreTrainedConfig):
         lang_id=0,
         pad_token_id=2,
         bos_token_id=0,
+        eos_token_id=1,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         """Constructs XLMConfig."""
@@ -191,9 +193,6 @@ class XLMConfig(PreTrainedConfig):
         self.n_langs = n_langs
         self.use_lang_emb = use_lang_emb
         self.layer_norm_eps = layer_norm_eps
-        self.bos_index = bos_index
-        self.eos_index = eos_index
-        self.pad_index = pad_index
         self.unk_index = unk_index
         self.mask_index = mask_index
         self.is_encoder = is_encoder
@@ -209,11 +208,15 @@ class XLMConfig(PreTrainedConfig):
         self.end_n_top = end_n_top
         self.mask_token_id = mask_token_id
         self.lang_id = lang_id
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         if "n_words" in kwargs:
             self.n_words = kwargs["n_words"]
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
+        super().__init__(**kwargs)
 __all__ = ["XLMConfig"]

transformers/models/xlm/modeling_xlm.py CHANGED Viewed

@@ -798,7 +798,7 @@ class XLMModel(XLMPreTrainedModel):
             if input_ids is not None:
                 lengths = (input_ids != self.pad_index).sum(dim=1).long()
             else:
-                lengths = torch.tensor([slen] * bs, device=device)
+                lengths = torch.full((bs,), slen, device=device, dtype=torch.long)
         # check inputs
         assert lengths.size(0) == bs

transformers/models/xlm_roberta/configuration_xlm_roberta.py CHANGED Viewed

@@ -104,10 +104,19 @@ class XLMRobertaConfig(PreTrainedConfig):
         eos_token_id=2,
         use_cache=True,
         classifier_dropout=None,
+        is_decoder=False,
+        add_cross_attention=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        self.is_decoder = is_decoder
+        self.add_cross_attention = add_cross_attention
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers

transformers/models/xlm_roberta/modeling_xlm_roberta.py CHANGED Viewed

@@ -242,9 +242,9 @@ class XLMRobertaSelfAttention(nn.Module):
                 {"cache_position": cache_position},
             )
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -319,9 +319,9 @@ class XLMRobertaCrossAttention(nn.Module):
                 # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                 past_key_values.is_updated[self.layer_idx] = True
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py CHANGED Viewed

@@ -101,9 +101,18 @@ class XLMRobertaXLConfig(PreTrainedConfig):
         eos_token_id=2,
         use_cache=True,
         classifier_dropout=None,
+        is_decoder=False,
+        add_cross_attention=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.is_decoder = is_decoder
+        self.add_cross_attention = add_cross_attention
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers

transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py CHANGED Viewed

@@ -244,9 +244,9 @@ class XLMRobertaXLSelfAttention(nn.Module):
                 {"cache_position": cache_position},
             )
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -321,9 +321,9 @@ class XLMRobertaXLCrossAttention(nn.Module):
                 # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                 past_key_values.is_updated[self.layer_idx] = True
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers/models/xlnet/configuration_xlnet.py CHANGED Viewed

@@ -165,6 +165,7 @@ class XLNetConfig(PreTrainedConfig):
         pad_token_id=5,
         bos_token_id=1,
         eos_token_id=2,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         """Constructs XLNetConfig."""
@@ -204,10 +205,11 @@ class XLNetConfig(PreTrainedConfig):
         self.bos_token_id = bos_token_id
         self.pad_token_id = pad_token_id
         self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         self.use_mems_eval = use_mems_eval
         self.use_mems_train = use_mems_train
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(**kwargs)
     @property
     def max_position_embeddings(self):

transformers/models/xlstm/configuration_xlstm.py CHANGED Viewed

@@ -231,13 +231,11 @@ class xLSTMConfig(PreTrainedConfig):
         self.eos_token_id = eos_token_id
         self.max_inference_chunksize = max_inference_chunksize
-        super().__init__(
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            pad_token_id=pad_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(**kwargs)
     @property
     def qk_dim(self):

transformers/models/xlstm/modeling_xlstm.py CHANGED Viewed

@@ -1538,38 +1538,6 @@ class xLSTMForCausalLM(xLSTMPreTrainedModel, GenerationMixin):
     def set_input_embeddings(self, new_embeddings):
         return self.backbone.set_input_embeddings(new_embeddings)
-    def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        attention_mask=None,  # not used but needed, otherwise generate complains when passing tokenizer inputs
-        inputs_embeds=None,
-        use_cache=None,
-        cache_params: xLSTMCache | None = None,
-        **kwargs,
-    ):
-        if use_cache and cache_params is not None:
-            # If the first cache position is non-zero, we assume we are in generation mode.
-            # Thus, the cache_params state is assumed to be the state before the last token
-            # (lastly generated token), and all previous tokens are already ingested.
-            # This should as well support generation from scratch with the [BOS] token inserted first.
-            input_ids = input_ids[:, -1:]
-            if inputs_embeds is not None:
-                inputs_embeds = inputs_embeds[:, -1:]
-        if inputs_embeds is not None and cache_params is None:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids}
-        model_inputs.update({"cache_params": cache_params, "use_cache": use_cache})
-        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
-        for key, value in kwargs.items():
-            if key not in model_inputs:
-                model_inputs[key] = value
-        return model_inputs
     @can_return_tuple
     @auto_docstring
     def forward(

transformers/models/xmod/configuration_xmod.py CHANGED Viewed

@@ -126,10 +126,19 @@ class XmodConfig(PreTrainedConfig):
         ln_before_adapter=True,
         languages=("en_XX",),
         default_language=None,
+        is_decoder=False,
+        add_cross_attention=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        self.is_decoder = is_decoder
+        self.add_cross_attention = add_cross_attention
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers

transformers/models/xmod/modeling_xmod.py CHANGED Viewed

@@ -239,9 +239,9 @@ class XmodSelfAttention(nn.Module):
                 {"cache_position": cache_position},
             )
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -317,9 +317,9 @@ class XmodCrossAttention(nn.Module):
                 # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                 past_key_values.is_updated[self.layer_idx] = True
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -449,9 +449,6 @@ class XmodOutput(nn.Module):
         return hidden_states
     def lang_adapter(self, lang_ids: torch.Tensor, hidden_states: torch.Tensor):
-        # Process subsequent samples with the same lang_id in parallel
-        lang_ids, lang_lengths = torch.unique_consecutive(lang_ids, return_counts=True)
         if not self.ln_before_adapter:
             residual = hidden_states
@@ -463,14 +460,14 @@ class XmodOutput(nn.Module):
         if self.ln_before_adapter:
             residual = hidden_states
-        split_hidden_states = torch.split(hidden_states, lang_lengths.tolist(), 0)
-        lang_wise_outputs = []
-        for i, (lang_id, split_hidden_state) in enumerate(zip(lang_ids, split_hidden_states)):
-            lang = list(self.adapter_modules.keys())[int(lang_id.item())]
-            lang_wise_outputs.append(self.adapter_modules[lang](split_hidden_state))
-        hidden_states = torch.cat(lang_wise_outputs, 0)
+        new_hidden_states = torch.zeros_like(hidden_states)
+        for adapter_idx, lang_key in enumerate(self.adapter_modules.keys()):
+            lang_mask = lang_ids == adapter_idx
+            lang_hidden_states = hidden_states[lang_mask]
+            adapted_lang_hidden_states = self.adapter_modules[lang_key](lang_hidden_states)
+            new_hidden_states[lang_mask] = adapted_lang_hidden_states
-        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.dropout(new_hidden_states)
         hidden_states += residual
         return hidden_states

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl