PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/models/mt5/modeling_mt5.py CHANGED Viewed

@@ -36,12 +36,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import (
-    DUMMY_INPUTS,
-    DUMMY_MASK,
-    auto_docstring,
-    logging,
-)
+from ...utils import DUMMY_INPUTS, DUMMY_MASK, auto_docstring, logging, torch_compilable_check
 from .configuration_mt5 import MT5Config
@@ -1404,8 +1399,10 @@ class MT5ForSequenceClassification(MT5PreTrainedModel):
         eos_mask = input_ids.eq(self.config.eos_token_id).to(sequence_output.device)
-        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
-            raise ValueError("All examples must have the same number of <eos> tokens.")
+        torch_compilable_check(
+            torch.unique_consecutive(eos_mask.sum(1)).numel() == 1,
+            "All examples must have the same number of <eos> tokens.",
+        )
         batch_size, _, hidden_size = sequence_output.shape
         sentence_representation = sequence_output[eos_mask, :].view(batch_size, -1, hidden_size)[:, -1, :]
         logits = self.classification_head(sentence_representation)

transformers/models/musicgen/configuration_musicgen.py CHANGED Viewed

@@ -100,8 +100,14 @@ class MusicgenDecoderConfig(PreTrainedConfig):
         bos_token_id=2048,
         eos_token_id=None,
         tie_word_embeddings=False,
+        is_decoder=False,
+        add_cross_attention=False,
+        cross_attention_hidden_size=None,
         **kwargs,
     ):
+        self.is_decoder = is_decoder
+        self.add_cross_attention = add_cross_attention
+        self.cross_attention_hidden_size = cross_attention_hidden_size
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -122,13 +128,11 @@ class MusicgenDecoderConfig(PreTrainedConfig):
             raise ValueError(f"Expected 1 (mono) or 2 (stereo) audio channels, got {audio_channels} channels.")
         self.audio_channels = audio_channels
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(**kwargs)
 class MusicgenConfig(PreTrainedConfig):
@@ -211,6 +215,7 @@ class MusicgenConfig(PreTrainedConfig):
         self.audio_encoder = audio_encoder
         self.decoder = decoder
         self.initializer_factor = self.decoder.initializer_factor
+        self.tie_encoder_decoder = kwargs.get("tie_encoder_decoder", False)
         kwargs["is_encoder_decoder"] = True
         super().__init__(**kwargs)

transformers/models/musicgen/modeling_musicgen.py CHANGED Viewed

@@ -56,6 +56,7 @@ from ...modeling_outputs import (
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, is_torch_flex_attn_available, logging
+from ...utils.generic import is_flash_attention_requested
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_auto import AutoModel
 from .configuration_musicgen import MusicgenConfig, MusicgenDecoderConfig
@@ -279,9 +280,9 @@ class MusicgenAttention(nn.Module):
                 if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                     past_key_values.is_updated[self.layer_idx] = True
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -629,7 +630,7 @@ class MusicgenDecoder(MusicgenPreTrainedModel):
         inputs_embeds: torch.Tensor,
         past_key_values_length: int,
     ):
-        if self.config._attn_implementation == "flash_attention_2":
+        if is_flash_attention_requested(self.config):
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
         elif self.config._attn_implementation == "sdpa":
@@ -668,7 +669,7 @@ class MusicgenDecoder(MusicgenPreTrainedModel):
     ):
         # expand encoder attention mask
         if encoder_hidden_states is not None and encoder_attention_mask is not None:
-            if self.config._attn_implementation == "flash_attention_2":
+            if is_flash_attention_requested(self.config):
                 encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
             elif self.config._attn_implementation == "sdpa":
                 encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(

transformers/models/musicgen_melody/configuration_musicgen_melody.py CHANGED Viewed

@@ -74,6 +74,11 @@ class MusicgenMelodyDecoderConfig(PreTrainedConfig):
         bos_token_id (`int`, *optional*, defaults to 2048): The id of the *beginning-of-sequence* token.
         eos_token_id (`int`, *optional*): The id of the *end-of-sequence* token.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie word embeddings with the text encoder.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on
+            decoder-only or encoder-only architectures.
+        add_cross_attention (`bool`, *optional*, defaults to `False`):
+            Whether cross-attention layers should be added to the model.
     """
     model_type = "musicgen_melody_decoder"
@@ -102,8 +107,12 @@ class MusicgenMelodyDecoderConfig(PreTrainedConfig):
         bos_token_id=2048,
         eos_token_id=None,
         tie_word_embeddings=False,
+        is_decoder=False,
+        add_cross_attention=False,
         **kwargs,
     ):
+        self.is_decoder = is_decoder
+        self.add_cross_attention = add_cross_attention
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -124,13 +133,11 @@ class MusicgenMelodyDecoderConfig(PreTrainedConfig):
             raise ValueError(f"Expected 1 (mono) or 2 (stereo) audio channels, got {audio_channels} channels.")
         self.audio_channels = audio_channels
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(**kwargs)
 class MusicgenMelodyConfig(PreTrainedConfig):
@@ -225,6 +232,7 @@ class MusicgenMelodyConfig(PreTrainedConfig):
         self.decoder = decoder
         self.num_chroma = num_chroma
         self.chroma_length = chroma_length
+        self.tie_encoder_decoder = kwargs.get("tie_encoder_decoder", False)
         kwargs["is_encoder_decoder"] = False
         super().__init__(**kwargs)

transformers/models/musicgen_melody/modeling_musicgen_melody.py CHANGED Viewed

@@ -13,7 +13,6 @@
 # limitations under the License.
 """PyTorch Musicgen Melody model."""
-import copy
 import inspect
 import math
 import random
@@ -48,6 +47,7 @@ from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, is_torch_flex_attn_available, logging
+from ...utils.generic import is_flash_attention_requested
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_auto import AutoModel, AutoModelForTextEncoding
 from .configuration_musicgen_melody import MusicgenMelodyConfig, MusicgenMelodyDecoderConfig
@@ -286,9 +286,9 @@ class MusicgenMelodyAttention(nn.Module):
                 if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                     past_key_values.is_updated[self.layer_idx] = True
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -601,7 +601,7 @@ class MusicgenMelodyDecoder(MusicgenMelodyPreTrainedModel):
         inputs_embeds: torch.Tensor,
         past_key_values_length: int,
     ):
-        if self.config._attn_implementation == "flash_attention_2":
+        if is_flash_attention_requested(self.config):
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
         elif self.config._attn_implementation == "sdpa":
@@ -1102,12 +1102,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin):
                     - [`~generation.GenerateBeamEncoderDecoderOutput`]
         """
         # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
-        if generation_config is None:
-            generation_config = self.generation_config
-        generation_config = copy.deepcopy(generation_config)
-        model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
-        generation_config.validate()
+        generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         self._validate_model_kwargs(model_kwargs.copy())
         # 2. Set generation parameters if not already defined
@@ -2043,12 +2038,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin):
                     - [`~generation.GenerateBeamEncoderDecoderOutput`]
         """
         # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
-        if generation_config is None:
-            generation_config = self.generation_config
-        generation_config = copy.deepcopy(generation_config)
-        model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
-        generation_config.validate()
+        generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         self._validate_model_kwargs(model_kwargs.copy())
         # 2. Set generation parameters if not already defined

transformers/models/mvp/configuration_mvp.py CHANGED Viewed

@@ -129,8 +129,12 @@ class MvpConfig(PreTrainedConfig):
         use_prompt=False,
         prompt_length=100,
         prompt_mid_dim=800,
+        is_decoder=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
+        self.is_decoder = is_decoder
+        self.tie_word_embeddings = tie_word_embeddings
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.d_model = d_model
@@ -155,12 +159,12 @@ class MvpConfig(PreTrainedConfig):
         self.prompt_length = prompt_length
         self.prompt_mid_dim = prompt_mid_dim
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.decoder_start_token_id = decoder_start_token_id
         super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
             is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
             **kwargs,
         )

transformers/models/mvp/modeling_mvp.py CHANGED Viewed

@@ -38,7 +38,7 @@ from ...modeling_outputs import (
     Seq2SeqSequenceClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, logging, torch_compilable_check
 from .configuration_mvp import MvpConfig
@@ -1299,8 +1299,10 @@ class MvpForSequenceClassification(MvpPreTrainedModel):
         eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device)
-        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
-            raise ValueError("All examples must have the same number of <eos> tokens.")
+        torch_compilable_check(
+            torch.unique_consecutive(eos_mask.sum(1)).numel() == 1,
+            "All examples must have the same number of <eos> tokens.",
+        )
         sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
             :, -1, :
         ]
@@ -1571,7 +1573,7 @@ class MvpForCausalLM(MvpPreTrainedModel, GenerationMixin):
         >>> from transformers import AutoTokenizer, MvpForCausalLM
         >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
-        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", add_cross_attention=False)
+        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp")
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)

transformers/models/nanochat/configuration_nanochat.py CHANGED Viewed

@@ -144,14 +144,12 @@ class NanoChatConfig(PretrainedConfig):
         self.final_logit_softcapping = final_logit_softcapping
         self.attention_bias = attention_bias
         self.rope_parameters = rope_parameters
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
-        super().__init__(
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            pad_token_id=pad_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 __all__ = ["NanoChatConfig"]

transformers/models/nanochat/modeling_nanochat.py CHANGED Viewed

@@ -250,9 +250,9 @@ class NanoChatAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -443,7 +443,7 @@ class NanoChatModel(NanoChatPreTrainedModel):
 @auto_docstring
 class NanoChatForCausalLM(NanoChatPreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     def __init__(self, config):

transformers/models/nanochat/modular_nanochat.py CHANGED Viewed

@@ -92,9 +92,9 @@ class NanoChatAttention(Qwen3Attention):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -208,7 +208,7 @@ class NanoChatModel(LlamaModel):
 @auto_docstring
 class NanoChatForCausalLM(Gemma2ForCausalLM):
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     def forward(self, **super_kwargs) -> CausalLMOutputWithPast:
         r"""

transformers/models/nemotron/configuration_nemotron.py CHANGED Viewed

@@ -142,13 +142,11 @@ class NemotronConfig(PreTrainedConfig):
         self.rope_parameters = rope_parameters
         kwargs.setdefault("partial_rotary_factor", 0.5)  # assign default for BC
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(**kwargs)
 __all__ = ["NemotronConfig"]

transformers/models/nemotron/modeling_nemotron.py CHANGED Viewed

@@ -44,7 +44,7 @@ from ...modeling_rope_utils import (
 )
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
-from ...utils.generic import maybe_autocast
+from ...utils.generic import is_flash_attention_requested, maybe_autocast
 from .configuration_nemotron import NemotronConfig
@@ -61,12 +61,7 @@ def _cast_if_autocast_enabled(device_type, *args):
     if not torch.is_autocast_enabled():
         return args
     else:
-        # NOTE: `torch.get_autocast_dtype` is there starting from PyTorch 2.4
-        target_dtype = (
-            torch.get_autocast_dtype(device_type)
-            if hasattr(torch, "get_autocast_dtype")
-            else torch.get_autocast_gpu_dtype()
-        )
+        target_dtype = torch.get_autocast_dtype(device_type)
         return torch.amp.autocast_mode._cast(args, device_type, target_dtype)
@@ -387,12 +382,7 @@ class NemotronFlashAttention2(NemotronAttention):
         device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
         if input_dtype == torch.float32:
             if torch.is_autocast_enabled():
-                # NOTE: `torch.get_autocast_dtype` is there starting from PyTorch 2.4
-                target_dtype = (
-                    torch.get_autocast_dtype(device_type)
-                    if hasattr(torch, "get_autocast_dtype")
-                    else torch.get_autocast_gpu_dtype()
-                )
+                target_dtype = torch.get_autocast_dtype(device_type)
             # Handle the case where the model is quantized
             elif hasattr(self.config, "_is_quantized"):
                 target_dtype = self.config.dtype
@@ -743,7 +733,7 @@ class NemotronModel(NemotronPreTrainedModel):
         past_key_values: Cache,
         output_attentions: bool = False,
     ):
-        if self.config._attn_implementation == "flash_attention_2":
+        if is_flash_attention_requested(self.config):
             if attention_mask is not None and (attention_mask == 0.0).any():
                 return attention_mask
             return None

transformers/models/nllb/tokenization_nllb.py CHANGED Viewed

@@ -100,11 +100,16 @@ class NllbTokenizer(TokenizersBackend):
         src_lang=None,
         tgt_lang=None,
         additional_special_tokens=None,
+        extra_special_tokens=None,
         legacy_behaviour=False,
         **kwargs,
     ):
-        if additional_special_tokens is None:
-            additional_special_tokens = kwargs.get("extra_special_tokens", FAIRSEQ_LANGUAGE_CODES)
+        # V5: extra_special_tokens takes precedence over additional_special_tokens (deprecated)
+        # Handle case where both are passed (ie. from config and user override)
+        if extra_special_tokens is not None:
+            additional_special_tokens = extra_special_tokens
+        elif additional_special_tokens is None:
+            additional_special_tokens = FAIRSEQ_LANGUAGE_CODES
         mask_token = (
             AddedToken(mask_token, normalized=True, lstrip=True, special=True)
@@ -145,9 +150,6 @@ class NllbTokenizer(TokenizersBackend):
         self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
         self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
-        # Remove extra_special_tokens from kwargs if present to avoid conflict
-        kwargs.pop("extra_special_tokens", None)
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,

transformers/models/nllb_moe/configuration_nllb_moe.py CHANGED Viewed

@@ -165,6 +165,7 @@ class NllbMoeConfig(PreTrainedConfig):
         pad_token_id=1,
         bos_token_id=0,
         eos_token_id=2,
+        tie_word_embeddings=True,
         output_router_logits=False,
         **kwargs,
     ):
@@ -205,15 +206,12 @@ class NllbMoeConfig(PreTrainedConfig):
         self.moe_eval_capacity_token_fraction = moe_eval_capacity_token_fraction
         self.moe_token_dropout = moe_token_dropout
         self.output_router_logits = output_router_logits
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
-            **kwargs,
-        )
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 __all__ = ["NllbMoeConfig"]

transformers/models/nllb_moe/modeling_nllb_moe.py CHANGED Viewed

@@ -495,9 +495,9 @@ class NllbMoeAttention(nn.Module):
                 if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                     past_key_values.is_updated[self.layer_idx] = True
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers/models/nougat/image_processing_nougat_fast.py CHANGED Viewed

@@ -16,7 +16,7 @@
 from typing import Optional
 import torch
-from torchvision.transforms.v2 import functional as F
+import torchvision.transforms.v2.functional as tvF
 from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import (
@@ -102,7 +102,7 @@ class NougatImageProcessorFast(BaseImageProcessorFast):
             gray_threshold (`int`, *optional*, defaults to `200`)
                 Value below which pixels are considered to be gray.
         """
-        data = F.rgb_to_grayscale(image, num_output_channels=1)
+        data = tvF.rgb_to_grayscale(image, num_output_channels=1)
         max_val = torch.max(data)
         min_val = torch.min(data)
@@ -176,7 +176,7 @@ class NougatImageProcessorFast(BaseImageProcessorFast):
         new_size = (height, width)
-        return F.resize(image, new_size, interpolation=F.InterpolationMode.BICUBIC)
+        return tvF.resize(image, new_size, interpolation=tvF.InterpolationMode.BICUBIC)
     def pad_images(
         self,
@@ -205,13 +205,13 @@ class NougatImageProcessorFast(BaseImageProcessorFast):
         pad_right = delta_width - pad_left
         padding = (pad_left, pad_top, pad_right, pad_bottom)
-        return F.pad(image, padding)
+        return tvF.pad(image, padding)
     def resize(
         self,
         image: "torch.Tensor",
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         antialias: bool = True,
         **kwargs,
     ) -> "torch.Tensor":
@@ -229,14 +229,14 @@ class NougatImageProcessorFast(BaseImageProcessorFast):
         Returns:
             `torch.Tensor`: The resized image.
         """
-        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BICUBIC
+        interpolation = interpolation if interpolation is not None else tvF.InterpolationMode.BICUBIC
         shortest_edge = min(size["height"], size["width"])
         new_size = get_resize_output_image_size(
             image, size=shortest_edge, default_to_square=False, input_data_format=ChannelDimension.FIRST
         )
-        return F.resize(image, new_size, interpolation=interpolation, antialias=antialias)
+        return tvF.resize(image, new_size, interpolation=interpolation, antialias=antialias)
     def _preprocess(
         self,
@@ -246,7 +246,7 @@ class NougatImageProcessorFast(BaseImageProcessorFast):
         do_align_long_axis: bool,
         do_thumbnail: bool,
         do_pad: bool,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,

transformers/models/nystromformer/configuration_nystromformer.py CHANGED Viewed

@@ -107,8 +107,15 @@ class NystromformerConfig(PreTrainedConfig):
         pad_token_id=1,
         bos_token_id=0,
         eos_token_id=2,
+        add_cross_attention=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
+        self.add_cross_attention = add_cross_attention
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -125,7 +132,7 @@ class NystromformerConfig(PreTrainedConfig):
         self.conv_kernel_size = conv_kernel_size
         self.inv_coeff_init_option = inv_coeff_init_option
         self.layer_norm_eps = layer_norm_eps
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(**kwargs)
 __all__ = ["NystromformerConfig"]

transformers/models/olmo/configuration_olmo.py CHANGED Viewed

@@ -157,13 +157,11 @@ class OlmoConfig(PreTrainedConfig):
         self.clip_qkv = clip_qkv
         self.rope_parameters = rope_parameters
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(**kwargs)
 __all__ = ["OlmoConfig"]

transformers/models/olmo/modeling_olmo.py CHANGED Viewed

@@ -270,9 +270,9 @@ class OlmoAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -436,7 +436,7 @@ class OlmoModel(OlmoPreTrainedModel):
 @auto_docstring
 class OlmoForCausalLM(OlmoPreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     def __init__(self, config):

transformers/models/olmo/modular_olmo.py CHANGED Viewed

@@ -144,9 +144,9 @@ class OlmoAttention(LlamaAttention):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl