PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py CHANGED Viewed

@@ -48,7 +48,7 @@ from ...image_utils import (
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
+from ...modeling_outputs import BaseModelOutputWithPooling, MoeCausalLMOutputWithPast, MoeModelOutputWithPast
 from ...modeling_rope_utils import dynamic_rope_update
 from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
@@ -80,6 +80,7 @@ from ..qwen2_5_vl.modeling_qwen2_5_vl import (
     Qwen2_5_VisionRotaryEmbedding,
     Qwen2_5_VLModel,
     Qwen2_5_VLPreTrainedModel,
+    Qwen2_5_VLVisionAttention,
     Qwen2_5_VLVisionBlock,
 )
 from ..qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
@@ -212,8 +213,6 @@ class Ernie4_5_VL_MoeTextConfig(Ernie4_5_MoeConfig, PreTrainedConfig):
             relevant if `config.is_decoder=True`.
         use_bias (`bool`, *optional*, defaults to `False`):
             Whether to use a bias in any of the projections including mlp and attention for example.
-        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-            Whether the model's input and output word embeddings should be tied.
         rope_parameters (`RopeParameters`, *optional*):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
@@ -235,6 +234,12 @@ class Ernie4_5_VL_MoeTextConfig(Ernie4_5_MoeConfig, PreTrainedConfig):
             allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
         router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
             The aux loss factor for the total loss.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        eos_token_id (`int`, *optional*):
+            End of stream token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
     """
     model_type = "ernie4_5_vl_moe_text"
@@ -267,7 +272,6 @@ class Ernie4_5_VL_MoeTextConfig(Ernie4_5_MoeConfig, PreTrainedConfig):
         rms_norm_eps=1e-5,
         use_cache=True,
         use_bias=False,
-        tie_word_embeddings=True,
         rope_parameters=None,
         mlp_layer_types=None,
         moe_intermediate_size=None,
@@ -277,6 +281,9 @@ class Ernie4_5_VL_MoeTextConfig(Ernie4_5_MoeConfig, PreTrainedConfig):
         moe_norm_min=1e-12,
         output_router_logits=False,
         router_aux_loss_coef=0.001,
+        pad_token_id=None,
+        eos_token_id=None,
+        bos_token_id=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -308,10 +315,11 @@ class Ernie4_5_VL_MoeTextConfig(Ernie4_5_MoeConfig, PreTrainedConfig):
         self.moe_norm_min = moe_norm_min
         self.output_router_logits = output_router_logits
         self.router_aux_loss_coef = router_aux_loss_coef
+        self.pad_token_id = pad_token_id
+        self.eos_token_id = eos_token_id
+        self.bos_token_id = bos_token_id
-        PreTrainedConfig.__init__(
-            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
-        )
+        PreTrainedConfig.__init__(ignore_keys_at_rope_validation={"mrope_section"}, **kwargs)
 class Ernie4_5_VL_MoeConfig(PreTrainedConfig):
@@ -341,6 +349,8 @@ class Ernie4_5_VL_MoeConfig(PreTrainedConfig):
             The video token index to encode the end of video.
         video_token_id (`int`, *optional*, defaults to 103367):
             The video token index to encode the video prompt.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied.
     ```python
     >>> from transformers import Ernie4_5_VL_MoeForConditionalGeneration, Ernie4_5_VL_MoeConfig
@@ -369,6 +379,7 @@ class Ernie4_5_VL_MoeConfig(PreTrainedConfig):
         video_start_token_id=101306,
         video_end_token_id=101307,
         video_token_id=103367,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         if isinstance(vision_config, dict):
@@ -391,6 +402,7 @@ class Ernie4_5_VL_MoeConfig(PreTrainedConfig):
         self.video_start_token_id = video_start_token_id
         self.video_end_token_id = video_end_token_id
         self.video_token_id = video_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(**kwargs)
@@ -688,6 +700,23 @@ class Ernie4_5_VL_MoeDecoderLayer(GradientCheckpointingLayer):
         return hidden_states
+class Ernie4_5_VL_MoeVisionAttention(Qwen2_5_VLVisionAttention):
+    pass
+class Ernie4_5_VL_MoeVisionBlock(Qwen2_5_VLVisionBlock):
+    def __init__(self, config) -> None:
+        super().__init__(config, None)
+        self.norm1 = nn.LayerNorm(config.hidden_size, config.rms_norm_eps)
+        self.norm2 = nn.LayerNorm(config.hidden_size, config.rms_norm_eps)
+        self.mlp = Ernie4_5VLVisionMLP(
+            dim=config.hidden_size,
+            hidden_dim=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
 class Ernie4_5_VL_MoePreTrainedModel(Qwen2_5_VLPreTrainedModel):
     _can_compile_fullgraph = False
@@ -830,20 +859,13 @@ class Ernie4_5_VL_MoeVisionRotaryEmbedding(Qwen2_5_VisionRotaryEmbedding):
     pass
-class Ernie4_5_VL_MoeVisionBlock(Qwen2_5_VLVisionBlock):
-    def __init__(self, config) -> None:
-        super().__init__(config, None)
-        self.norm1 = nn.LayerNorm(config.hidden_size, config.rms_norm_eps)
-        self.norm2 = nn.LayerNorm(config.hidden_size, config.rms_norm_eps)
-        self.mlp = Ernie4_5VLVisionMLP(
-            dim=config.hidden_size,
-            hidden_dim=config.intermediate_size,
-            hidden_act=config.hidden_act,
-        )
 class Ernie4_5_VL_MoeVisionTransformerPretrainedModel(Qwen2VisionTransformerPretrainedModel):
+    _can_record_outputs = {
+        "router_logits": OutputRecorder(Ernie4_5_VL_MoeMoeBlock, index=1),
+        "hidden_states": Ernie4_5_VL_MoeVisionBlock,
+        "attentions": Ernie4_5_VL_MoeVisionAttention,
+    }
     def __init__(self, config) -> None:
         super().__init__(config)
@@ -866,12 +888,10 @@ class Ernie4_5_VL_MoeVisionTransformerPretrainedModel(Qwen2VisionTransformerPret
     def get_device(self):
         raise AttributeError("Ernie 4.5 VL Moe does not need this!")
+    @check_model_inputs
     def forward(
-        self,
-        hidden_states: torch.Tensor,
-        grid_thw: torch.Tensor,
-        **kwargs,
-    ) -> torch.Tensor:
+        self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
+    ) -> tuple | BaseModelOutputWithPooling:
         hidden_states = self.patch_embed(hidden_states)
         rotary_pos_emb = self.rot_pos_emb(grid_thw)
         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
@@ -895,7 +915,7 @@ class Ernie4_5_VL_MoeVisionTransformerPretrainedModel(Qwen2VisionTransformerPret
                 **kwargs,
             )
         hidden_states = self.ln(hidden_states)
-        return hidden_states
+        return BaseModelOutputWithPooling(last_hidden_state=hidden_states)
 class Ernie4_5_VL_MoeVisionMLP(nn.Module):
@@ -1248,43 +1268,39 @@ class Ernie4_5_VL_MoeModel(Qwen2_5_VLModel):
             return position_ids, mrope_position_deltas
+    @can_return_tuple
+    @auto_docstring
     def get_video_features(
-        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None
-    ):
-        """
-        Encodes videos into continuous embeddings that can be forwarded to the language model.
-        Args:
-            pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
-                The tensors corresponding to the input videos.
-            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
-                The temporal, height and width of feature shape of each video in LLM.
-        """
-        video_embeds = self.vision_tower(pixel_values_videos, video_grid_thw)
-        video_embeds = self.resampler_model(video_embeds, video_grid_thw)
+        self,
+        pixel_values_videos: torch.FloatTensor,
+        video_grid_thw: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        video_outputs = self.vision_tower(pixel_values_videos, video_grid_thw, return_dict=True, **kwargs)
+        video_embeds = self.resampler_model(video_outputs.last_hidden_state, video_grid_thw)
         split_sizes = (
             video_grid_thw.prod(-1)
             // self.vision_tower.spatial_merge_size**2
             // self.resampler_model.temporal_merge_size
         ).tolist()
         video_embeds = torch.split(video_embeds, split_sizes)
-        return video_embeds
-    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None):
-        """
-        Encodes images into continuous embeddings that can be forwarded to the language model.
+        video_outputs.pooler_output = video_embeds
+        return video_outputs
-        Args:
-            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
-                The tensors corresponding to the input images.
-            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
-                The temporal, height and width of feature shape of each image in LLM.
-        """
-        image_embeds = self.vision_tower(pixel_values, image_grid_thw)
-        image_embeds = self.resampler_model(image_embeds, image_grid_thw)
+    @can_return_tuple
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_grid_thw: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
+        image_outputs = self.vision_tower(pixel_values, image_grid_thw, return_dict=True, **kwargs)
+        image_embeds = self.resampler_model(image_outputs.last_hidden_state, image_grid_thw)
         split_sizes = (image_grid_thw.prod(-1) // self.vision_tower.spatial_merge_size**2).tolist()
         image_embeds = torch.split(image_embeds, split_sizes)
-        return image_embeds
+        image_outputs.pooler_output = image_embeds
+        return image_outputs
     @auto_docstring
     @can_return_tuple
@@ -1322,7 +1338,7 @@ class Ernie4_5_VL_MoeModel(Qwen2_5_VLModel):
             inputs_embeds = self.get_input_embeddings()(input_ids)
         if pixel_values is not None:
-            image_embeds = self.get_image_features(pixel_values, image_grid_thw)
+            image_embeds = self.get_image_features(pixel_values, image_grid_thw, return_dict=True).pooler_output
             image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
             image_mask, _ = self.get_placeholder_mask(
                 input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
@@ -1330,7 +1346,7 @@ class Ernie4_5_VL_MoeModel(Qwen2_5_VLModel):
             inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
         if pixel_values_videos is not None:
-            video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
+            video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw, return_dict=True).pooler_output
             video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
             _, video_mask = self.get_placeholder_mask(
                 input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
@@ -1381,6 +1397,14 @@ class Ernie4_5_VL_MoeForConditionalGeneration(Glm4vForConditionalGeneration, Gen
         self.num_experts = config.text_config.moe_num_experts
         self.num_experts_per_tok = config.text_config.moe_k
+    @auto_docstring
+    def get_video_features(self, **super_kwargs):
+        return super().get_video_features(**super_kwargs)
+    @auto_docstring
+    def get_image_features(self, **super_kwargs):
+        return super().get_image_features(**super_kwargs)
     def prepare_inputs_for_generation(
         self,
         input_ids,

transformers/models/esm/configuration_esm.py CHANGED Viewed

@@ -101,16 +101,6 @@ class TrunkConfig:
         if self.max_recycles <= 0:
             raise ValueError(f"`max_recycles` should be positive, got {self.max_recycles}.")
-        if self.sequence_state_dim % self.sequence_state_dim != 0:
-            raise ValueError(
-                "`sequence_state_dim` should be a round multiple of `sequence_state_dim`, got"
-                f" {self.sequence_state_dim} and {self.sequence_state_dim}."
-            )
-        if self.pairwise_state_dim % self.pairwise_state_dim != 0:
-            raise ValueError(
-                "`pairwise_state_dim` should be a round multiple of `pairwise_state_dim`, got"
-                f" {self.pairwise_state_dim} and {self.pairwise_state_dim}."
-            )
         sequence_num_heads = self.sequence_state_dim // self.sequence_head_width
         pairwise_num_heads = self.pairwise_state_dim // self.pairwise_head_width
@@ -267,10 +257,18 @@ class EsmConfig(PreTrainedConfig):
         is_folding_model=False,
         esmfold_config=None,
         vocab_list=None,
+        is_decoder=False,
+        add_cross_attention=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, mask_token_id=mask_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.is_decoder = is_decoder
+        self.add_cross_attention = add_cross_attention
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.mask_token_id = mask_token_id
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers

transformers/models/esm/modeling_esm.py CHANGED Viewed

@@ -343,9 +343,9 @@ class EsmSelfAttention(nn.Module):
         if self.position_embedding_type == "rotary":
             query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers/models/esm/modeling_esmfold.py CHANGED Viewed

@@ -136,12 +136,7 @@ class EsmForProteinFoldingOutput(ModelOutput):
 def is_fp16_enabled(device_type):
     # Autocast world
-    # NOTE: `torch.get_autocast_dtype` is there starting from PyTorch 2.4
-    autocast_dtype = (
-        torch.get_autocast_dtype(device_type)
-        if hasattr(torch, "get_autocast_dtype")
-        else torch.get_autocast_gpu_dtype()
-    )
+    autocast_dtype = torch.get_autocast_dtype(device_type)
     fp16_enabled = autocast_dtype == torch.float16
     fp16_enabled = fp16_enabled and torch.is_autocast_enabled()

transformers/models/esm/openfold_utils/protein.py CHANGED Viewed

@@ -81,9 +81,8 @@ def from_proteinnet_string(proteinnet_str: str) -> Protein:
     for g in groups:
         if g[0] == "[PRIMARY]":
             seq = g[1][0].strip()
-            for i in range(len(seq)):
-                if seq[i] not in residue_constants.restypes:
-                    seq[i] = "X"  # FIXME: strings are immutable
+            # Replace unknown residues with "X" (strings are immutable, so convert to list first)
+            seq = [char if char in residue_constants.restypes else "X" for char in seq]
             aatype = np.array(
                 [residue_constants.restype_order.get(res_symbol, residue_constants.restype_num) for res_symbol in seq]
             )

transformers/models/evolla/configuration_evolla.py CHANGED Viewed

@@ -77,10 +77,16 @@ class SaProtConfig(PreTrainedConfig):
         position_embedding_type="rotary",
         emb_layer_norm_before=False,
         token_dropout=True,
+        is_decoder=False,
+        add_cross_attention=False,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, mask_token_id=mask_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.mask_token_id = mask_token_id
+        self.is_decoder = is_decoder
+        self.add_cross_attention = add_cross_attention
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
@@ -169,6 +175,11 @@ class EvollaConfig(PreTrainedConfig):
             Whether or not the model should return the last key/values attentions (not used by all models).
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether or not to tie the input and output word embeddings.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on
+            decoder-only or encoder-only architectures.
+        add_cross_attention (`bool`, *optional*, defaults to `False`):
+            Whether cross-attention layers should be added to the model.
     Example:
@@ -220,8 +231,12 @@ class EvollaConfig(PreTrainedConfig):
         eos_token_id: int | None = 128009,
         use_cache: bool | None = False,
         tie_word_embeddings: bool | None = False,
+        is_decoder: bool | None = False,
+        add_cross_attention: bool | None = False,
         **kwargs,
     ):
+        self.is_decoder = is_decoder
+        self.add_cross_attention = add_cross_attention
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
@@ -255,13 +270,11 @@ class EvollaConfig(PreTrainedConfig):
             logger.info("`protein_encoder_config` is `None`. Initializing the `SaProtConfig` with default values.")
         self.protein_encoder_config = SaProtConfig(**protein_encoder_config)
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(**kwargs)
 __all__ = ["EvollaConfig"]

transformers/models/evolla/modeling_evolla.py CHANGED Viewed

@@ -308,9 +308,9 @@ class EvollaSaProtSelfAttention(nn.Module):
         if self.position_embedding_type == "rotary":
             query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -1147,9 +1147,9 @@ class EvollaAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -1361,7 +1361,11 @@ class EvollaModel(EvollaPreTrainedModel):
                 attention_mask=protein_attention_mask,
             )
             protein_feats = protein_outputs.sequence_compressor_output
-            protein_batch_mask = torch.tensor([True] * protein_input_ids.shape[0], device=protein_input_ids.device)
+            protein_batch_mask = torch.ones(
+                protein_input_ids.shape[0],
+                device=protein_input_ids.device,
+                dtype=torch.bool,
+            )
         causal_mask = create_causal_mask(
             config=self.config,

transformers/models/evolla/modular_evolla.py CHANGED Viewed

@@ -824,7 +824,11 @@ class EvollaModel(EvollaPreTrainedModel):
                 attention_mask=protein_attention_mask,
             )
             protein_feats = protein_outputs.sequence_compressor_output
-            protein_batch_mask = torch.tensor([True] * protein_input_ids.shape[0], device=protein_input_ids.device)
+            protein_batch_mask = torch.ones(
+                protein_input_ids.shape[0],
+                device=protein_input_ids.device,
+                dtype=torch.bool,
+            )
         causal_mask = create_causal_mask(
             config=self.config,

transformers/models/exaone4/configuration_exaone4.py CHANGED Viewed

@@ -67,6 +67,8 @@ class Exaone4Config(PreTrainedConfig):
             Beginning of stream token id.
         eos_token_id (`int`, *optional*, defaults to 2):
             End of stream token id.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
         rope_parameters (`RopeParameters`, *optional*):
@@ -139,6 +141,7 @@ class Exaone4Config(PreTrainedConfig):
         use_cache: bool | None = True,
         bos_token_id: int | None = 0,
         eos_token_id: int | None = 2,
+        pad_token_id: int | None = None,
         tie_word_embeddings: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         attention_dropout: float | None = 0.0,
@@ -161,6 +164,10 @@ class Exaone4Config(PreTrainedConfig):
         self.attention_dropout = attention_dropout
         self.sliding_window = sliding_window
         self.sliding_window_pattern = sliding_window_pattern
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         self.layer_types = layer_types
         if self.sliding_window is None:
@@ -172,15 +179,11 @@ class Exaone4Config(PreTrainedConfig):
                 else "full_attention"
                 for i in range(self.num_hidden_layers)
             ]
-        if "sliding_window" in self.layer_types:
-            self.cache_implementation = "hybrid"
         layer_type_validation(self.layer_types, self.num_hidden_layers)
         self.rope_parameters = rope_parameters
-        super().__init__(
-            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
-        )
+        super().__init__(**kwargs)
 __all__ = ["Exaone4Config"]

transformers/models/exaone4/modeling_exaone4.py CHANGED Viewed

@@ -261,9 +261,9 @@ class Exaone4Attention(nn.Module):
             }
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -454,7 +454,7 @@ class Exaone4Model(Exaone4PreTrainedModel):
 @auto_docstring
 class Exaone4ForCausalLM(Exaone4PreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     def __init__(self, config):

transformers/models/exaone4/modular_exaone4.py CHANGED Viewed

@@ -101,6 +101,8 @@ class Exaone4Config(PreTrainedConfig):
             Beginning of stream token id.
         eos_token_id (`int`, *optional*, defaults to 2):
             End of stream token id.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
         rope_parameters (`RopeParameters`, *optional*):
@@ -173,6 +175,7 @@ class Exaone4Config(PreTrainedConfig):
         use_cache: bool | None = True,
         bos_token_id: int | None = 0,
         eos_token_id: int | None = 2,
+        pad_token_id: int | None = None,
         tie_word_embeddings: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         attention_dropout: float | None = 0.0,
@@ -195,6 +198,10 @@ class Exaone4Config(PreTrainedConfig):
         self.attention_dropout = attention_dropout
         self.sliding_window = sliding_window
         self.sliding_window_pattern = sliding_window_pattern
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         self.layer_types = layer_types
         if self.sliding_window is None:
@@ -206,15 +213,11 @@ class Exaone4Config(PreTrainedConfig):
                 else "full_attention"
                 for i in range(self.num_hidden_layers)
             ]
-        if "sliding_window" in self.layer_types:
-            self.cache_implementation = "hybrid"
         layer_type_validation(self.layer_types, self.num_hidden_layers)
         self.rope_parameters = rope_parameters
-        super().__init__(
-            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
-        )
+        super().__init__(**kwargs)
 class Exaone4RMSNorm(LlamaRMSNorm):
@@ -282,9 +285,9 @@ class Exaone4Attention(nn.Module):
             }
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers/models/exaone_moe/__init__.py ADDED Viewed

@@ -0,0 +1,27 @@
+# Copyright 2026 The LG AI Research and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+if TYPE_CHECKING:
+    from .configuration_exaone_moe import *
+    from .modeling_exaone_moe import *
+else:
+    import sys
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl