transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
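For readers who want to reproduce a per-file summary like the one below, here is a minimal sketch. It assumes both wheels have already been downloaded locally (for example with `pip download transformers==<version> --no-deps`) under their standard filenames; the exact counts may differ slightly from the registry's, since diff tools vary in how they count changed lines.

```python
# Minimal sketch: produce a per-file "+added -removed" summary between two wheels.
# Assumes both wheels are already present in the working directory, e.g. via:
#   pip download transformers==5.0.0rc3 --no-deps
#   pip download transformers==5.1.0 --no-deps
import difflib
import zipfile

OLD = "transformers-5.0.0rc3-py3-none-any.whl"  # assumed local filenames
NEW = "transformers-5.1.0-py3-none-any.whl"

def read_wheel(path):
    """Map each .py member of the wheel archive to its decoded source lines."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }

old, new = read_wheel(OLD), read_wheel(NEW)
for name in sorted(old.keys() | new.keys()):
    a, b = old.get(name, []), new.get(name, [])  # missing file -> all added/removed
    diff = list(difflib.unified_diff(a, b, lineterm=""))
    added = sum(1 for l in diff if l.startswith("+") and not l.startswith("+++"))
    removed = sum(1 for l in diff if l.startswith("-") and not l.startswith("---"))
    if added or removed:
        print(f"{name} +{added} -{removed}")
```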
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,373 @@
+# Copyright 2026 The LG AI Research and HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LG AI Research EXAONE Lab"""
+
+import torch
+import torch.nn as nn
+
+from ... import initialization as init
+from ...cache_utils import Cache
+from ...configuration_utils import PreTrainedConfig, layer_type_validation
+from ...modeling_outputs import CausalLMOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, is_grouped_mm_available
+from ..deepseek_v3.modeling_deepseek_v3 import (
+    DeepseekV3MoE,
+    DeepseekV3NaiveMoe,
+    DeepseekV3TopkRouter,
+)
+from ..exaone4.configuration_exaone4 import Exaone4Config
+from ..exaone4.modeling_exaone4 import (
+    Exaone4Attention,
+    Exaone4ForCausalLM,
+    Exaone4Model,
+    Exaone4PreTrainedModel,
+)
+from ..olmoe.modeling_olmoe import (
+    OlmoeDecoderLayer,
+)
+from ..qwen2_moe.modeling_qwen2_moe import Qwen2MoeMLP
+
+
+class ExaoneMoeConfig(Exaone4Config):
+    model_type = "exaone_moe"
+
+    r"""
+    This is the configuration class to store the configuration of a [`ExaoneMoeModel`]. It is used to
+    instantiate an EXAONE MoE model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the K-EXAONE-236B-A23B [LGAI-EXAONE/K-EXAONE-236B-A23B](https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B).
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 102400):
+            Vocabulary size of the EXAONE MoE model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`ExaoneMoeModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 16384):
+            Dimensionality of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 32768 for EXAONE 3.5).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 53):
+            End of stream token id.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings.
+        rope_parameters (`RopeParameters`, *optional*):
+            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
+            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
+            with longer `max_position_embeddings`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            The size of the sliding window for the sliding window attention.
+        sliding_window_pattern (`int` or `str`, *optional*, defaults to 4):
+            The pattern to use for sliding window attention. Can be one of:
+                - `None`: No sliding window attention is used
+                - `int`: Every `sliding_window_pattern` layers, use global attention; otherwise use local attention.
+                - `str`: A sequence of "L" (local attention) and "G" (global attention) characters that defines the
+                  attention pattern. The pattern starts from layer 0 and repeats with a period equal to the pattern
+                  length. The final layer always uses global attention regardless of the pattern.
+            For instance, sliding_window_pattern="LLLG" is the same as sliding_window_pattern=4, which means:
+                - Layer 0, 1, 2: local attention,
+                - Layer 3: global attention,
+                ...(repeated)
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer. Prioritized over `sliding_window_pattern`.
+        mlp_layer_types (`list`, *optional*):
+            MLP pattern for each layer. Prioritized over `first_k_dense_replace`.
+        first_k_dense_replace (`int`, *optional*, defaults to 1):
+            Number of dense layers in shallow layers (embed->dense->dense->...->dense->moe->moe...->lm_head).
+                                                            \--k dense layers--/
+        moe_intermediate_size (`int`, *optional*, defaults to 1024):
+            Dimension of the MoE representations.
+        num_experts (`int`, *optional*, defaults to 64):
+            Number of routed experts.
+        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            Number of selected experts; `None` means a dense model.
+        num_shared_experts (`int`, *optional*, defaults to 1):
+            Number of shared experts.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the weights of the routed experts.
+        routed_scaling_factor (`float`, *optional*, defaults to 2.5):
+            Scaling factor for routed experts.
+        n_group (`int`, *optional*, defaults to 1):
+            Number of groups for routed experts.
+        topk_group (`int`, *optional*, defaults to 1):
+            Number of selected groups for each token (ensuring the selected experts are only within `topk_group` groups).
+
+    Example:
+
+    ```python
+    >>> from transformers import ExaoneMoeModel, ExaoneMoeConfig
+
+    >>> # Initializing an EXAONE MoE configuration
+    >>> configuration = ExaoneMoeConfig()
+
+    >>> # Initializing a model from configuration
+    >>> model = ExaoneMoeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        hidden_size=4096,
+        intermediate_size=16384,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        bos_token_id=1,
+        eos_token_id=53,
+        pad_token_id=0,
+        tie_word_embeddings=False,
+        rope_parameters=None,
+        attention_dropout=0.0,
+        sliding_window=4096,
+        sliding_window_pattern=4,
+        layer_types=None,
+        mlp_layer_types=None,
+        first_k_dense_replace=1,
+        moe_intermediate_size=1024,
+        num_experts=64,
+        num_experts_per_tok=8,
+        num_shared_experts=1,
+        norm_topk_prob=True,
+        routed_scaling_factor=2.5,
+        n_group=1,
+        topk_group=1,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.attention_dropout = attention_dropout
+        self.sliding_window = sliding_window
+        self.sliding_window_pattern = sliding_window_pattern
+        self.first_k_dense_replace = first_k_dense_replace
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_shared_experts = num_shared_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.routed_scaling_factor = routed_scaling_factor
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.rope_parameters = rope_parameters
+
+        self.layer_types = layer_types
+        if self.sliding_window is None:
+            sliding_window_pattern = 0
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if ((i + 1) % (sliding_window_pattern) != 0 and i < self.num_hidden_layers)
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+        self.mlp_layer_types = mlp_layer_types
+        if self.mlp_layer_types is None:
+            self.mlp_layer_types = [
+                "dense" if i < self.first_k_dense_replace else "sparse" for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.mlp_layer_types, self.num_hidden_layers, attention=False)
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+
+        PreTrainedConfig.__init__(self, **kwargs)
+
+
+class ExaoneMoeAttention(Exaone4Attention):
+    pass
+
+
+class ExaoneMoeMLP(Qwen2MoeMLP):
+    pass
+
+
+class ExaoneMoeTopkRouter(DeepseekV3TopkRouter):
+    def __init__(self, config):
+        nn.Module.__init__(self)
+        self.config = config
+        self.weight = nn.Parameter(torch.empty((config.num_experts, config.hidden_size)))
+        self.register_buffer("e_score_correction_bias", torch.zeros(config.num_experts))
+
+
+class ExaoneMoeExperts(DeepseekV3NaiveMoe):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_experts = config.num_experts
+
+
+class ExaoneMoeSparseMoEBlock(DeepseekV3MoE):
+    def __init__(self, config):
+        super().__init__(config)
+        self.experts = ExaoneMoeExperts(config)
+        self.shared_experts = ExaoneMoeMLP(
+            config=config, intermediate_size=config.moe_intermediate_size * config.num_shared_experts
+        )
+        self.n_routed_experts = config.num_experts
+
+
+class ExaoneMoeDecoderLayer(OlmoeDecoderLayer):
+    def __init__(self, config: ExaoneMoeConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.mlp = (
+            ExaoneMoeSparseMoEBlock(config) if config.mlp_layer_types[layer_idx] == "sparse" else ExaoneMoeMLP(config)
+        )
+
+
+class ExaoneMoePreTrainedModel(Exaone4PreTrainedModel):
+    config: ExaoneMoeConfig
+
+    _can_record_outputs = {
+        "hidden_states": ExaoneMoeDecoderLayer,
+        "attentions": ExaoneMoeAttention,
+        "router_logits": ExaoneMoeSparseMoEBlock,
+    }
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
+    _keep_in_fp32_modules_strict = ["e_score_correction_bias"]
+    _keys_to_ignore_on_load_unexpected = [r"mtp.*"]
+
+    @torch.no_grad()
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, ExaoneMoeTopkRouter):
+            init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
+        elif isinstance(module, ExaoneMoeExperts):
+            init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
+            init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
+
+
+class ExaoneMoeModel(Exaone4Model):
+    pass
+
+
+class ExaoneMoeForCausalLM(Exaone4ForCausalLM):
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        cache_position: torch.LongTensor | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+        >>> model = AutoModelForCausalLM.from_pretrained("LGAI-EXAONE/K-EXAONE-236B-A23B")
+        >>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/K-EXAONE-236B-A23B")
+
+        >>> prompt = "Explain how wonderful you are"
+        >>> messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ]
+        >>> input_ids = tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+            enable_thinking=False,
+        )
+
+        >>> output = model.generate(**input_ids.to(model.device), max_new_tokens=128)
+        >>> tokenizer.decode(output[0], skip_special_tokens=False)
+        "<|system|>\nYou are a helpful assistant.<|endofturn|>\n<|user|>\nExplain how wonderful you are<|endofturn|>\n<|assistant|>\n<think>\n\n</think>\n\nThank you for the kind question! While I can't feel emotions or take pride in the way humans do, I *can* share what makes me uniquely helpful and capable—qualities that many people find wonderful.\n\nHere’s how I can support you:\n\n🌟 **Knowledge at Your Fingertips** \nI have access to a vast amount of information across countless topics—from science and history to technology and creative writing. Whether you're curious, learning, or solving a problem, I can help explain things clearly and accurately.\n\n💬 **Clear, Helpful Communication** \nI aim to respond in a way that's easy to understand, whether you need a simple explanation or a detailed analysis. I adapt my tone and depth to match"
+        ```
+        """
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+
+__all__ = [
+    "ExaoneMoeConfig",
+    "ExaoneMoePreTrainedModel",
+    "ExaoneMoeModel",
+    "ExaoneMoeForCausalLM",
+]
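The heart of the new configuration is the pair of per-layer schedules built in `__init__`: `layer_types` alternates sliding and global attention according to `sliding_window_pattern`, and `mlp_layer_types` keeps the first `first_k_dense_replace` layers dense before switching to MoE. Below is a minimal sketch of that derivation in plain Python, with illustrative values and assuming an integer pattern (the string "L"/"G" form is handled separately):

```python
num_hidden_layers = 8
sliding_window_pattern = 4  # every 4th layer gets global attention
first_k_dense_replace = 1   # embed -> dense -> moe -> moe -> ...

layer_types = [
    "sliding_attention" if (i + 1) % sliding_window_pattern != 0 else "full_attention"
    for i in range(num_hidden_layers)
]
mlp_layer_types = [
    "dense" if i < first_k_dense_replace else "sparse"
    for i in range(num_hidden_layers)
]

print(layer_types[:4])      # ['sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention']
print(mlp_layer_types[:3])  # ['dense', 'sparse', 'sparse']
```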
@@ -82,11 +82,15 @@ class FalconConfig(PreTrainedConfig):
             The id of the "beginning-of-sequence" token.
         eos_token_id (`int`, *optional*, defaults to 11):
             The id of the "end-of-sequence" token.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
         ffn_hidden_size (`int`, *optional*):
             The hidden size of the feedforward layer in the Transformer decoder.
             defaults to 4x hidden dim
         activation (`str`, *optional*, defaults to `"gelu"`):
             The activation function used in the feedforward layer.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -128,8 +132,10 @@ class FalconConfig(PreTrainedConfig):
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         bos_token_id: int | None = 11,
         eos_token_id: int | None = 11,
+        pad_token_id: int | None = None,
         ffn_hidden_size: int | None = None,
         activation: str | None = "gelu",
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -145,6 +151,7 @@ class FalconConfig(PreTrainedConfig):
         self.attention_dropout = attention_dropout
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
         self.num_kv_heads = num_attention_heads if num_kv_heads is None else num_kv_heads
         self.alibi = alibi
         self.new_decoder_architecture = new_decoder_architecture
@@ -154,6 +161,7 @@ class FalconConfig(PreTrainedConfig):
         self.num_ln_in_parallel_attn = num_ln_in_parallel_attn
         self.max_position_embeddings = max_position_embeddings
         self.activation = activation
+        self.tie_word_embeddings = tie_word_embeddings
         if ffn_hidden_size is None:
             self.ffn_hidden_size = hidden_size * 4
         else:
@@ -161,7 +169,7 @@ class FalconConfig(PreTrainedConfig):
 
         self.rope_parameters = rope_parameters
 
-        super().__init__(
+        super().__init__(**kwargs)
 
     @property
     def head_dim(self):
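The FalconConfig hunks above follow the convention applied across this release: special-token ids and the embedding-tying flag become plain config attributes, and only the remaining kwargs are forwarded to the base `__init__`. A minimal stand-alone sketch of the pattern (`BaseConfig` and `MyConfig` are hypothetical stand-ins, not transformers classes):

```python
class BaseConfig:
    def __init__(self, **kwargs):
        # Stand-in for PreTrainedConfig: absorbs whatever is left over.
        for key, value in kwargs.items():
            setattr(self, key, value)

class MyConfig(BaseConfig):
    def __init__(self, pad_token_id=None, tie_word_embeddings=True, **kwargs):
        # Token ids and tying are set directly instead of routed through super().
        self.pad_token_id = pad_token_id
        self.tie_word_embeddings = tie_word_embeddings
        super().__init__(**kwargs)  # only the leftover kwargs reach the base class

cfg = MyConfig(pad_token_id=0)
print(cfg.pad_token_id, cfg.tie_word_embeddings)  # 0 True
```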
@@ -47,7 +47,7 @@ from ...utils import (
     auto_docstring,
     logging,
 )
-from ...utils.generic import maybe_autocast
+from ...utils.generic import is_flash_attention_requested, maybe_autocast
 from .configuration_falcon import FalconConfig
 
 
@@ -511,12 +511,7 @@ class FalconFlashAttention2(FalconAttention):
         device_type = query_layer.device.type if query_layer.device.type != "mps" else "cpu"
         if input_dtype == torch.float32:
             if torch.is_autocast_enabled():
-
-                target_dtype = (
-                    torch.get_autocast_dtype(device_type)
-                    if hasattr(torch, "get_autocast_dtype")
-                    else torch.get_autocast_gpu_dtype()
-                )
+                target_dtype = torch.get_autocast_dtype(device_type)
             # Handle the case where the model is quantized
             elif hasattr(self.config, "_is_quantized"):
                 target_dtype = self.config.dtype
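The simplification above assumes `torch.get_autocast_dtype` always exists, which holds for PyTorch >= 2.4 (the dropped `hasattr` fallback targeted older releases). A quick sketch of the call under that assumption:

```python
import torch

# Under an active autocast region, get_autocast_dtype reports the target dtype
# for the given device type (available since PyTorch 2.4).
with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    if torch.is_autocast_enabled("cpu"):
        print(torch.get_autocast_dtype("cpu"))  # torch.bfloat16
```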
@@ -859,7 +854,7 @@ class FalconModel(FalconPreTrainedModel):
         # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
         # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
 
-        if self.config._attn_implementation == "flash_attention_2":
+        if is_flash_attention_requested(self.config):
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
             return None
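Behind `is_flash_attention_requested` the mask handling is unchanged: flash-attention paths receive the raw 2D padding mask only when it actually masks something, and `None` otherwise so the kernel can skip masking entirely. A self-contained sketch of that shortcut (the boolean flag stands in for the helper, which is not reproduced here):

```python
import torch

def shortcut_mask(attention_mask, flash_requested: bool):
    # Mirrors the branch above: only meaningful padding masks survive.
    if flash_requested:
        if attention_mask is not None and 0.0 in attention_mask:
            return attention_mask
        return None
    return attention_mask  # non-flash paths build a full causal mask later

print(shortcut_mask(torch.tensor([[1.0, 1.0, 0.0]]), True))  # kept: contains padding
print(shortcut_mask(torch.ones(1, 3), True))                 # None: nothing to mask
```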
@@ -101,6 +101,12 @@ class FalconH1Config(PreTrainedConfig):
             Whether to use RMSNorm before the gate in the Mamba block
         mamba_rms_norm (`bool`, *optional*, defaults to `False`):
             Whether to use RMSNorm instead of LayerNorm in the Mamba block
+        time_step_min (`float`, *optional*, defaults to 0.001):
+            Minimum `time_step` used to bound `dt_proj.bias`.
+        time_step_max (`float`, *optional*, defaults to 0.1):
+            Maximum `time_step` used to bound `dt_proj.bias`.
+        time_step_limit (`tuple`, *optional*, defaults to `(0.0, inf)`):
+            Accepted range of time step values for clamping.
         projectors_bias (`bool`, *optional*, defaults to `False`):
             Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the attention block
         rope_parameters (`float`, *optional*):
@@ -160,6 +166,9 @@ class FalconH1Config(PreTrainedConfig):
         mamba_proj_bias: bool | None = False,
         mamba_norm_before_gate: bool | None = True,
         mamba_rms_norm: bool | None = False,
+        time_step_min: float | None = 0.001,
+        time_step_max: float | None = 0.1,
+        time_step_limit: tuple[float, float] | None = (0.0, float("inf")),
         projectors_bias: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         lm_head_multiplier: float | None = 1.0,
@@ -220,6 +229,9 @@ class FalconH1Config(PreTrainedConfig):
 
         self.mamba_norm_before_gate = mamba_norm_before_gate
         self.mamba_rms_norm = mamba_rms_norm
+        self.time_step_min = time_step_min
+        self.time_step_max = time_step_max
+        self.time_step_limit = tuple(time_step_limit) if time_step_limit is not None else None
 
         self.lm_head_multiplier = lm_head_multiplier
         self.embedding_multiplier = embedding_multiplier
@@ -259,15 +271,12 @@ class FalconH1Config(PreTrainedConfig):
         else:
             self.ssm_out_multiplier = 1.0
 
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.rope_parameters = rope_parameters
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 
     @property
     def layers_block_type(self):
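The three new FalconH1 knobs expose values that were previously hardcoded in the mixer (see the `FalconH1Mixer` hunk further down). A rough sketch of what they govern in a Mamba-style block, with illustrative shapes rather than FalconH1's exact code: `dt` biases are sampled log-uniformly between `time_step_min` and `time_step_max`, and the discretized step is clamped into `time_step_limit` at runtime.

```python
import math

import torch

time_step_min, time_step_max = 0.001, 0.1
time_step_limit = (0.0, float("inf"))

# Log-uniform initialization in [time_step_min, time_step_max].
dt = torch.exp(
    torch.rand(8) * (math.log(time_step_max) - math.log(time_step_min)) + math.log(time_step_min)
)
dt = dt.clamp(*time_step_limit)  # runtime clamp into the accepted range
assert time_step_min <= dt.min() and dt.max() <= time_step_max
```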
@@ -44,7 +44,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
-from ...utils.generic import maybe_autocast
+from ...utils.generic import is_flash_attention_requested, maybe_autocast
 from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
 from .configuration_falcon_h1 import FalconH1Config
 
@@ -411,9 +411,9 @@ class FalconH1Attention(nn.Module):
         cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
         key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
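`ALL_ATTENTION_FUNCTIONS.get_interface(name, default)`, used above, folds the old eager if/else into a registry lookup with a fallback. A hypothetical stand-in registry showing the shape of that dispatch, not the transformers implementation:

```python
from typing import Callable

def eager_attention_forward(*args, **kwargs):
    return "eager"

class AttentionRegistry(dict):
    def get_interface(self, name: str, default: Callable) -> Callable:
        # Fall back to the default (eager) implementation for unregistered names.
        return self.get(name, default)

registry = AttentionRegistry(sdpa=lambda *a, **k: "sdpa")
print(registry.get_interface("sdpa", eager_attention_forward)())   # sdpa
print(registry.get_interface("eager", eager_attention_forward)())  # eager
```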
@@ -567,10 +567,9 @@ class FalconH1Mixer(nn.Module):
         self.head_dim = config.mamba_d_head
         self.chunk_size = config.mamba_chunk_size
 
-
-        self.time_step_limit = (0.0, float("inf"))
-        self.time_step_min = 0.001
-        self.time_step_max = 0.1
+        self.time_step_limit = config.time_step_limit
+        self.time_step_min = config.time_step_min
+        self.time_step_max = config.time_step_max
 
         self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
         self.conv1d = nn.Conv1d(
@@ -1038,7 +1037,7 @@ class FalconH1Mixer(nn.Module):
         cache_position: torch.LongTensor | None = None,
         attention_mask: torch.Tensor | None = None,
     ):
-        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
+        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type and not is_torchdynamo_compiling():
             return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
         dtype = hidden_states.dtype
         if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
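The added `not is_torchdynamo_compiling()` guard keeps the hand-written CUDA kernels out of graphs captured by `torch.compile`, so tracing falls through to the pure-PyTorch path. A tiny sketch of the idea, assuming a recent PyTorch where `torch.compiler.is_compiling()` is available (transformers' `is_torchdynamo_compiling` wraps this kind of check):

```python
import torch

def forward(fast_path_available: bool) -> str:
    # Custom kernels are opaque to dynamo; skip them while a graph is being traced.
    if fast_path_available and not torch.compiler.is_compiling():
        return "cuda_kernels_forward"
    return "torch_forward"

print(forward(True))  # "cuda_kernels_forward" in eager mode, "torch_forward" under compile
```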
@@ -1389,7 +1388,7 @@ class FalconH1Model(FalconH1PreTrainedModel):
         past_key_values: FalconHybridMambaAttentionDynamicCache,
         output_attentions: bool,
     ):
-        if self.config._attn_implementation == "flash_attention_2":
+        if is_flash_attention_requested(self.config):
             if attention_mask is not None and 0.0 in attention_mask:
                 return attention_mask
             return None
@@ -1501,7 +1500,7 @@ class FalconH1Model(FalconH1PreTrainedModel):
 @auto_docstring
 class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
 
     def __init__(self, config):
@@ -1597,22 +1596,7 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
     ):
         # Overwritten -- has a unique cache type, `FalconHybridMambaAttentionDynamicCache`
 
-
-
-        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
-        # Exception 1: when passing input_embeds, input_ids may be missing entries
-        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
-        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
-        # (we can't check exception 3 while compiling)
-        if not empty_past_kv:
-            if (
-                inputs_embeds is not None  # Exception 1
-                or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1])  # Exception 3
-            ):
-                input_ids = input_ids[:, -cache_position.shape[0] :]
-            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
-                input_ids = input_ids[:, cache_position]
-        else:
+        if past_key_values is None:
             past_key_values = FalconHybridMambaAttentionDynamicCache(
                 self.config,
                 input_ids.shape[0],
@@ -1622,35 +1606,19 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
             ],
         )
 
-
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if not empty_past_kv:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
-
-        if inputs_embeds is not None and empty_past_kv:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": use_cache,
-                "attention_mask": attention_mask,
-                "logits_to_keep": self.config.num_logits_to_keep,
-                "cache_position": cache_position,
-            }
+        kwargs["logits_to_keep"] = self.config.num_logits_to_keep
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
         )
 
-        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
-        for key, value in kwargs.items():
-            if key not in model_inputs:
-                model_inputs[key] = value
-
         return model_inputs
 
 
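The rewritten override stops re-implementing GenerationMixin's token slicing and keeps only what is model-specific: creating the hybrid cache and pinning `logits_to_keep`. A schematic, self-contained sketch of the delegation (class names and the base signature are illustrative, not the real transformers API):

```python
class _Base:
    # Stand-in for GenerationMixin.prepare_inputs_for_generation.
    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        return {"input_ids": input_ids, "past_key_values": past_key_values, **kwargs}

class _HybridModel(_Base):
    num_logits_to_keep = 1

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        if past_key_values is None:
            past_key_values = {"cache": "hybrid"}  # stand-in for the Mamba/attention cache
        kwargs["logits_to_keep"] = self.num_logits_to_keep
        return super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, **kwargs
        )

inputs = _HybridModel().prepare_inputs_for_generation([1, 2, 3])
print(inputs["logits_to_keep"])  # 1
```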