PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/models/solar_open/modular_solar_open.py ADDED Viewed

@@ -0,0 +1,224 @@
+# Copyright 2026 Upstage and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch SolarOpen model."""
+from torch import nn
+from ...modeling_rope_utils import RopeParameters
+from ...utils import logging
+from ..glm4_moe.configuration_glm4_moe import Glm4MoeConfig
+from ..glm4_moe.modeling_glm4_moe import (
+    Glm4MoeForCausalLM,
+    Glm4MoeModel,
+    Glm4MoeMoE,
+    Glm4MoePreTrainedModel,
+    Glm4MoeRMSNorm,
+)
+from ..llama.modeling_llama import LlamaAttention, LlamaDecoderLayer
+logger = logging.get_logger(__name__)
+class SolarOpenConfig(Glm4MoeConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`SolarOpenModel`]. It is used to instantiate a
+    SolarOpen model according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+    Instantiating a configuration defaults will yield a similar configuration to that of
+    [upstage/Solar-Open-100B](https://huggingface.co/upstage/Solar-Open-100B) architecture.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 196608):
+            Vocabulary size of the SolarOpen model.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        moe_intermediate_size (`int`, *optional*, defaults to 1280):
+            Intermediate size of the routed expert.
+        num_hidden_layers (`int`, *optional*, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            Number of attention heads for each attention layer.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            Number of key_value heads for Grouped Query Attention.
+        n_shared_experts (`int`, *optional*, defaults to 1):
+            Number of shared experts.
+        n_routed_experts (`int`, *optional*, defaults to 128):
+            Number of routed experts.
+        head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to return the last key/values attentions.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_parameters (`RopeParameters`, *optional*):
+            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
+            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
+            with longer `max_position_embeddings`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the projection layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            Number of experts per token.
+        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor for routed experts.
+        n_group (`int`, *optional*, defaults to 1):
+            Number of groups for routed experts.
+        topk_group (`int`, *optional*, defaults to 1):
+            Number of selected groups for each token.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*):
+            End of stream token id.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+    """
+    model_type = "solar_open"
+    default_theta = 1_000_000.0
+    # Default tensor parallel plan for base model `SolarOpenModel`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
+        "layers.*.mlp.experts.down_proj": "rowwise",
+        "layers.*.mlp.experts": "moe_tp_experts",
+    }
+    def __init__(
+        self,
+        vocab_size: int = 196608,
+        hidden_size: int = 4096,
+        moe_intermediate_size: int = 1280,
+        num_hidden_layers: int = 48,
+        num_attention_heads: int = 64,
+        num_key_value_heads: int = 8,
+        n_shared_experts: int = 1,
+        n_routed_experts: int = 128,
+        head_dim: int = 128,
+        hidden_act: str = "silu",
+        max_position_embeddings: int = 131072,
+        initializer_range: float = 0.02,
+        rms_norm_eps: int = 1e-5,
+        use_cache: bool = True,
+        tie_word_embeddings: bool = False,
+        rope_parameters: RopeParameters | None = None,
+        attention_bias: bool = False,
+        attention_dropout: float = 0.0,
+        num_experts_per_tok: int = 8,
+        routed_scaling_factor: float = 1.0,
+        n_group: int = 1,
+        topk_group: int = 1,
+        norm_topk_prob: bool = True,
+        bos_token_id: int | None = None,
+        eos_token_id: int | None = None,
+        pad_token_id: int | None = None,
+        **kwargs,
+    ):
+        # Default partial_rotary_factor to 1.0 (instead of 0.5 in Glm4MoeConfig).
+        # `setdefault` ensures this value is not overridden by subsequent calls.
+        # This workaround is required due to modular inheritance limitations.
+        kwargs.setdefault("partial_rotary_factor", 1.0)
+        self.head_dim = head_dim
+        super().__init__(
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            moe_hidden_size=moe_intermediate_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            num_key_value_heads=num_key_value_heads,
+            n_shared_experts=n_shared_experts,
+            n_routed_experts=n_routed_experts,
+            head_dim=head_dim,
+            hidden_act=hidden_act,
+            max_position_embeddings=max_position_embeddings,
+            initializer_range=initializer_range,
+            rms_norm_eps=rms_norm_eps,
+            use_cache=use_cache,
+            tie_word_embeddings=tie_word_embeddings,
+            rope_parameters=rope_parameters,
+            attention_bias=attention_bias,
+            attention_dropout=attention_dropout,
+            num_experts_per_tok=num_experts_per_tok,
+            routed_scaling_factor=routed_scaling_factor,
+            n_group=n_group,
+            topk_group=topk_group,
+            norm_topk_prob=norm_topk_prob,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            pad_token_id=pad_token_id,
+            **kwargs,
+        )
+        del self.intermediate_size
+        del self.first_k_dense_replace
+        del self.use_qk_norm
+class SolarOpenDecoderLayer(LlamaDecoderLayer):
+    def __init__(self, config: SolarOpenConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.mlp = SolarOpenMoE(config)
+class SolarOpenMoE(Glm4MoeMoE):
+    pass
+class SolarOpenAttention(LlamaAttention):
+    def __init__(self, config: SolarOpenConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+class SolarOpenRMSNorm(Glm4MoeRMSNorm):
+    pass
+class SolarOpenPreTrainedModel(Glm4MoePreTrainedModel):
+    pass
+class SolarOpenModel(Glm4MoeModel):
+    _keys_to_ignore_on_load_unexpected = []
+class SolarOpenForCausalLM(Glm4MoeForCausalLM):
+    pass
+__all__ = [
+    "SolarOpenConfig",
+    "SolarOpenPreTrainedModel",
+    "SolarOpenModel",
+    "SolarOpenForCausalLM",
+]

transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py CHANGED Viewed

@@ -87,7 +87,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin):
             if not isinstance(config, self.config_class):
                 raise ValueError(f"Config: {config} has to be of type {self.config_class}")
-        if config.decoder.cross_attention_hidden_size is not None:
+        if getattr(config.decoder, "cross_attention_hidden_size", None) is not None:
             if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
                 raise ValueError(
                     "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
@@ -132,7 +132,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin):
         self.encoder_output_dim = getattr(config.encoder, "output_hidden_size", config.encoder.hidden_size)
         if (
             self.encoder_output_dim != self.decoder.config.hidden_size
-            and self.decoder.config.cross_attention_hidden_size is None
+            and getattr(self.decoder.config, "cross_attention_hidden_size", None) is None
         ):
             # encoder outputs might need to be projected to different dimension for decoder
             self.enc_to_dec_proj = nn.Linear(self.encoder.config.hidden_size, self.decoder.config.hidden_size)
@@ -249,7 +249,9 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin):
                     encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
                 )
-                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+                if getattr(encoder_config, "is_decoder", False) or getattr(
+                    encoder_config, "add_cross_attention", False
+                ):
                     logger.info(
                         f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
                         "from a decoder model. Cross-attention and causal mask are disabled."
@@ -423,7 +425,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin):
         # optionally project encoder_hidden_states
         if (
             self.encoder_output_dim != self.decoder.config.hidden_size
-            and self.decoder.config.cross_attention_hidden_size is None
+            and getattr(self.decoder.config, "cross_attention_hidden_size", None) is None
         ):
             encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)

transformers/models/speech_to_text/configuration_speech_to_text.py CHANGED Viewed

@@ -98,6 +98,8 @@ class Speech2TextConfig(PreTrainedConfig):
             features.
         input_channels (`int`, *optional*, defaults to 1):
             An integer specifying number of input channels of the input feature vector.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
     Example:
@@ -149,6 +151,7 @@ class Speech2TextConfig(PreTrainedConfig):
         conv_channels=1024,
         input_feat_per_channel=80,
         input_channels=1,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -185,14 +188,12 @@ class Speech2TextConfig(PreTrainedConfig):
                 f"`config.num_conv_layers = {self.num_conv_layers}`."
             )
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
-            **kwargs,
-        )
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 __all__ = ["Speech2TextConfig"]

transformers/models/speech_to_text/modeling_speech_to_text.py CHANGED Viewed

@@ -293,9 +293,9 @@ class Speech2TextAttention(nn.Module):
                 if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                     past_key_values.is_updated[self.layer_idx] = True
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers/models/speecht5/configuration_speecht5.py CHANGED Viewed

@@ -246,6 +246,7 @@ class SpeechT5Config(PreTrainedConfig):
         guided_attention_loss_scale=10.0,
         use_cache=True,
         is_encoder_decoder=True,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -322,15 +323,13 @@ class SpeechT5Config(PreTrainedConfig):
         self.use_cache = use_cache
         self.is_encoder_decoder = is_encoder_decoder
+        self.tie_word_embeddings = tie_word_embeddings
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
-            **kwargs,
-        )
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
     def inputs_to_logits_ratio(self):
         return functools.reduce(operator.mul, self.conv_stride, 1)

transformers/models/splinter/configuration_splinter.py CHANGED Viewed

@@ -59,9 +59,6 @@ class SplinterConfig(PreTrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
         question_token_id (`int`, *optional*, defaults to 104):
             The id of the `[QUESTION]` token.
@@ -96,13 +93,17 @@ class SplinterConfig(PreTrainedConfig):
         type_vocab_size=2,
         initializer_range=0.02,
         layer_norm_eps=1e-12,
-        use_cache=True,
         pad_token_id=0,
+        bos_token_id=None,
+        eos_token_id=None,
         question_token_id=104,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -115,7 +116,6 @@ class SplinterConfig(PreTrainedConfig):
         self.initializer_range = initializer_range
         self.type_vocab_size = type_vocab_size
         self.layer_norm_eps = layer_norm_eps
-        self.use_cache = use_cache
         self.question_token_id = question_token_id

transformers/models/splinter/modeling_splinter.py CHANGED Viewed

@@ -30,6 +30,7 @@ from ...utils import (
     auto_docstring,
     can_return_tuple,
     logging,
+    torch_compilable_check,
 )
 from .configuration_splinter import SplinterConfig
@@ -148,9 +149,9 @@ class SplinterSelfAttention(nn.Module):
         key_states = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
         value_states = self.value(hidden_states).view(hidden_shape).transpose(1, 2)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -783,6 +784,10 @@ class SplinterForPreTraining(SplinterPreTrainedModel):
             dtype=torch.long,
             device=input_ids.device,
         )
+        torch_compilable_check(
+            num_questions.size(0) == input_ids.size(0),
+            "All samples in the batch must have at least one question token.",
+        )
         cols = torch.cat([torch.arange(n) for n in num_questions])
         positions[rows, cols] = flat_positions
         return positions

transformers/models/squeezebert/configuration_squeezebert.py CHANGED Viewed

@@ -61,6 +61,10 @@ class SqueezeBertConfig(PreTrainedConfig):
         pad_token_id (`int`, *optional*, defaults to 0):
             The ID of the token in the word embedding to use as padding.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*):
+            End of stream token id.
         embedding_size (`int`, *optional*, defaults to 768):
             The dimension of the word embedding vectors.
@@ -76,6 +80,8 @@ class SqueezeBertConfig(PreTrainedConfig):
             The number of groups in the second feed forward network layer.
         output_groups (`int`, *optional*, defaults to 4):
             The number of groups in the third feed forward network layer.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
     Examples:
@@ -110,6 +116,8 @@ class SqueezeBertConfig(PreTrainedConfig):
         initializer_range=0.02,
         layer_norm_eps=1e-12,
         pad_token_id=0,
+        bos_token_id=None,
+        eos_token_id=None,
         embedding_size=768,
         q_groups=4,
         k_groups=4,
@@ -117,10 +125,15 @@ class SqueezeBertConfig(PreTrainedConfig):
         post_attention_groups=1,
         intermediate_groups=4,
         output_groups=4,
+        tie_word_embeddings=True,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers

transformers/models/stablelm/configuration_stablelm.py CHANGED Viewed

@@ -87,6 +87,8 @@ class StableLmConfig(PreTrainedConfig):
             The id of the `BOS` token in the vocabulary.
         eos_token_id (int, *optional*, defaults to 0):
             The id of the `EOS` token in the vocabulary.
+        pad_token_id (int, *optional*):
+            The id of the `PAD` token in the vocabulary.
     Example:
@@ -122,6 +124,7 @@ class StableLmConfig(PreTrainedConfig):
         attention_dropout: float | None = 0.0,
         bos_token_id: int | None = 0,
         eos_token_id: int | None = 0,
+        pad_token_id: int | None = None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -145,12 +148,11 @@ class StableLmConfig(PreTrainedConfig):
         self.rope_parameters = rope_parameters
         kwargs.setdefault("partial_rotary_factor", 0.25)  # assign default for BC
-        super().__init__(
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(**kwargs)
 __all__ = ["StableLmConfig"]

transformers/models/stablelm/modeling_stablelm.py CHANGED Viewed

@@ -44,7 +44,7 @@ from ...modeling_rope_utils import (
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
-from ...utils.generic import maybe_autocast
+from ...utils.generic import is_flash_attention_requested, maybe_autocast
 from .configuration_stablelm import StableLmConfig
@@ -327,9 +327,9 @@ class StableLmAttention(nn.Module):
             }
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -575,7 +575,7 @@ class StableLmModel(StableLmPreTrainedModel):
         past_key_values: Cache,
         output_attentions: bool = False,
     ):
-        if self.config._attn_implementation == "flash_attention_2":
+        if is_flash_attention_requested(self.config):
             if attention_mask is not None and (attention_mask == 0.0).any():
                 return attention_mask
             return None

transformers/models/starcoder2/configuration_starcoder2.py CHANGED Viewed

@@ -67,6 +67,8 @@ class Starcoder2Config(PreTrainedConfig):
             The id of the "beginning-of-sequence" token.
         eos_token_id (`int`, *optional*, defaults to 50256):
             The id of the "end-of-sequence" token.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
         rope_parameters (`RopeParameters`, *optional*):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
@@ -81,6 +83,8 @@ class Starcoder2Config(PreTrainedConfig):
             Embedding dropout.
         use_bias (`bool`, *optional*, defaults to `True`):
             Whether to use bias term on linear layers of the model.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
     ```python
@@ -128,12 +132,14 @@ class Starcoder2Config(PreTrainedConfig):
         use_cache: bool | None = True,
         bos_token_id: int | None = 50256,
         eos_token_id: int | None = 50256,
+        pad_token_id: int | None = None,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
         sliding_window: int | None = None,
         attention_dropout: float | None = 0.0,
         residual_dropout: float | None = 0.0,
         embedding_dropout: float | None = 0.0,
         use_bias: bool | None = True,
+        tie_word_embeddings: bool | None = True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -154,11 +160,11 @@ class Starcoder2Config(PreTrainedConfig):
         self.embedding_dropout = embedding_dropout
         self.rope_parameters = rope_parameters
-        super().__init__(
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            **kwargs,
-        )
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(**kwargs)
 __all__ = ["Starcoder2Config"]

transformers/models/starcoder2/modeling_starcoder2.py CHANGED Viewed

@@ -182,9 +182,9 @@ class Starcoder2Attention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -364,7 +364,7 @@ class Starcoder2Model(Starcoder2PreTrainedModel):
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> BaseModelOutputWithPast:
+    ) -> tuple | BaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
@@ -422,7 +422,7 @@ class Starcoder2Model(Starcoder2PreTrainedModel):
 @auto_docstring
 class Starcoder2ForCausalLM(Starcoder2PreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     def __init__(self, config):

transformers/models/starcoder2/modular_starcoder2.py CHANGED Viewed

@@ -99,9 +99,9 @@ class Starcoder2Attention(MistralAttention):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -153,7 +153,7 @@ class Starcoder2Model(MistralModel):
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> BaseModelOutputWithPast:
+    ) -> tuple | BaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

transformers/models/superglue/configuration_superglue.py CHANGED Viewed

@@ -51,6 +51,9 @@ class SuperGlueConfig(PreTrainedConfig):
             The matching threshold.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on
+            decoder-only or encoder-only architectures.
     Examples:
         ```python
@@ -80,6 +83,7 @@ class SuperGlueConfig(PreTrainedConfig):
         sinkhorn_iterations: int = 100,
         matching_threshold: float = 0.0,
         initializer_range: float = 0.02,
+        is_decoder=False,
         **kwargs,
     ):
         self.gnn_layers_types = gnn_layers_types if gnn_layers_types is not None else ["self", "cross"] * 9

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl