transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -25,7 +25,7 @@ from ... import initialization as init
 from ...cache_utils import Cache
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, logging
+from ...utils import TransformersKwargs, is_grouped_mm_available, logging
 from ..hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1RotaryEmbedding
 from ..llama.modeling_llama import (
     LlamaAttention,
@@ -177,7 +177,9 @@ class HunYuanMoEV1DecoderLayer(LlamaDecoderLayer):


 class HunYuanMoEV1PreTrainedModel(LlamaPreTrainedModel):
-    _can_compile_fullgraph =
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile

     @torch.no_grad()
     def _init_weights(self, module):
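The two HunYuan MoE hunks above gate full-graph compilation on the newly imported `is_grouped_mm_available()` helper instead of a hardcoded value, a pattern that, judging by the file list, recurs across the MoE models touched in this release. A minimal sketch of the same gating, using the helper added in this hunk but a hypothetical class name:

```python
# Hedged sketch: full-graph torch.compile support is advertised only when
# grouped-matmul kernels are available. ToyMoEPreTrainedModel is a placeholder,
# not a class from the diff; is_grouped_mm_available comes from the import above.
from transformers.utils import is_grouped_mm_available


class ToyMoEPreTrainedModel:
    # evaluated once at class-definition time, as in the hunk above
    _can_compile_fullgraph = is_grouped_mm_available()


print(ToyMoEPreTrainedModel._can_compile_fullgraph)  # True only if grouped MM kernels are usable
```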
@@ -593,16 +593,32 @@ class IBertPreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "weight_integer", None) is not None:
+                init.zeros_(module.weight_integer)
+                init.zeros_(module.fc_scaling_factor)
+            if getattr(module, "bias_integer", None) is not None:
+                init.zeros_(module.bias_integer)
         elif isinstance(module, (QuantEmbedding, nn.Embedding)):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+            if getattr(module, "weight_scaling_factor", None) is not None:
+                init.zeros_(module.weight_scaling_factor)
+                init.zeros_(module.weight_integer)
         elif isinstance(module, (IntLayerNorm, nn.LayerNorm)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "shift", None) is not None:
+                init.zeros_(module.shift)
         elif isinstance(module, IBertLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, IBertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, QuantAct):
+            init.constant_(module.x_min, -1e-5)
+            init.constant_(module.x_max, 1e-5)
+            init.zeros_(module.act_scaling_factor)

     def resize_token_embeddings(self, new_num_tokens=None):
         raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")
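The I-BERT hunk above, like the Idefics, ImageGPT, and InstructBlip hunks that follow, extends `_init_weights` so that registered buffers (position ids, quantization scaling factors, cached masks) are filled through the `transformers.initialization` helpers (imported as `init` in these files) rather than only inside `__init__`. A minimal, self-contained sketch of that pattern in plain PyTorch, with placeholder names that are not part of the diff:

```python
# Hedged sketch of the buffer-initialization pattern, assuming nothing beyond torch.
import torch
from torch import nn


class ToyEmbeddings(nn.Module):
    def __init__(self, max_position_embeddings: int = 16, hidden_size: int = 8):
        super().__init__()
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        # registered empty in __init__; filled later by the init hook
        self.register_buffer(
            "position_ids", torch.zeros(1, max_position_embeddings, dtype=torch.long), persistent=False
        )


def init_toy_buffers(module: nn.Module) -> None:
    # mirrors init.copy_(module.position_ids, torch.arange(...).expand((1, -1))) from the hunk
    if isinstance(module, ToyEmbeddings):
        with torch.no_grad():
            module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


model = ToyEmbeddings()
model.apply(init_toy_buffers)
print(model.position_ids[0, :5].tolist())  # [0, 1, 2, 3, 4]
```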
@@ -840,6 +840,7 @@ class IdeficsPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, IdeficsVisionEmbeddings):
             init.normal_(module.class_embedding)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, IdeficsGatedCrossAttentionLayer):
             if self.config.alpha_initializer == "zeros":
                 init.zeros_(module.alpha_cross_attn)
@@ -852,6 +853,15 @@ class IdeficsPreTrainedModel(PreTrainedModel):
             init.normal_(module.alpha_dense, mean=0.0, std=self.config.alphas_initializer_range)
         elif isinstance(module, IdeficsPerceiverResampler):
             init.normal_(module.latents)
+        elif isinstance(module, IdeficsEmbedding):
+            inv_freq = 1.0 / (module.base ** (torch.arange(0, module.dim, 2) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+            t = torch.arange(module.max_position_embeddings).type_as(inv_freq)
+            freqs = torch.einsum("i,j->ij", t, inv_freq)
+            # Different from paper, but it uses a different permutation in order to obtain the same calculation
+            emb = torch.cat((freqs, freqs), dim=-1)
+            init.copy_(module.cos_cached, emb.cos())
+            init.copy_(module.sin_cached, emb.sin())


 @auto_docstring
@@ -452,6 +452,8 @@ class Idefics2VisionTransformer(Idefics2PreTrainedModel):
         self.encoder = Idefics2Encoder(config)
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

+        self.post_init()
+
     def get_input_embeddings(self):
         return self.embeddings

@@ -711,6 +713,8 @@ class Idefics2PerceiverResampler(Idefics2PreTrainedModel):
         self.layers = nn.ModuleList([Idefics2PerceiverLayer(config, idx) for idx in range(self.depth)])
         self.norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)

+        self.post_init()
+
     @auto_docstring
     def forward(
         self,
@@ -1115,6 +1119,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
         pixel_attention_mask=None,
         image_hidden_states=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
@@ -1130,10 +1135,11 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
             pixel_attention_mask=pixel_attention_mask,
             image_hidden_states=image_hidden_states,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if image_hidden_states is not None or
+        if image_hidden_states is not None or not is_first_iteration:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_attention_mask"] = None

@@ -458,6 +458,8 @@ class Idefics3VisionTransformer(Idefics3PreTrainedModel):
         self.patch_size = config.patch_size
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

+        self.post_init()
+
     # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer.get_input_embeddings
     def get_input_embeddings(self):
         return self.embeddings
@@ -887,6 +889,7 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
         pixel_attention_mask=None,
         image_hidden_states=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
@@ -902,10 +905,11 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
             pixel_attention_mask=pixel_attention_mask,
             image_hidden_states=image_hidden_states,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if image_hidden_states is not None or
+        if image_hidden_states is not None or not is_first_iteration:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_attention_mask"] = None

@@ -164,12 +164,8 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast):

             input_ids = reorder_images(input_ids_grouped, grouped_images_index)

-            return BatchFeature(
-                data={"input_ids": torch.stack(input_ids, dim=0) if return_tensors else input_ids},
-                tensor_type=return_tensors,
-            )
+            return BatchFeature(data={"input_ids": input_ids}, tensor_type=return_tensors)

-        pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values
         return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)

     def to_dict(self):
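In this hunk, and in the InstructBlipVideo video-processor hunk further down, the manual `torch.stack(...) if return_tensors else ...` step is dropped and the plain list is handed to `BatchFeature` together with `tensor_type=return_tensors`; presumably the conversion now happens inside `BatchFeature` itself (note `feature_extraction_utils.py +54 -22` in the file list). A hedged illustration of the call shape under that assumption, not a claim about the exact conversion internals:

```python
# Assumes the rc2 behavior implied by the diff: with tensor_type set, BatchFeature
# converts the list of per-image tensors itself, so callers no longer stack manually.
import torch
from transformers.feature_extraction_utils import BatchFeature

pixel_values = [torch.zeros(3, 4, 4), torch.ones(3, 4, 4)]  # one tensor per image
batch = BatchFeature(data={"pixel_values": pixel_values}, tensor_type="pt")
print(type(batch["pixel_values"]), getattr(batch["pixel_values"], "shape", None))
```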
@@ -61,7 +61,7 @@ class ImageGPTLayerNorm(nn.Module):
 class ImageGPTAttention(nn.Module):
     def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None):
         super().__init__()
-
+        self.config = config
         max_positions = config.max_position_embeddings
         self.register_buffer(
             "bias",
@@ -70,7 +70,6 @@ class ImageGPTAttention(nn.Module):
             ),
             persistent=False,
         )
-        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)

         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -384,6 +383,14 @@ class ImageGPTPreTrainedModel(PreTrainedModel):
             if "c_proj" in name and "weight" in name:
                 # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                 init.normal_(p, mean=0.0, std=self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
+        elif isinstance(module, ImageGPTAttention):
+            max_positions = module.config.max_position_embeddings
+            init.copy_(
+                module.bias,
+                torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
+                    1, 1, max_positions, max_positions
+                ),
+            )


 @auto_docstring
@@ -335,6 +335,8 @@ class InstructBlipPreTrainedModel(PreTrainedModel):
             init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
         elif isinstance(module, (InstructBlipForConditionalGeneration, InstructBlipModel)):
             init.zeros_(module.query_tokens)
+        elif isinstance(module, InstructBlipQFormerEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 # Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->InstructBlip
@@ -128,6 +128,56 @@ class InstructBlipVideoVisionEmbeddings(nn.Module):
         return embeddings


+class InstructBlipVideoQFormerEmbeddings(nn.Module):
+    """Construct the embeddings from word and position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+        self.config = config
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        query_embeds=None,
+        past_key_values_length=0,
+    ):
+        if input_ids is not None:
+            seq_length = input_ids.size()[1]
+        else:
+            seq_length = 0
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
+
+        if input_ids is not None:
+            embeddings = self.word_embeddings(input_ids)
+
+            position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
+            embeddings = embeddings + position_embeddings
+
+            if query_embeds is not None:
+                embeddings = torch.cat((query_embeds, embeddings), dim=1)
+        else:
+            embeddings = query_embeds
+
+        embeddings = embeddings.to(self.layernorm.weight.dtype)
+        embeddings = self.layernorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
 @auto_docstring
 class InstructBlipVideoPreTrainedModel(PreTrainedModel):
     config: InstructBlipVideoConfig
@@ -158,6 +208,8 @@ class InstructBlipVideoPreTrainedModel(PreTrainedModel):
             init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
         elif isinstance(module, (InstructBlipVideoForConditionalGeneration, InstructBlipVideoModel)):
             init.zeros_(module.query_tokens)
+        elif isinstance(module, InstructBlipVideoQFormerEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 # Adapted from transformers.models.siglip.modeling_siglip.eager_attention_forward -> InstructBlipVideo doesn't cast attn weights to fp32
@@ -677,56 +729,6 @@ class InstructBlipVideoQFormerEncoder(nn.Module):
|
|
|
677
729
|
)
|
|
678
730
|
|
|
679
731
|
|
|
680
|
-
class InstructBlipVideoQFormerEmbeddings(nn.Module):
|
|
681
|
-
"""Construct the embeddings from word and position embeddings."""
|
|
682
|
-
|
|
683
|
-
def __init__(self, config):
|
|
684
|
-
super().__init__()
|
|
685
|
-
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
|
|
686
|
-
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
|
687
|
-
|
|
688
|
-
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
|
689
|
-
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
|
690
|
-
|
|
691
|
-
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
|
692
|
-
self.register_buffer(
|
|
693
|
-
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
|
694
|
-
)
|
|
695
|
-
|
|
696
|
-
self.config = config
|
|
697
|
-
|
|
698
|
-
def forward(
|
|
699
|
-
self,
|
|
700
|
-
input_ids=None,
|
|
701
|
-
position_ids=None,
|
|
702
|
-
query_embeds=None,
|
|
703
|
-
past_key_values_length=0,
|
|
704
|
-
):
|
|
705
|
-
if input_ids is not None:
|
|
706
|
-
seq_length = input_ids.size()[1]
|
|
707
|
-
else:
|
|
708
|
-
seq_length = 0
|
|
709
|
-
|
|
710
|
-
if position_ids is None:
|
|
711
|
-
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
|
|
712
|
-
|
|
713
|
-
if input_ids is not None:
|
|
714
|
-
embeddings = self.word_embeddings(input_ids)
|
|
715
|
-
|
|
716
|
-
position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
|
|
717
|
-
embeddings = embeddings + position_embeddings
|
|
718
|
-
|
|
719
|
-
if query_embeds is not None:
|
|
720
|
-
embeddings = torch.cat((query_embeds, embeddings), dim=1)
|
|
721
|
-
else:
|
|
722
|
-
embeddings = query_embeds
|
|
723
|
-
|
|
724
|
-
embeddings = embeddings.to(self.layernorm.weight.dtype)
|
|
725
|
-
embeddings = self.layernorm(embeddings)
|
|
726
|
-
embeddings = self.dropout(embeddings)
|
|
727
|
-
return embeddings
|
|
728
|
-
|
|
729
|
-
|
|
730
732
|
class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel):
|
|
731
733
|
"""
|
|
732
734
|
Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
|
|
@@ -84,7 +84,6 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
             processed_videos_grouped[shape] = stacked_videos
 
         processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
-        processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos
 
         return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors)
 
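The dropped line batched the per-video tensors explicitly before wrapping them in `BatchFeature`; tensor conversion is now left to the `tensor_type` handling. For reference, a standalone illustration (not the library code) of what that explicit `torch.stack` did, assuming all processed videos share one shape:

```python
import torch

# Hypothetical list of processed videos, all with identical shape
# (num_frames, channels, height, width).
processed_videos = [torch.randn(8, 3, 224, 224) for _ in range(4)]

# The removed line batched them along a new leading dimension.
batched = torch.stack(processed_videos, dim=0)
print(batched.shape)  # torch.Size([4, 8, 3, 224, 224])
```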
@@ -209,10 +209,9 @@ class InternVLVisionPatchEmbeddings(nn.Module):
             )
 
         embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
-        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
         embeddings = embeddings.flatten(2).transpose(1, 2)
 
-        return embeddings, (patch_height, patch_width)
+        return embeddings
 
 
 # Based on timm implementation, which can be found here:
@@ -291,7 +290,7 @@ class InternVLVisionEmbeddings(nn.Module):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
     ) -> torch.Tensor:
         _, _, height, width = pixel_values.shape
-        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
+        embeddings = self.patch_embeddings(pixel_values)
         batch_size, seq_len, _ = embeddings.size()
 
         if bool_masked_pos is not None:
@@ -308,7 +307,7 @@ class InternVLVisionEmbeddings(nn.Module):
 
         embeddings = self.dropout(embeddings)
 
-        return embeddings, (patch_height, patch_width)
+        return embeddings
 
 
 class InternVLVisionMLP(nn.Module):
@@ -455,7 +454,7 @@ class InternVLVisionModel(InternVLVisionPreTrainedModel):
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
             Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
         """
-        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
 
         encoder_outputs = self.encoder(embedding_output)
         sequence_output = encoder_outputs[0]
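These hunks simplify `InternVLVisionPatchEmbeddings.forward` to return only the patch embeddings, dropping the `(patch_height, patch_width)` tuple callers previously had to unpack. A self-contained sketch of the conv-then-flatten pattern the layer uses (a toy module, not the library class):

```python
import torch
from torch import nn


class ToyPatchEmbeddings(nn.Module):
    def __init__(self, num_channels: int = 3, hidden_size: int = 32, patch_size: int = 16):
        super().__init__()
        # Each patch_size x patch_size patch is projected to a hidden_size vector.
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        embeddings = self.projection(pixel_values)           # (B, hidden, H/ps, W/ps)
        embeddings = embeddings.flatten(2).transpose(1, 2)   # (B, num_patches, hidden)
        return embeddings


x = torch.randn(2, 3, 224, 224)
print(ToyPatchEmbeddings()(x).shape)  # torch.Size([2, 196, 32]); 196 = (224 // 16) ** 2
```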
@@ -898,6 +897,7 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -909,12 +909,15 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsequent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
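The rewritten gate attaches `pixel_values` on the first generation iteration (or when caching is disabled) instead of keying off `cache_position`, so continuing generation from a cached system prompt still forwards the image. A minimal sketch of that gating logic with a hypothetical helper name and simplified inputs:

```python
from typing import Any


def attach_pixel_values(
    model_inputs: dict[str, Any],
    pixel_values: Any,
    is_first_iteration: bool,
    use_cache: bool = True,
) -> dict[str, Any]:
    # Forward image features only when they are not already merged into the
    # KV cache: on the first iteration, or whenever caching is disabled.
    if is_first_iteration or not use_cache:
        model_inputs["pixel_values"] = pixel_values
    return model_inputs


# First call: the image goes in. Later cached calls: it does not.
print(attach_pixel_values({}, "image-tensor", is_first_iteration=True))   # {'pixel_values': 'image-tensor'}
print(attach_pixel_values({}, "image-tensor", is_first_iteration=False))  # {}
```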
@@ -29,7 +29,7 @@ from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_int
 from ...utils.generic import check_model_inputs
 from ..clip.modeling_clip import CLIPMLP
 from ..janus.modeling_janus import JanusVisionAttention
@@ -44,9 +44,6 @@ from ..llava.modeling_llava import (
 from .configuration_internvl import InternVLConfig, InternVLVisionConfig
 
 
-logger = logging.get_logger(__name__)
-
-
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
@@ -177,10 +174,9 @@ class InternVLVisionPatchEmbeddings(nn.Module):
             )
 
         embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
-        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
         embeddings = embeddings.flatten(2).transpose(1, 2)
 
-        return embeddings, (patch_height, patch_width)
+        return embeddings
 
 
 # Based on timm implementation, which can be found here:
@@ -259,7 +255,7 @@ class InternVLVisionEmbeddings(nn.Module):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
     ) -> torch.Tensor:
         _, _, height, width = pixel_values.shape
-        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
+        embeddings = self.patch_embeddings(pixel_values)
         batch_size, seq_len, _ = embeddings.size()
 
         if bool_masked_pos is not None:
@@ -276,7 +272,7 @@ class InternVLVisionEmbeddings(nn.Module):
 
         embeddings = self.dropout(embeddings)
 
-        return embeddings, (patch_height, patch_width)
+        return embeddings
 
 
 class InternVLVisionMLP(CLIPMLP):
@@ -412,7 +408,7 @@ class InternVLVisionModel(InternVLVisionPreTrainedModel):
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
             Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
         """
-        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
 
         encoder_outputs = self.encoder(embedding_output)
         sequence_output = encoder_outputs[0]
@@ -140,7 +140,6 @@ class InternVLVideoProcessor(BaseVideoProcessor):
             processed_videos_grouped[shape] = stacked_videos
 
         processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
-        processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos
 
         return BatchFeature(data={"pixel_values_videos": processed_videos}, tensor_type=return_tensors)
 
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_jais2 import *
+    from .modeling_jais2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
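The new `__init__.py` registers the package as a `_LazyModule`, so submodules are imported only on first attribute access rather than at package import time. A hedged usage sketch, assuming the rc2 wheel ships this module as shown above:

```python
# Direct import from the subpackage; the _LazyModule machinery resolves
# configuration_jais2 lazily on first access.
from transformers.models.jais2 import Jais2Config

config = Jais2Config()
print(config.model_type)  # "jais2"
```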
@@ -0,0 +1,152 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/jais2/modular_jais2.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_jais2.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
+
+
+class Jais2Config(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Jais2Model`]. It is used to instantiate a Jais2
+    model according to the specified arguments, defining the model architecture.
+    [inceptionai/Jais-2-8B-Chat](https://huggingface.co/inceptionai/Jais-2-8B-Chat).
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 150272):
+            Vocabulary size of the Jais2 model.
+        hidden_size (`int`, *optional*, defaults to 3328):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 26624):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 26):
+            Number of attention heads for each attention layer.
+        num_key_value_heads (`int`, *optional*):
+            Number of key_value heads for Grouped Query Attention.
+        hidden_act (`str`, *optional*, defaults to `"relu2"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to return last key/values attentions.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 150024):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in the query, key, value and output projection layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers.
+        head_dim (`int`, *optional*):
+            The attention head dimension.
+        rope_parameters (`dict`, *optional*):
+            The RoPE parameters.
+    """
+
+    model_type = "jais2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size: Optional[int] = 150272,
+        hidden_size: Optional[int] = 3328,
+        intermediate_size: Optional[int] = 26624,
+        num_hidden_layers: Optional[int] = 32,
+        num_attention_heads: Optional[int] = 26,
+        num_key_value_heads: Optional[int] = None,
+        hidden_act: Optional[str] = "relu2",
+        max_position_embeddings: Optional[int] = 8192,
+        initializer_range: Optional[float] = 0.02,
+        layer_norm_eps: Optional[float] = 1e-5,
+        use_cache: Optional[bool] = True,
+        pad_token_id: Optional[int] = None,
+        bos_token_id: Optional[int] = 0,
+        eos_token_id: Optional[int] = 150024,
+        tie_word_embeddings: Optional[bool] = False,
+        attention_bias: Optional[bool] = True,
+        attention_dropout: Optional[float] = 0.0,
+        mlp_bias: Optional[bool] = True,
+        head_dim: Optional[int] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        self.rope_parameters = rope_parameters
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.layer_norm_eps = layer_norm_eps
+
+
+__all__ = ["Jais2Config"]
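`Jais2Config` follows the usual Llama-style decoder config layout: `num_key_value_heads` falls back to `num_attention_heads`, and `head_dim` defaults to `hidden_size // num_attention_heads`. A hedged usage sketch (assuming the class is also re-exported at the top level of the rc2 wheel; the override values below are illustrative only):

```python
# Top-level import is an assumption; `from transformers.models.jais2 import Jais2Config`
# is the direct path shown in the new package above.
from transformers import Jais2Config

# Defaults as documented in the configuration docstring.
config = Jais2Config()
print(config.num_key_value_heads)  # 26  (falls back to num_attention_heads)
print(config.head_dim)             # 128 (3328 // 26)

# A smaller, purely illustrative variant for quick experiments.
tiny = Jais2Config(hidden_size=512, intermediate_size=2048, num_hidden_layers=4, num_attention_heads=8)
print(tiny.head_dim)  # 64
```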