PyPI - transformers - Versions diffs - 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl - Mend

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (671) hide show

transformers/models/blt/modeling_blt.py CHANGED Viewed

@@ -27,6 +27,7 @@ import torch.distributions
 import torch.nn as nn
 import torch.nn.functional as F
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -102,7 +103,7 @@ class BltRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
     @staticmethod
     def compute_default_rope_parameters(
@@ -444,6 +445,163 @@ class BltPreTrainedModel(PreTrainedModel):
         "attentions": OutputRecorder(BltSelfAttention, index=1, layer_name="local_decoder"),
     }
+    @torch.no_grad()
+    def _init_weights(self, module):
+        """
+        Initialize BLT weights following the original ByteLatentTransformer:
+        - Most weights are drawn from a truncated normal.
+        - Scale is ~ 1 / sqrt(model_dim) (or 1 / sqrt(hidden_dim) for FFN outputs).
+        - Norm layers are set to weight = 1, bias = 0.
+        """
+        class_name = module.__class__.__name__
+        # Norms: RMSNorm / LayerNorm
+        if isinstance(module, (BltRMSNorm, nn.LayerNorm)) or "RMSNorm" in class_name or "LayerNorm" in class_name:
+            if getattr(module, "weight", None) is not None:
+                init.ones_(module.weight)
+            if getattr(module, "bias", None) is not None:
+                init.zeros_(module.bias)
+            return
+        # Embeddings (encoder / patcher / hash embeddings)
+        if isinstance(module, nn.Embedding):
+            hidden_size = getattr(self.config, "hidden_size", None)
+            if hidden_size is None and hasattr(self.config, "encoder_config"):
+                hidden_size = getattr(self.config.encoder_config, "hidden_size", None)
+            if hidden_size is None:
+                hidden_size = module.embedding_dim
+            std = hidden_size**-0.5
+            init.trunc_normal_(
+                module.weight,
+                mean=0.0,
+                std=std,
+                a=-3 * std,
+                b=3 * std,
+            )
+            if module.padding_idx is not None:
+                init.zeros_(module.weight[module.padding_idx])
+            return
+        # Self-attention / cross-attention projections
+        if isinstance(module, (BltSelfAttention, BltCrossAttention)) or class_name in (
+            "MllamaTextSelfAttention",
+            "MllamaTextCrossAttention",
+        ):
+            dim = getattr(self.config, "hidden_size", None)
+            if dim is None and hasattr(module, "hidden_size"):
+                dim = module.hidden_size
+            if dim is None:
+                for name in ("q_proj", "k_proj", "v_proj", "o_proj", "dense"):
+                    proj = getattr(module, name, None)
+                    if proj is not None and hasattr(proj, "weight"):
+                        dim = proj.weight.shape[-1]
+                        break
+            if dim is None:
+                return
+            std = dim**-0.5
+            # Input projections (q, k, v)
+            for proj_name in ("q_proj", "k_proj", "v_proj"):
+                proj = getattr(module, proj_name, None)
+                if proj is not None and hasattr(proj, "weight"):
+                    init.trunc_normal_(
+                        proj.weight,
+                        mean=0.0,
+                        std=std,
+                        a=-3 * std,
+                        b=3 * std,
+                    )
+                    if getattr(proj, "bias", None) is not None:
+                        init.zeros_(proj.bias)
+            # Output projection: o_proj or dense
+            o_proj = getattr(module, "o_proj", getattr(module, "dense", None))
+            if o_proj is not None and hasattr(o_proj, "weight"):
+                init.trunc_normal_(
+                    o_proj.weight,
+                    mean=0.0,
+                    std=std,
+                    a=-3 * std,
+                    b=3 * std,
+                )
+                if getattr(o_proj, "bias", None) is not None:
+                    init.zeros_(o_proj.bias)
+            return
+        # MLP / FFN blocks
+        if isinstance(module, BltMLP) or class_name == "MllamaTextMLP":
+            hidden_size = getattr(self.config, "hidden_size", None)
+            if hidden_size is None and hasattr(self.config, "decoder_config"):
+                hidden_size = getattr(self.config.decoder_config, "hidden_size", None)
+            if hidden_size is None and hasattr(self.config, "encoder_config"):
+                hidden_size = getattr(self.config.encoder_config, "hidden_size", None)
+            # Input-side std
+            in_std = None
+            if hidden_size is not None:
+                in_std = hidden_size**-0.5
+            gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None))
+            up_proj = getattr(module, "up_proj", None)
+            down_proj = getattr(module, "down_proj", getattr(module, "fc2", None))
+            # gate / input projections
+            for proj in (gate_proj, up_proj):
+                if proj is not None and hasattr(proj, "weight"):
+                    std = in_std or (proj.weight.shape[1] ** -0.5)
+                    init.trunc_normal_(
+                        proj.weight,
+                        mean=0.0,
+                        std=std,
+                        a=-3 * std,
+                        b=3 * std,
+                    )
+                    if getattr(proj, "bias", None) is not None:
+                        init.zeros_(proj.bias)
+            # output/ down projections
+            if down_proj is not None and hasattr(down_proj, "weight"):
+                hidden_dim = down_proj.weight.shape[1]
+                out_std = hidden_dim**-0.5
+                init.trunc_normal_(
+                    down_proj.weight,
+                    mean=0.0,
+                    std=out_std,
+                    a=-3 * out_std,
+                    b=3 * out_std,
+                )
+                if getattr(down_proj, "bias", None) is not None:
+                    init.zeros_(down_proj.bias)
+            return
+        # Generic Linear layers (projections, lm_head, etc.)
+        if isinstance(module, nn.Linear):
+            fan_in = module.in_features
+            std = fan_in**-0.5
+            init.trunc_normal_(
+                module.weight,
+                mean=0.0,
+                std=std,
+                a=-3 * std,
+                b=3 * std,
+            )
+            if module.bias is not None:
+                init.zeros_(module.bias)
+            return
+        if isinstance(module, BltRotaryEmbedding):
+            rope_fn = (
+                ROPE_INIT_FUNCTIONS[module.rope_type]
+                if module.rope_type != "default"
+                else module.compute_default_rope_parameters
+            )
+            buffer_value, _ = rope_fn(module.config)
+            init.copy_(module.inv_freq, buffer_value)
+            init.copy_(module.original_inv_freq, buffer_value)
 class BltLocalEncoder(BltPreTrainedModel):
     config: BltLocalEncoderConfig
@@ -753,6 +911,8 @@ class BltPatcher(BltPreTrainedModel):
             bias=False,
         )
+        self.post_init()
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -952,7 +1112,7 @@ def compute_hash_embeddings(
             hash_ids = byte_group_hash_function(local_encoder_tokens, group_size, prime, encoder_hash_byte_group_vocab)
             # Apply offset to get the correct slice of the fused embedding
             offset_hash_ids = hash_ids + embedding_idx * encoder_hash_byte_group_vocab
-            embeddings += encoder_hash_tok_embedding(offset_hash_ids)
+            embeddings += encoder_hash_tok_embedding(offset_hash_ids).to(embeddings.device)
             embedding_idx += 1
     return embeddings

transformers/models/blt/modular_blt.py CHANGED Viewed

@@ -22,10 +22,11 @@ import torch.distributions
 import torch.nn as nn
 import torch.nn.functional as F
+from ... import initialization as init
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...masking_utils import create_causal_mask
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
-from ...modeling_rope_utils import dynamic_rope_update
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, logging
@@ -133,7 +134,7 @@ def compute_hash_embeddings(
             hash_ids = byte_group_hash_function(local_encoder_tokens, group_size, prime, encoder_hash_byte_group_vocab)
             # Apply offset to get the correct slice of the fused embedding
             offset_hash_ids = hash_ids + embedding_idx * encoder_hash_byte_group_vocab
-            embeddings += encoder_hash_tok_embedding(offset_hash_ids)
+            embeddings += encoder_hash_tok_embedding(offset_hash_ids).to(embeddings.device)
             embedding_idx += 1
     return embeddings
@@ -360,8 +361,170 @@ class BltPreTrainedModel(MllamaPreTrainedModel):
         "attentions": OutputRecorder(BltSelfAttention, index=1, layer_name="local_decoder"),
     }
+    # Weight initialization is adapted from:
+    # - https://github.com/facebookresearch/blt/blob/main/bytelatent/model/blt.py
+    # - https://github.com/pytorch/torchtitan/blob/main/torchtitan/experiments/transformers_modeling_backend/model/model.py
+    #
+    # Both implementations use truncated normal initialization with std ~ 1 / sqrt(d_model)
+    # (or 1 / sqrt(hidden_dim) for FFN outputs), and unit initialization for normalization layers.
+    # We follow the same scheme here, but expressed in the Transformers APIs.
+    @torch.no_grad()
     def _init_weights(self, module):
-        raise AttributeError("No need to inherit it!")
+        """
+        Initialize BLT weights following the original ByteLatentTransformer:
+        - Most weights are drawn from a truncated normal.
+        - Scale is ~ 1 / sqrt(model_dim) (or 1 / sqrt(hidden_dim) for FFN outputs).
+        - Norm layers are set to weight = 1, bias = 0.
+        """
+        class_name = module.__class__.__name__
+        # Norms: RMSNorm / LayerNorm
+        if isinstance(module, (BltRMSNorm, nn.LayerNorm)) or "RMSNorm" in class_name or "LayerNorm" in class_name:
+            if getattr(module, "weight", None) is not None:
+                init.ones_(module.weight)
+            if getattr(module, "bias", None) is not None:
+                init.zeros_(module.bias)
+            return
+        # Embeddings (encoder / patcher / hash embeddings)
+        if isinstance(module, nn.Embedding):
+            hidden_size = getattr(self.config, "hidden_size", None)
+            if hidden_size is None and hasattr(self.config, "encoder_config"):
+                hidden_size = getattr(self.config.encoder_config, "hidden_size", None)
+            if hidden_size is None:
+                hidden_size = module.embedding_dim
+            std = hidden_size**-0.5
+            init.trunc_normal_(
+                module.weight,
+                mean=0.0,
+                std=std,
+                a=-3 * std,
+                b=3 * std,
+            )
+            if module.padding_idx is not None:
+                init.zeros_(module.weight[module.padding_idx])
+            return
+        # Self-attention / cross-attention projections
+        if isinstance(module, (BltSelfAttention, BltCrossAttention)) or class_name in (
+            "MllamaTextSelfAttention",
+            "MllamaTextCrossAttention",
+        ):
+            dim = getattr(self.config, "hidden_size", None)
+            if dim is None and hasattr(module, "hidden_size"):
+                dim = module.hidden_size
+            if dim is None:
+                for name in ("q_proj", "k_proj", "v_proj", "o_proj", "dense"):
+                    proj = getattr(module, name, None)
+                    if proj is not None and hasattr(proj, "weight"):
+                        dim = proj.weight.shape[-1]
+                        break
+            if dim is None:
+                return
+            std = dim**-0.5
+            # Input projections (q, k, v)
+            for proj_name in ("q_proj", "k_proj", "v_proj"):
+                proj = getattr(module, proj_name, None)
+                if proj is not None and hasattr(proj, "weight"):
+                    init.trunc_normal_(
+                        proj.weight,
+                        mean=0.0,
+                        std=std,
+                        a=-3 * std,
+                        b=3 * std,
+                    )
+                    if getattr(proj, "bias", None) is not None:
+                        init.zeros_(proj.bias)
+            # Output projection: o_proj or dense
+            o_proj = getattr(module, "o_proj", getattr(module, "dense", None))
+            if o_proj is not None and hasattr(o_proj, "weight"):
+                init.trunc_normal_(
+                    o_proj.weight,
+                    mean=0.0,
+                    std=std,
+                    a=-3 * std,
+                    b=3 * std,
+                )
+                if getattr(o_proj, "bias", None) is not None:
+                    init.zeros_(o_proj.bias)
+            return
+        # MLP / FFN blocks
+        if isinstance(module, BltMLP) or class_name == "MllamaTextMLP":
+            hidden_size = getattr(self.config, "hidden_size", None)
+            if hidden_size is None and hasattr(self.config, "decoder_config"):
+                hidden_size = getattr(self.config.decoder_config, "hidden_size", None)
+            if hidden_size is None and hasattr(self.config, "encoder_config"):
+                hidden_size = getattr(self.config.encoder_config, "hidden_size", None)
+            # Input-side std
+            in_std = None
+            if hidden_size is not None:
+                in_std = hidden_size**-0.5
+            gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None))
+            up_proj = getattr(module, "up_proj", None)
+            down_proj = getattr(module, "down_proj", getattr(module, "fc2", None))
+            # gate / input projections
+            for proj in (gate_proj, up_proj):
+                if proj is not None and hasattr(proj, "weight"):
+                    std = in_std or (proj.weight.shape[1] ** -0.5)
+                    init.trunc_normal_(
+                        proj.weight,
+                        mean=0.0,
+                        std=std,
+                        a=-3 * std,
+                        b=3 * std,
+                    )
+                    if getattr(proj, "bias", None) is not None:
+                        init.zeros_(proj.bias)
+            # output/ down projections
+            if down_proj is not None and hasattr(down_proj, "weight"):
+                hidden_dim = down_proj.weight.shape[1]
+                out_std = hidden_dim**-0.5
+                init.trunc_normal_(
+                    down_proj.weight,
+                    mean=0.0,
+                    std=out_std,
+                    a=-3 * out_std,
+                    b=3 * out_std,
+                )
+                if getattr(down_proj, "bias", None) is not None:
+                    init.zeros_(down_proj.bias)
+            return
+        # Generic Linear layers (projections, lm_head, etc.)
+        if isinstance(module, nn.Linear):
+            fan_in = module.in_features
+            std = fan_in**-0.5
+            init.trunc_normal_(
+                module.weight,
+                mean=0.0,
+                std=std,
+                a=-3 * std,
+                b=3 * std,
+            )
+            if module.bias is not None:
+                init.zeros_(module.bias)
+            return
+        if isinstance(module, BltRotaryEmbedding):
+            rope_fn = (
+                ROPE_INIT_FUNCTIONS[module.rope_type]
+                if module.rope_type != "default"
+                else module.compute_default_rope_parameters
+            )
+            buffer_value, _ = rope_fn(module.config)
+            init.copy_(module.inv_freq, buffer_value)
+            init.copy_(module.original_inv_freq, buffer_value)
     def _update_causal_mask(self, module):
         raise AttributeError("No need to inherit it!")
@@ -634,6 +797,8 @@ class BltPatcher(BltPreTrainedModel):
             bias=False,
         )
+        self.post_init()
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

transformers/models/bridgetower/image_processing_bridgetower_fast.py CHANGED Viewed

@@ -251,10 +251,8 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
             processed_images, processed_masks = self.pad(
                 processed_images, return_mask=True, disable_grouping=disable_grouping
             )
-            processed_masks = torch.stack(processed_masks, dim=0) if return_tensors else processed_masks
             data["pixel_mask"] = processed_masks
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         data["pixel_values"] = processed_images
         return BatchFeature(data=data, tensor_type=return_tensors)

transformers/models/bridgetower/modeling_bridgetower.py CHANGED Viewed

@@ -943,6 +943,11 @@ class BridgeTowerPreTrainedModel(PreTrainedModel):
             init.ones_(module.weight)
         elif isinstance(module, BridgeTowerForContrastiveLearning):
             init.constant_(module.logit_scale, self.config.logit_scale_init_value)
+        elif isinstance(module, BridgeTowerVisionEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.num_positions).expand((1, -1)))
+        elif isinstance(module, BridgeTowerTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
         if isinstance(module, (nn.Linear, BridgeTowerMLMHead)) and module.bias is not None:
             init.zeros_(module.bias)
@@ -955,6 +960,7 @@ class BridgeTowerVisionModel(BridgeTowerPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.visual = BridgeTowerVisionTransformer(config)
+        self.post_init()
     @property
     def dtype(self):

transformers/models/bros/modeling_bros.py CHANGED Viewed

@@ -522,6 +522,14 @@ class BrosPreTrainedModel(PreTrainedModel):
         std = self.config.initializer_range
         if isinstance(module, BrosRelationExtractor):
             init.normal_(module.dummy_node, std=std)
+        elif isinstance(module, BrosTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
+        elif isinstance(module, BrosPositionalEmbedding1D):
+            inv_freq = 1 / (
+                10000 ** (torch.arange(0.0, module.dim_bbox_sinusoid_emb_1d, 2.0) / module.dim_bbox_sinusoid_emb_1d)
+            )
+            init.copy_(module.inv_freq, inv_freq)
 @auto_docstring

transformers/models/camembert/modeling_camembert.py CHANGED Viewed

@@ -54,6 +54,112 @@ from .configuration_camembert import CamembertConfig
 logger = logging.get_logger(__name__)
+class CamembertEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = self.create_position_ids_from_input_ids(
+                    input_ids, self.padding_idx, past_key_values_length
+                )
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+        batch_size, seq_length = input_shape
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
+                buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
+                buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
+                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings
+        position_embeddings = self.position_embeddings(position_ids)
+        embeddings = embeddings + position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+    @staticmethod
+    def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+        Args:
+            inputs_embeds: torch.Tensor
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+        position_ids = torch.arange(
+            padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+    @staticmethod
+    def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+        """
+        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+        are ignored. This is modified from fairseq's `utils.make_positions`.
+        Args:
+            x: torch.Tensor x:
+        Returns: torch.Tensor
+        """
+        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+        mask = input_ids.ne(padding_idx).int()
+        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+        return incremental_indices.long() + padding_idx
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
@@ -417,112 +523,9 @@ class CamembertPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, CamembertLMHead):
             init.zeros_(module.bias)
-class CamembertEmbeddings(nn.Module):
-    """Construct the embeddings from word, position and token_type embeddings."""
-    def __init__(self, config):
-        super().__init__()
-        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
-        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer(
-            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
-        )
-        self.register_buffer(
-            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
-        )
-        self.padding_idx = config.pad_token_id
-        self.position_embeddings = nn.Embedding(
-            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-        )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        past_key_values_length: int = 0,
-    ) -> torch.Tensor:
-        if position_ids is None:
-            if input_ids is not None:
-                # Create the position ids from the input token ids. Any padded tokens remain padded.
-                position_ids = self.create_position_ids_from_input_ids(
-                    input_ids, self.padding_idx, past_key_values_length
-                )
-            else:
-                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)
-        if input_ids is not None:
-            input_shape = input_ids.size()
-        else:
-            input_shape = inputs_embeds.size()[:-1]
-        batch_size, seq_length = input_shape
-        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
-        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
-        # issue #5664
-        if token_type_ids is None:
-            if hasattr(self, "token_type_ids"):
-                # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
-                buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
-                buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
-                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
-            else:
-                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
-        if inputs_embeds is None:
-            inputs_embeds = self.word_embeddings(input_ids)
-        token_type_embeddings = self.token_type_embeddings(token_type_ids)
-        embeddings = inputs_embeds + token_type_embeddings
-        position_embeddings = self.position_embeddings(position_ids)
-        embeddings = embeddings + position_embeddings
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings)
-        return embeddings
-    @staticmethod
-    def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
-        """
-        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
-        Args:
-            inputs_embeds: torch.Tensor
-        Returns: torch.Tensor
-        """
-        input_shape = inputs_embeds.size()[:-1]
-        sequence_length = input_shape[1]
-        position_ids = torch.arange(
-            padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
-        )
-        return position_ids.unsqueeze(0).expand(input_shape)
-    @staticmethod
-    def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
-        """
-        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
-        are ignored. This is modified from fairseq's `utils.make_positions`.
-        Args:
-            x: torch.Tensor x:
-        Returns: torch.Tensor
-        """
-        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
-        mask = input_ids.ne(padding_idx).int()
-        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
-        return incremental_indices.long() + padding_idx
+        elif isinstance(module, CamembertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 class CamembertEncoder(nn.Module):

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl