PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3py3-none-any.whl → 5.1.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/models/dab_detr/configuration_dab_detr.py CHANGED Viewed

@@ -13,10 +13,10 @@
 # limitations under the License.
 """DAB-DETR model configuration"""
+from ...backbone_utils import consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import verify_backbone_config_arguments
-from ..auto import CONFIG_MAPPING, AutoConfig
+from ..auto import AutoConfig
 logger = logging.get_logger(__name__)
@@ -33,21 +33,9 @@ class DabDetrConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.
     Args:
-        use_timm_backbone (`bool`, *optional*, defaults to `True`):
-            Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
-            API.
         backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
             case it will default to `ResNetConfig()`.
-        backbone (`str`, *optional*, defaults to `"resnet50"`):
-            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
-            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
-            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
-        use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
-            Whether to use pretrained weights for the backbone.
-        backbone_kwargs (`dict`, *optional*):
-            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
-            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
         num_queries (`int`, *optional*, defaults to 300):
             Number of object queries, i.e. detection slots. This is the maximal number of objects
             [`DabDetrModel`] can detect in a single image. For COCO, we recommend 100 queries.
@@ -117,6 +105,8 @@ class DabDetrConfig(PreTrainedConfig):
         initializer_bias_prior_prob (`float`, *optional*):
             The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`.
             If `None`, `prior_prob` computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
     Examples:
@@ -143,11 +133,7 @@ class DabDetrConfig(PreTrainedConfig):
     def __init__(
         self,
-        use_timm_backbone=True,
         backbone_config=None,
-        backbone="resnet50",
-        use_pretrained_backbone=True,
-        backbone_kwargs=None,
         num_queries=300,
         encoder_layers=6,
         encoder_ffn_dim=2048,
@@ -181,41 +167,31 @@ class DabDetrConfig(PreTrainedConfig):
         normalize_before=False,
         sine_position_embedding_scale=None,
         initializer_bias_prior_prob=None,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         if query_dim != 4:
             raise ValueError("The query dimensions has to be 4.")
-        # We default to values which were previously hard-coded in the model. This enables configurability of the config
-        # while keeping the default behavior the same.
-        if use_timm_backbone and backbone_kwargs is None:
-            backbone_kwargs = {}
-            if dilation:
-                backbone_kwargs["output_stride"] = 16
-            backbone_kwargs["out_indices"] = [1, 2, 3, 4]
-            backbone_kwargs["in_chans"] = 3  # num_channels
-        # Backwards compatibility
-        elif not use_timm_backbone and backbone in (None, "resnet50"):
-            if backbone_config is None:
-                logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
-                backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
-            elif isinstance(backbone_config, dict):
-                backbone_model_type = backbone_config.get("model_type")
-                config_class = CONFIG_MAPPING[backbone_model_type]
-                backbone_config = config_class.from_dict(backbone_config)
-            backbone = None
-            # set timm attributes to None
-            dilation = None
-        verify_backbone_config_arguments(
-            use_timm_backbone=use_timm_backbone,
-            use_pretrained_backbone=use_pretrained_backbone,
-            backbone=backbone,
+        # Init timm backbone with hardcoded values for BC
+        timm_default_kwargs = {
+            "num_channels": 3,
+            "features_only": True,
+            "use_pretrained_backbone": False,
+            "out_indices": [1, 2, 3, 4],
+        }
+        if dilation:
+            timm_default_kwargs["output_stride"] = 16
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
             backbone_config=backbone_config,
-            backbone_kwargs=backbone_kwargs,
+            default_backbone="resnet50",
+            default_config_type="resnet50",
+            default_config_kwargs={"out_features": ["stage4"]},
+            timm_default_kwargs=timm_default_kwargs,
+            **kwargs,
         )
-        self.use_timm_backbone = use_timm_backbone
         self.backbone_config = backbone_config
         self.num_queries = num_queries
         self.hidden_size = hidden_size
@@ -233,9 +209,6 @@ class DabDetrConfig(PreTrainedConfig):
         self.init_xavier_std = init_xavier_std
         self.num_hidden_layers = encoder_layers
         self.auxiliary_loss = auxiliary_loss
-        self.backbone = backbone
-        self.use_pretrained_backbone = use_pretrained_backbone
-        self.backbone_kwargs = backbone_kwargs
         # Hungarian matcher
         self.class_cost = class_cost
         self.bbox_cost = bbox_cost
@@ -254,6 +227,7 @@ class DabDetrConfig(PreTrainedConfig):
         self.temperature_height = temperature_height
         self.sine_position_embedding_scale = sine_position_embedding_scale
         self.initializer_bias_prior_prob = initializer_bias_prior_prob
+        self.tie_word_embeddings = tie_word_embeddings
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)

transformers/models/dab_detr/modeling_dab_detr.py CHANGED Viewed

@@ -21,6 +21,7 @@ from torch import Tensor, nn
 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import load_backbone
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
@@ -30,7 +31,6 @@ from ...utils import (
     auto_docstring,
     logging,
 )
-from ...utils.backbone_utils import load_backbone
 from .configuration_dab_detr import DabDetrConfig
@@ -231,7 +231,7 @@ class DabDetrConvEncoder(nn.Module):
         return out
-# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->DabDetr
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->DabDetr
 class DabDetrConvModel(nn.Module):
     """
     This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
@@ -1239,10 +1239,12 @@ class DabDetrModel(DabDetrPreTrainedModel):
         ```python
         >>> from transformers import AutoImageProcessor, AutoModel
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> image_processor = AutoImageProcessor.from_pretrained("IDEA-Research/dab_detr-base")
         >>> model = AutoModel.from_pretrained("IDEA-Research/dab_detr-base")
@@ -1393,7 +1395,7 @@ class DabDetrModel(DabDetrPreTrainedModel):
         )
-# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->DabDetr
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->DabDetr
 class DabDetrMHAttentionMap(nn.Module):
     """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
@@ -1492,10 +1494,12 @@ class DabDetrForObjectDetection(DabDetrPreTrainedModel):
         ```python
         >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> image_processor = AutoImageProcessor.from_pretrained("IDEA-Research/dab-detr-resnet-50")
         >>> model = AutoModelForObjectDetection.from_pretrained("IDEA-Research/dab-detr-resnet-50")

transformers/models/dac/modeling_dac.py CHANGED Viewed

@@ -326,7 +326,7 @@ class DacResidualVectorQuantizer(nn.Module):
             )
             # Create mask to apply quantizer dropout
-            mask = torch.full((hidden_state.shape[0],), fill_value=i, device=hidden_state.device) < n_quantizers
+            mask = torch.full((hidden_state.shape[0],), i, device=hidden_state.device, dtype=torch.long) < n_quantizers
             quantized_representation = quantized_representation + quantized_representation_i * mask[:, None, None]
             residual = residual - quantized_representation_i

transformers/models/data2vec/configuration_data2vec_audio.py CHANGED Viewed

@@ -213,7 +213,10 @@ class Data2VecAudioConfig(PreTrainedConfig):
         output_hidden_size=None,
         **kwargs,
     ):
-        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.hidden_size = hidden_size
         self.feat_extract_activation = feat_extract_activation
         self.conv_dim = list(conv_dim)

transformers/models/data2vec/configuration_data2vec_text.py CHANGED Viewed

@@ -103,10 +103,19 @@ class Data2VecTextConfig(PreTrainedConfig):
         eos_token_id=2,
         use_cache=True,
         classifier_dropout=None,
+        is_decoder=False,
+        add_cross_attention=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        self.is_decoder = is_decoder
+        self.add_cross_attention = add_cross_attention
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers

transformers/models/data2vec/modeling_data2vec_audio.py CHANGED Viewed

@@ -265,9 +265,9 @@ class Data2VecAudioAttention(nn.Module):
         key_states = self.k_proj(current_states).view(*kv_input_shape).transpose(1, 2)
         value_states = self.v_proj(current_states).view(*kv_input_shape).transpose(1, 2)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers/models/data2vec/modeling_data2vec_text.py CHANGED Viewed

@@ -241,9 +241,9 @@ class Data2VecTextSelfAttention(nn.Module):
                 {"cache_position": cache_position},
             )
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -318,9 +318,9 @@ class Data2VecTextCrossAttention(nn.Module):
                 # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                 past_key_values.is_updated[self.layer_idx] = True
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,

transformers/models/data2vec/modeling_data2vec_vision.py CHANGED Viewed

@@ -1186,10 +1186,12 @@ class Data2VecVisionForSemanticSegmentation(Data2VecVisionPreTrainedModel):
         ```python
         >>> from transformers import AutoImageProcessor, Data2VecVisionForSemanticSegmentation
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> image_processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
         >>> model = Data2VecVisionForSemanticSegmentation.from_pretrained("facebook/data2vec-vision-base")

transformers/models/dbrx/configuration_dbrx.py CHANGED Viewed

@@ -123,7 +123,8 @@ class DbrxConfig(PreTrainedConfig):
     This is the configuration class to store the configuration of a [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
     specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a different configuration to that of the [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct) architecture.
+    defaults will yield a different configuration to that of the [transformers-community/dbrx-instruct](https://huggingface.co/transformers-community/dbrx-instruct) architecture.
+    Note: this link points to a re-upload; the original repository is closed.
     Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PreTrainedConfig`] for more information.
@@ -197,6 +198,10 @@ class DbrxConfig(PreTrainedConfig):
         initializer_range: float | None = 0.02,
         output_router_logits: bool | None = False,
         rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
+        pad_token_id: int | None = None,
+        bos_token_id: int | None = None,
+        eos_token_id: int | None = None,
+        tie_word_embeddings: bool | None = False,
         **kwargs: Any,
     ):
         if attn_config is None:
@@ -224,13 +229,16 @@ class DbrxConfig(PreTrainedConfig):
         self.initializer_range = initializer_range
         self.output_router_logits = output_router_logits
         self.num_key_value_heads = self.attn_config.kv_n_heads
-        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
         if tie_word_embeddings:
             raise ValueError("tie_word_embeddings is not supported for DBRX models.")
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         self.rope_parameters = rope_parameters
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+        super().__init__(**kwargs)
 __all__ = ["DbrxConfig"]

transformers/models/dbrx/modeling_dbrx.py CHANGED Viewed

@@ -244,9 +244,9 @@ class DbrxAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -654,7 +654,7 @@ def load_balancing_loss_func(
 class DbrxForCausalLM(DbrxPreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "transformer.wte.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     def __init__(self, config: DbrxConfig):
@@ -712,8 +712,8 @@ class DbrxForCausalLM(DbrxPreTrainedModel, GenerationMixin):
         ```python
         >> from transformers import AutoTokenizer, DbrxForCausalLM
-        >> model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct")
-        >> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct")
+        >> model = DbrxForCausalLM.from_pretrained("transformers-community/dbrx-instruct")
+        >> tokenizer = AutoTokenizer.from_pretrained("transformers-community/dbrx-instruct")
         >> prompt = "Hey, are you conscious? Can you talk to me?"
         >> inputs = tokenizer(prompt, return_tensors="pt")

transformers/models/dbrx/modular_dbrx.py CHANGED Viewed

@@ -113,9 +113,9 @@ class DbrxAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -441,7 +441,7 @@ class DbrxModel(DbrxPreTrainedModel):
 class DbrxForCausalLM(DbrxPreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "transformer.wte.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     def __init__(self, config: DbrxConfig):
@@ -499,8 +499,8 @@ class DbrxForCausalLM(DbrxPreTrainedModel, GenerationMixin):
         ```python
         >> from transformers import AutoTokenizer, DbrxForCausalLM
-        >> model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct")
-        >> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct")
+        >> model = DbrxForCausalLM.from_pretrained("transformers-community/dbrx-instruct")
+        >> tokenizer = AutoTokenizer.from_pretrained("transformers-community/dbrx-instruct")
         >> prompt = "Hey, are you conscious? Can you talk to me?"
         >> inputs = tokenizer(prompt, return_tensors="pt")

transformers/models/deberta/configuration_deberta.py CHANGED Viewed

@@ -111,11 +111,14 @@ class DebertaConfig(PreTrainedConfig):
         relative_attention=False,
         max_relative_positions=-1,
         pad_token_id=0,
+        bos_token_id=None,
+        eos_token_id=None,
         position_biased_input=True,
         pos_att_type=None,
         pooler_dropout=0,
         pooler_hidden_act="gelu",
         legacy=True,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -133,6 +136,9 @@ class DebertaConfig(PreTrainedConfig):
         self.relative_attention = relative_attention
         self.max_relative_positions = max_relative_positions
         self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         self.position_biased_input = position_biased_input
         # Backwards compatibility

transformers/models/deberta_v2/configuration_deberta_v2.py CHANGED Viewed

@@ -111,11 +111,14 @@ class DebertaV2Config(PreTrainedConfig):
         relative_attention=False,
         max_relative_positions=-1,
         pad_token_id=0,
+        bos_token_id=None,
+        eos_token_id=None,
         position_biased_input=True,
         pos_att_type=None,
         pooler_dropout=0,
         pooler_hidden_act="gelu",
         legacy=True,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -133,6 +136,9 @@ class DebertaV2Config(PreTrainedConfig):
         self.relative_attention = relative_attention
         self.max_relative_positions = max_relative_positions
         self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         self.position_biased_input = position_biased_input
         # Backwards compatibility

transformers/models/decision_transformer/configuration_decision_transformer.py CHANGED Viewed

@@ -124,8 +124,10 @@ class DecisionTransformerConfig(PreTrainedConfig):
         eos_token_id=50256,
         scale_attn_by_inverse_layer_idx=False,
         reorder_and_upcast_attn=False,
+        add_cross_attention=False,
         **kwargs,
     ):
+        self.add_cross_attention = add_cross_attention
         self.state_dim = state_dim
         self.act_dim = act_dim
         self.hidden_size = hidden_size
@@ -150,7 +152,7 @@ class DecisionTransformerConfig(PreTrainedConfig):
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
-        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(**kwargs)
 __all__ = ["DecisionTransformerConfig"]

transformers/models/decision_transformer/modeling_decision_transformer.py CHANGED Viewed

@@ -234,9 +234,9 @@ class DecisionTransformerGPT2Attention(nn.Module):
                 past_key_values.is_updated[self.layer_idx] = True
         using_eager = self.config._attn_implementation == "eager"
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         if using_eager and self.reorder_and_upcast_attn:
             attn_output, attn_weights = self._upcast_and_reordered_attn(

transformers/models/deepseek_v2/configuration_deepseek_v2.py CHANGED Viewed

@@ -127,9 +127,8 @@ class DeepseekV2Config(PreTrainedConfig):
         "layers.*.self_attn.q_b_proj": "colwise",
         "layers.*.self_attn.kv_b_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.experts.gate_up_proj": "local_colwise",
-        "layers.*.mlp.experts.down_proj": "local_rowwise",
-        "layers.*.mlp.experts": "gather",
+        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
+        "layers.*.mlp.experts.down_proj": "rowwise",
     }
     base_model_pp_plan = {
         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
@@ -213,13 +212,11 @@ class DeepseekV2Config(PreTrainedConfig):
         self.head_dim = qk_rope_head_dim
         self.rope_parameters = rope_parameters
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(**kwargs)
 __all__ = ["DeepseekV2Config"]

transformers/models/deepseek_v2/modeling_deepseek_v2.py CHANGED Viewed

@@ -37,7 +37,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
-from ...utils.generic import check_model_inputs, maybe_autocast
+from ...utils.generic import check_model_inputs, is_flash_attention_requested, maybe_autocast
 from .configuration_deepseek_v2 import DeepseekV2Config
@@ -116,7 +116,6 @@ class DeepseekV2Moe(nn.Module):
             topk_weight, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
         topk_weight = topk_weight * self.routed_scaling_factor
-        topk_weight = torch.zeros_like(router_logits).scatter_(1, topk_idx, topk_weight)
         return topk_idx, topk_weight
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -372,12 +371,12 @@ class DeepseekV2Attention(nn.Module):
             cache_kwargs = {"cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+        if is_flash_attention_requested(self.config) and self.qk_head_dim != self.v_head_dim:
             value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -390,7 +389,7 @@ class DeepseekV2Attention(nn.Module):
             **kwargs,
         )
-        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+        if is_flash_attention_requested(self.config) and self.qk_head_dim != self.v_head_dim:
             attn_output = attn_output[:, :, :, : self.v_head_dim]
         attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
@@ -553,7 +552,7 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel):
 @auto_docstring
 class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
     def __init__(self, config):

transformers/models/deepseek_v2/modular_deepseek_v2.py CHANGED Viewed

@@ -23,7 +23,7 @@ from ...cache_utils import Cache
 from ...modeling_rope_utils import RopeParameters, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...utils import is_grouped_mm_available, logging
-from ...utils.generic import maybe_autocast
+from ...utils.generic import is_flash_attention_requested, maybe_autocast
 from ..llama.configuration_llama import LlamaConfig
 from ..llama.modeling_llama import (
     LlamaDecoderLayer,
@@ -144,9 +144,8 @@ class DeepseekV2Config(LlamaConfig):
         "layers.*.self_attn.q_b_proj": "colwise",
         "layers.*.self_attn.kv_b_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.experts.gate_up_proj": "local_colwise",
-        "layers.*.mlp.experts.down_proj": "local_rowwise",
-        "layers.*.mlp.experts": "gather",
+        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
+        "layers.*.mlp.experts.down_proj": "rowwise",
     }
     model_type = "deepseek_v2"
@@ -269,7 +268,6 @@ class DeepseekV2Moe(nn.Module):
             topk_weight, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
         topk_weight = topk_weight * self.routed_scaling_factor
-        topk_weight = torch.zeros_like(router_logits).scatter_(1, topk_idx, topk_weight)
         return topk_idx, topk_weight
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -397,12 +395,12 @@ class DeepseekV2Attention(nn.Module):
             cache_kwargs = {"cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+        if is_flash_attention_requested(self.config) and self.qk_head_dim != self.v_head_dim:
             value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
         attn_output, attn_weights = attention_interface(
             self,
@@ -415,7 +413,7 @@ class DeepseekV2Attention(nn.Module):
             **kwargs,
         )
-        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+        if is_flash_attention_requested(self.config) and self.qk_head_dim != self.v_head_dim:
             attn_output = attn_output[:, :, :, : self.v_head_dim]
         attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3py3-none-any.whl → 5.1.0py3-none-any.whl