magic-pdf 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. magic_pdf/data/batch_build_dataset.py +156 -0
  2. magic_pdf/data/dataset.py +56 -25
  3. magic_pdf/data/utils.py +108 -9
  4. magic_pdf/dict2md/ocr_mkcontent.py +4 -3
  5. magic_pdf/libs/pdf_image_tools.py +11 -6
  6. magic_pdf/libs/performance_stats.py +12 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/batch_analyze.py +175 -201
  9. magic_pdf/model/doc_analyze_by_custom_model.py +142 -92
  10. magic_pdf/model/pdf_extract_kit.py +5 -38
  11. magic_pdf/model/sub_modules/language_detection/utils.py +2 -4
  12. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +24 -19
  13. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +3 -1
  14. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +3 -1
  15. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +31 -102
  16. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py +13 -0
  17. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +189 -0
  18. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py +8 -0
  19. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py +163 -0
  20. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py +2351 -0
  21. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py +9 -0
  22. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py +132 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py +132 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py +1084 -0
  25. magic_pdf/model/sub_modules/model_init.py +50 -37
  26. magic_pdf/model/sub_modules/model_utils.py +18 -12
  27. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/ocr/{paddleocr → paddleocr2pytorch}/ocr_utils.py +102 -97
  29. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +193 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py +39 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py +8 -0
  32. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py +48 -0
  33. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py +418 -0
  34. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py +25 -0
  35. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py +105 -0
  36. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +62 -0
  37. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py +269 -0
  38. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py +290 -0
  39. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py +516 -0
  40. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +136 -0
  41. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py +234 -0
  42. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py +638 -0
  43. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py +76 -0
  44. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py +43 -0
  45. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py +23 -0
  46. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py +109 -0
  47. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py +54 -0
  48. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py +58 -0
  49. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py +29 -0
  50. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py +456 -0
  51. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py +117 -0
  52. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +228 -0
  53. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py +33 -0
  54. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py +20 -0
  55. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py +179 -0
  56. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py +690 -0
  57. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py +0 -0
  58. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +383 -0
  59. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt +162 -0
  60. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +8421 -0
  61. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt +163 -0
  62. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt +167 -0
  63. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt +95 -0
  64. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt +4399 -0
  65. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt +153 -0
  66. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt +3688 -0
  67. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt +185 -0
  68. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +6623 -0
  69. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt +128 -0
  70. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt +151 -0
  71. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +49 -0
  72. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py +1 -0
  73. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py +1 -0
  74. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py +106 -0
  75. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py +217 -0
  76. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +440 -0
  77. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +104 -0
  78. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py +227 -0
  79. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +15 -19
  80. magic_pdf/pdf_parse_union_core_v2.py +112 -74
  81. magic_pdf/pre_proc/ocr_dict_merge.py +9 -1
  82. magic_pdf/pre_proc/ocr_span_list_modify.py +51 -0
  83. magic_pdf/resources/model_config/model_configs.yaml +1 -1
  84. magic_pdf/resources/slanet_plus/slanet-plus.onnx +0 -0
  85. magic_pdf/tools/cli.py +30 -12
  86. magic_pdf/tools/common.py +90 -12
  87. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/METADATA +92 -59
  88. magic_pdf-1.3.1.dist-info/RECORD +203 -0
  89. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/WHEEL +1 -1
  90. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +0 -204
  91. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +0 -213
  92. magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py +0 -37
  93. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +0 -71
  94. magic_pdf/resources/model_config/UniMERNet/demo.yaml +0 -46
  95. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +0 -351
  96. magic_pdf-1.2.2.dist-info/RECORD +0 -147
  97. /magic_pdf/model/sub_modules/{ocr/paddleocr/__init__.py → mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py} +0 -0
  98. /magic_pdf/model/sub_modules/{table/structeqtable → ocr/paddleocr2pytorch/pytorchocr}/__init__.py +0 -0
  99. /magic_pdf/model/sub_modules/{table/tablemaster → ocr/paddleocr2pytorch/pytorchocr/modeling}/__init__.py +0 -0
  100. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/LICENSE.md +0 -0
  101. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/entry_points.txt +0 -0
  102. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/top_level.txt +0 -0
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py
@@ -0,0 +1,2351 @@
1
+ # coding=utf-8
2
+ # Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch UnimerMBART model."""
16
+
17
+ import copy
18
+ import math
19
+ from dataclasses import dataclass
20
+ from typing import List, Optional, Tuple, Union
21
+
22
+ import torch
23
+ import torch.nn.functional as F
24
+ import torch.utils.checkpoint
25
+ from torch import nn
26
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
27
+
28
+ from transformers.activations import ACT2FN
29
+ from transformers.modeling_attn_mask_utils import (
30
+ _prepare_4d_attention_mask,
31
+ _prepare_4d_attention_mask_for_sdpa,
32
+ _prepare_4d_causal_attention_mask,
33
+ _prepare_4d_causal_attention_mask_for_sdpa,
34
+ )
35
+ from transformers.modeling_outputs import (
36
+ BaseModelOutput,
37
+ BaseModelOutputWithPastAndCrossAttentions,
38
+ CausalLMOutputWithCrossAttentions,
39
+ Seq2SeqLMOutput,
40
+ Seq2SeqModelOutput,
41
+ Seq2SeqQuestionAnsweringModelOutput,
42
+ Seq2SeqSequenceClassifierOutput,
43
+ )
44
+ from transformers import GenerationMixin, PreTrainedModel
45
+ from transformers.utils import (
46
+ add_code_sample_docstrings,
47
+ add_end_docstrings,
48
+ add_start_docstrings,
49
+ add_start_docstrings_to_model_forward,
50
+ is_flash_attn_2_available,
51
+ is_flash_attn_greater_or_equal_2_10,
52
+ logging,
53
+ replace_return_docstrings,
54
+ )
55
+ from .configuration_unimer_mbart import UnimerMBartConfig
56
+
57
+
58
+ if is_flash_attn_2_available():
59
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
60
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
61
+
62
+
63
+ logger = logging.get_logger(__name__)
64
+
65
+ _CHECKPOINT_FOR_DOC = "facebook/mbart-large-cc25"
66
+ _CONFIG_FOR_DOC = "MBartConfig"
67
+
68
+ # Base model docstring
69
+ _EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
70
+
71
+
72
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
73
+ def _get_unpad_data(attention_mask):
74
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
75
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
76
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
77
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
78
+ return (
79
+ indices,
80
+ cu_seqlens,
81
+ max_seqlen_in_batch,
82
+ )
83
+
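To make the unpadding bookkeeping concrete, here is a small worked example for the `_get_unpad_data` helper above; the mask values are made up for illustration.

```python
import torch

# Two sequences with true lengths 3 and 2 (illustrative values only).
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])
# _get_unpad_data(attention_mask) returns:
#   indices             -> tensor([0, 1, 2, 4, 5])               # flat positions of non-pad tokens
#   cu_seqlens          -> tensor([0, 3, 5], dtype=torch.int32)  # 0-prefixed cumulative lengths
#   max_seqlen_in_batch -> 3
```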
84
+
85
+ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):
86
+ """
87
+ Shift input ids one token to the right, and wrap the last non-pad token (the <LID> token). Note that MBart does not
88
+ have a single `decoder_start_token_id` in contrast to other Bart-like models.
89
+ """
90
+ prev_output_tokens = input_ids.clone()
91
+
92
+ if pad_token_id is None:
93
+ raise ValueError("self.model.config.pad_token_id has to be defined.")
94
+ # replace possible -100 values in labels by `pad_token_id`
95
+ prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)
96
+
97
+ index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
98
+ decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()
99
+ prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()
100
+ prev_output_tokens[:, 0] = decoder_start_tokens
101
+
102
+ return prev_output_tokens
103
+
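A small worked example of the shift described in the docstring above; the token ids are hypothetical (2 stands in for the `</s>`/`<LID>` token, 1 is padding).

```python
import torch

input_ids = torch.tensor([[5, 6, 7, 2, 1],
                          [8, 9, 2, 1, 1]])
# shift_tokens_right(input_ids, pad_token_id=1) returns:
# tensor([[2, 5, 6, 7, 2],
#         [2, 8, 9, 2, 1]])
# Each row is shifted right by one position and its last non-pad token is wrapped
# around to position 0, where it serves as the decoder start token.
```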
104
+ @dataclass
105
+ class CausalLMOutputWithCrossAttentionsAndCounting(CausalLMOutputWithCrossAttentions):
106
+ """
107
+ Base class for causal language model (or autoregressive) outputs, extended with an additional `counting` output.
108
+
109
+ Args:
110
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
111
+ Language modeling loss (for next-token prediction).
112
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
113
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
114
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
115
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
116
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
117
+
118
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
119
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
120
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
121
+ sequence_length)`.
122
+
123
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
124
+ heads.
125
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
126
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
127
+ sequence_length)`.
128
+
129
+ Cross-attention weights after the attention softmax, used to compute the weighted average in the
130
+ cross-attention heads.
131
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
132
+ Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
133
+ value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
134
+ setting. Only relevant if `config.is_decoder = True`.
135
+
136
+ Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
137
+ `past_key_values` input) to speed up sequential decoding.
138
+ counting (`torch.FloatTensor`, *optional*):
139
+ Output of the auxiliary token-counting head, when the model returns one.
140
+ """
141
+ counting: Optional[torch.FloatTensor] = None
142
+
143
+ # Copied from transformers.models.bart.modeling_bart.BartLearnedPositionalEmbedding with Bart->MBart
144
+ class UnimerMBartLearnedPositionalEmbedding(nn.Embedding):
145
+ """
146
+ This module learns positional embeddings up to a fixed maximum size.
147
+ """
148
+
149
+ def __init__(self, num_embeddings: int, embedding_dim: int):
150
+ # MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
151
+ # and adjust num_embeddings appropriately. Other models don't have this hack
152
+ self.offset = 2
153
+ super().__init__(num_embeddings + self.offset, embedding_dim)
154
+
155
+ def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
156
+ """`input_ids` shape is expected to be [bsz x seqlen]."""
157
+
158
+ bsz, seq_len = input_ids.shape[:2]
159
+ positions = torch.arange(
160
+ past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
161
+ ).expand(bsz, -1)
162
+
163
+ return super().forward(positions + self.offset)
164
+
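As a quick sanity check of the `+2` offset and the `past_key_values_length` handling, a minimal sketch using the class defined above (the sizes are arbitrary):

```python
import torch

emb = UnimerMBartLearnedPositionalEmbedding(num_embeddings=1024, embedding_dim=8)
# Decoding step for a batch of 2 with 3 new tokens and 4 cached ones:
step_ids = torch.zeros((2, 3), dtype=torch.long)  # only the shape is read
out = emb(step_ids, past_key_values_length=4)     # positions 4..6 -> table rows 6..8
assert out.shape == (2, 3, 8)
```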
165
+
166
+ # Copied from transformers.models.bart.modeling_bart.BartScaledWordEmbedding with Bart->MBart
167
+ class UnimerMBartScaledWordEmbedding(nn.Embedding):
168
+ """
169
+ This module overrides nn.Embedding's forward by multiplying the output with the embedding scale.
170
+ """
171
+
172
+ def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
173
+ super().__init__(num_embeddings, embedding_dim, padding_idx)
174
+ self.embed_scale = embed_scale
175
+
176
+ def forward(self, input_ids: torch.Tensor):
177
+ return super().forward(input_ids) * self.embed_scale
178
+
179
+
180
+ # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->MBart
181
+ class UnimerMBartAttention(nn.Module):
182
+ """Multi-headed attention from 'Attention Is All You Need' paper, with qk_squeeze"""
183
+
184
+ def __init__(
185
+ self,
186
+ embed_dim: int,
187
+ num_heads: int,
188
+ dropout: float = 0.0,
189
+ is_decoder: bool = False,
190
+ bias: bool = True,
191
+ is_causal: bool = False,
192
+ *,
193
+ config: UnimerMBartConfig,
194
+ ):
195
+ super().__init__()
196
+ self.embed_dim = embed_dim
197
+ self.num_heads = num_heads
198
+ self.dropout = dropout
199
+ self.head_dim = embed_dim // num_heads
200
+ self.config = config
201
+
202
+ if (self.head_dim * num_heads) != self.embed_dim:
203
+ raise ValueError(
204
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
205
+ f" and `num_heads`: {num_heads})."
206
+ )
207
+
208
+ self.squeeze_dim = embed_dim // config.qk_squeeze
209
+ self.squeeze_head_dim = self.squeeze_dim // num_heads
210
+ self.scaling = self.squeeze_head_dim**-0.5
211
+ self.is_decoder = is_decoder
212
+ self.is_causal = is_causal
213
+
214
+ self.q_proj = nn.Linear(embed_dim, self.squeeze_dim, bias=bias)
215
+ self.k_proj = nn.Linear(embed_dim, self.squeeze_dim, bias=bias)
216
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
217
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
218
+
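The `qk_squeeze` factor only narrows the query/key projections; a quick arithmetic sketch with assumed sizes (the real values come from `UnimerMBartConfig`):

```python
# Assumed example sizes: embed_dim = 1024, num_heads = 16, config.qk_squeeze = 2
# squeeze_dim      = 1024 // 2 = 512    # output width of q_proj and k_proj
# squeeze_head_dim = 512 // 16 = 32     # per-head width used for the QK dot product
# scaling          = 32 ** -0.5         # softmax temperature uses the squeezed width
# v_proj and out_proj keep the full embed_dim (1024), so attention scores are
# computed in the reduced Q/K space while values and outputs stay at model width.
```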
219
+ def _shape_qk(self, tensor: torch.Tensor, seq_len: int, bsz: int):
220
+ return tensor.view(bsz, seq_len, self.num_heads, self.squeeze_head_dim).transpose(1, 2).contiguous()
221
+
222
+ def _shape_v(self, tensor: torch.Tensor, seq_len: int, bsz: int):
223
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
224
+
225
+ def forward(
226
+ self,
227
+ hidden_states: torch.Tensor,
228
+ key_value_states: Optional[torch.Tensor] = None,
229
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
230
+ attention_mask: Optional[torch.Tensor] = None,
231
+ layer_head_mask: Optional[torch.Tensor] = None,
232
+ output_attentions: bool = False,
233
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
234
+ """Input shape: Batch x Time x Channel"""
235
+
236
+ # if key_value_states are provided this layer is used as a cross-attention layer
237
+ # for the decoder
238
+ is_cross_attention = key_value_states is not None
239
+
240
+ bsz, tgt_len, _ = hidden_states.size()
241
+
242
+ # get query proj
243
+ query_states = self.q_proj(hidden_states) * self.scaling
244
+ # get key, value proj
245
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
246
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
247
+ # the provided `key_value_states` to support prefix tuning
248
+ if (
249
+ is_cross_attention
250
+ and past_key_value is not None
251
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
252
+ ):
253
+ # reuse k,v, cross_attentions
254
+ key_states = past_key_value[0]
255
+ value_states = past_key_value[1]
256
+ elif is_cross_attention:
257
+ # cross_attentions
258
+ key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz)
259
+ value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz)
260
+ elif past_key_value is not None:
261
+ # reuse k, v, self_attention
262
+ key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
263
+ value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
264
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
265
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
266
+ else:
267
+ # self_attention
268
+ key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
269
+ value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
270
+
271
+ if self.is_decoder:
272
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
273
+ # Further calls to cross_attention layer can then reuse all cross-attention
274
+ # key/value_states (first "if" case)
275
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
276
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
277
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
278
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
279
+ past_key_value = (key_states, value_states)
280
+
281
+ proj_shape = (bsz * self.num_heads, -1, self.squeeze_head_dim)
282
+ value_shape = (bsz * self.num_heads, -1, self.head_dim)
283
+ query_states = self._shape_qk(query_states, tgt_len, bsz).view(*proj_shape)
284
+ key_states = key_states.reshape(*proj_shape)
285
+ value_states = value_states.reshape(*value_shape)
286
+
287
+ src_len = key_states.size(1)
288
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
289
+
290
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
291
+ raise ValueError(
292
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
293
+ f" {attn_weights.size()}"
294
+ )
295
+
296
+ if attention_mask is not None:
297
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
298
+ raise ValueError(
299
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
300
+ )
301
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
302
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
303
+
304
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
305
+
306
+ if layer_head_mask is not None:
307
+ if layer_head_mask.size() != (self.num_heads,):
308
+ raise ValueError(
309
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
310
+ f" {layer_head_mask.size()}"
311
+ )
312
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
313
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
314
+
315
+ if output_attentions:
316
+ # this operation is a bit awkward, but it's required to
317
+ # make sure that attn_weights keeps its gradient.
318
+ # In order to do so, attn_weights have to be reshaped
319
+ # twice and have to be reused in the following
320
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
321
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
322
+ else:
323
+ attn_weights_reshaped = None
324
+
325
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
326
+ attn_output = torch.bmm(attn_probs, value_states)
327
+
328
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
329
+ raise ValueError(
330
+ f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
331
+ f" {attn_output.size()}"
332
+ )
333
+
334
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
335
+ attn_output = attn_output.transpose(1, 2)
336
+
337
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
338
+ # partitioned across GPUs when using tensor-parallelism.
339
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
340
+
341
+ attn_output = self.out_proj(attn_output)
342
+
343
+ return attn_output, attn_weights_reshaped, past_key_value
344
+
345
+
346
+ # Copied from transformers.models.bart.modeling_bart.BartFlashAttention2 with Bart->MBart
347
+ class UnimerMBartFlashAttention2(UnimerMBartAttention):
348
+ """
349
+ UnimerMBart flash attention module. This module inherits from `UnimerMBartAttention`, as the weights of the module stay
350
+ untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
351
+ flash attention and deal with padding tokens in case the input contains any of them.
352
+ """
353
+
354
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
355
+ def __init__(self, *args, **kwargs):
356
+ super().__init__(*args, **kwargs)
357
+
358
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
359
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
360
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
361
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
362
+
363
+ # def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
364
+ # return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
365
+
366
+ def _shape_qk(self, tensor: torch.Tensor, seq_len: int, bsz: int):
367
+ return tensor.view(bsz, seq_len, self.num_heads, self.squeeze_head_dim)
368
+
369
+ def _shape_v(self, tensor: torch.Tensor, seq_len: int, bsz: int):
370
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
371
+
372
+ def forward(
373
+ self,
374
+ hidden_states: torch.Tensor,
375
+ key_value_states: Optional[torch.Tensor] = None,
376
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
377
+ attention_mask: Optional[torch.Tensor] = None,
378
+ layer_head_mask: Optional[torch.Tensor] = None,
379
+ output_attentions: bool = False,
380
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
381
+ # UnimerMBartFlashAttention2 does not support output_attentions
382
+ if output_attentions:
383
+ raise ValueError("MBartFlashAttention2 attention does not support output_attentions")
384
+
385
+ # if key_value_states are provided this layer is used as a cross-attention layer
386
+ # for the decoder
387
+ is_cross_attention = key_value_states is not None
388
+
389
+ bsz, q_len, _ = hidden_states.size()
390
+
391
+ # get query proj
392
+ query_states = self._shape_qk(self.q_proj(hidden_states), -1, bsz)
393
+
394
+ # get key, value proj
395
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
396
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
397
+ # the provided `key_value_states` to support prefix tuning
398
+ if (
399
+ is_cross_attention
400
+ and past_key_value is not None
401
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
402
+ ):
403
+ # reuse k,v, cross_attentions
404
+ key_states = past_key_value[0].transpose(1, 2)
405
+ value_states = past_key_value[1].transpose(1, 2)
406
+ elif is_cross_attention:
407
+ # cross_attentions
408
+ key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz)
409
+ value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz)
410
+ elif past_key_value is not None:
411
+ # reuse k, v, self_attention
412
+ key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
413
+ value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
414
+ key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
415
+ value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
416
+ else:
417
+ # self_attention
418
+ key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
419
+ value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
420
+
421
+ if self.is_decoder:
422
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
423
+ # Further calls to cross_attention layer can then reuse all cross-attention
424
+ # key/value_states (first "if" case)
425
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
426
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
427
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
428
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
429
+ past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
430
+
431
+ kv_seq_len = key_states.shape[-2]
432
+ if past_key_value is not None:
433
+ kv_seq_len += past_key_value[0].shape[-2]
434
+
435
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
436
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
437
+ # cast them back in the correct dtype just to be sure everything works as expected.
438
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
439
+ # in fp32. (LlamaRMSNorm handles it correctly)
440
+
441
+ input_dtype = query_states.dtype
442
+ if input_dtype == torch.float32:
443
+ if torch.is_autocast_enabled():
444
+ target_dtype = torch.get_autocast_gpu_dtype()
445
+ # Handle the case where the model is quantized
446
+ elif hasattr(self.config, "_pre_quantization_dtype"):
447
+ target_dtype = self.config._pre_quantization_dtype
448
+ else:
449
+ target_dtype = self.q_proj.weight.dtype
450
+
451
+ logger.warning_once(
452
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
453
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
454
+ f" {target_dtype}."
455
+ )
456
+
457
+ query_states = query_states.to(target_dtype)
458
+ key_states = key_states.to(target_dtype)
459
+ value_states = value_states.to(target_dtype)
460
+
461
+ attn_output = self._flash_attention_forward(
462
+ query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout
463
+ )
464
+
465
+ attn_output = attn_output.reshape(bsz, q_len, -1)
466
+ attn_output = self.out_proj(attn_output)
467
+
468
+ if not output_attentions:
469
+ attn_weights = None
470
+
471
+ return attn_output, attn_weights, past_key_value
472
+
473
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
474
+ def _flash_attention_forward(
475
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
476
+ ):
477
+ """
478
+ Calls the forward method of Flash Attention. If the input hidden states contain at least one padding token,
479
+ it first unpads the input, then computes the attention scores and pads the final attention scores.
480
+
481
+ Args:
482
+ query_states (`torch.Tensor`):
483
+ Input query states to be passed to Flash Attention API
484
+ key_states (`torch.Tensor`):
485
+ Input key states to be passed to Flash Attention API
486
+ value_states (`torch.Tensor`):
487
+ Input value states to be passed to Flash Attention API
488
+ attention_mask (`torch.Tensor`):
489
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
490
+ position of padding tokens and 1 for the position of non-padding tokens.
491
+ dropout (`float`):
492
+ Attention dropout
493
+ softmax_scale (`float`, *optional*):
494
+ The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim).
495
+ """
496
+ if not self._flash_attn_uses_top_left_mask:
497
+ causal = self.is_causal
498
+ else:
499
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
500
+ causal = self.is_causal and query_length != 1
501
+
502
+ # Contains at least one padding token in the sequence
503
+ if attention_mask is not None:
504
+ batch_size = query_states.shape[0]
505
+
506
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
507
+ query_states, key_states, value_states, attention_mask, query_length
508
+ )
509
+
510
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
511
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
512
+
513
+ attn_output_unpad = flash_attn_varlen_func(
514
+ query_states,
515
+ key_states,
516
+ value_states,
517
+ cu_seqlens_q=cu_seqlens_q,
518
+ cu_seqlens_k=cu_seqlens_k,
519
+ max_seqlen_q=max_seqlen_in_batch_q,
520
+ max_seqlen_k=max_seqlen_in_batch_k,
521
+ dropout_p=dropout,
522
+ softmax_scale=softmax_scale,
523
+ causal=causal,
524
+ )
525
+
526
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
527
+ else:
528
+ attn_output = flash_attn_func(
529
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
530
+ )
531
+
532
+ return attn_output
533
+
534
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
535
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
536
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
537
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
538
+
539
+ key_layer = index_first_axis(
540
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
541
+ )
542
+ value_layer = index_first_axis(
543
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
544
+ )
545
+ if query_length == kv_seq_len:
546
+ query_layer = index_first_axis(
547
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
548
+ )
549
+ cu_seqlens_q = cu_seqlens_k
550
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
551
+ indices_q = indices_k
552
+ elif query_length == 1:
553
+ max_seqlen_in_batch_q = 1
554
+ cu_seqlens_q = torch.arange(
555
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
556
+ ) # There is a memcpy here, that is very bad.
557
+ indices_q = cu_seqlens_q[:-1]
558
+ query_layer = query_layer.squeeze(1)
559
+ else:
560
+ # The -q_len: slice assumes left padding.
561
+ attention_mask = attention_mask[:, -query_length:]
562
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
563
+
564
+ return (
565
+ query_layer,
566
+ key_layer,
567
+ value_layer,
568
+ indices_q,
569
+ (cu_seqlens_q, cu_seqlens_k),
570
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
571
+ )
572
+
573
+ class UnimerMBartSdpaAttention(UnimerMBartAttention):
574
+ def forward(
575
+ self,
576
+ hidden_states: torch.Tensor,
577
+ key_value_states: Optional[torch.Tensor] = None,
578
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
579
+ attention_mask: Optional[torch.Tensor] = None,
580
+ layer_head_mask: Optional[torch.Tensor] = None,
581
+ output_attentions: bool = False,
582
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
583
+ """Input shape: Batch x Time x Channel"""
584
+ if output_attentions or layer_head_mask is not None:
585
+ # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
586
+ logger.warning(
587
+ "BartModel is using BartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
588
+ ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
589
+ )
590
+ return super().forward(
591
+ hidden_states,
592
+ key_value_states=key_value_states,
593
+ past_key_value=past_key_value,
594
+ attention_mask=attention_mask,
595
+ layer_head_mask=layer_head_mask,
596
+ output_attentions=output_attentions,
597
+ )
598
+
599
+ # if key_value_states are provided this layer is used as a cross-attention layer
600
+ # for the decoder
601
+ is_cross_attention = key_value_states is not None
602
+
603
+ bsz, tgt_len, _ = hidden_states.size()
604
+
605
+ # get query proj
606
+ query_states = self.q_proj(hidden_states)
607
+ # get key, value proj
608
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
609
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
610
+ # the provided `key_value_states` to support prefix tuning
611
+ if (
612
+ is_cross_attention
613
+ and past_key_value is not None
614
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
615
+ ):
616
+ # reuse k,v, cross_attentions
617
+ key_states = past_key_value[0]
618
+ value_states = past_key_value[1]
619
+ elif is_cross_attention:
620
+ # cross_attentions
621
+ key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz)
622
+ value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz)
623
+ elif past_key_value is not None:
624
+ # reuse k, v, self_attention
625
+ key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
626
+ value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
627
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
628
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
629
+ else:
630
+ # self_attention
631
+ key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
632
+ value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
633
+
634
+ if self.is_decoder:
635
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
636
+ # Further calls to cross_attention layer can then reuse all cross-attention
637
+ # key/value_states (first "if" case)
638
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
639
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
640
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
641
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
642
+ past_key_value = (key_states, value_states)
643
+
644
+ query_states = self._shape_qk(query_states, tgt_len, bsz)
645
+
646
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
647
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
648
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
649
+ is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
650
+
651
+ # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
652
+ # but we are fine here as `_shape_qk`/`_shape_v` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
653
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
654
+ query_states,
655
+ key_states,
656
+ value_states,
657
+ attn_mask=attention_mask,
658
+ dropout_p=self.dropout if self.training else 0.0,
659
+ is_causal=is_causal,
660
+ )
661
+
662
+ if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
663
+ raise ValueError(
664
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
665
+ f" {attn_output.size()}"
666
+ )
667
+
668
+ attn_output = attn_output.transpose(1, 2)
669
+
670
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
671
+ # partitioned across GPUs when using tensor-parallelism.
672
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
673
+
674
+ attn_output = self.out_proj(attn_output)
675
+
676
+ return attn_output, None, past_key_value
677
+
678
+ UNIMER_MBART_ATTENTION_CLASSES = {
679
+ "eager": UnimerMBartAttention,
680
+ "flash_attention_2": UnimerMBartFlashAttention2,
681
+ "sdpa": UnimerMBartSdpaAttention,
682
+ }
683
+
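The encoder and decoder layers below pick their attention implementation from this mapping via `config._attn_implementation`; a minimal sketch of that dispatch (`"sdpa"` is just an illustrative key, and `config` is assumed to be an already-built `UnimerMBartConfig`):

```python
# Sketch only: mirrors the lookup done in UnimerMBartEncoderLayer/DecoderLayer below.
attn_cls = UNIMER_MBART_ATTENTION_CLASSES["sdpa"]  # -> UnimerMBartSdpaAttention
self_attn = attn_cls(
    embed_dim=config.d_model,
    num_heads=config.encoder_attention_heads,
    dropout=config.attention_dropout,
    config=config,
)
```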
684
+
685
+ class UnimerMBartEncoderLayer(nn.Module):
686
+ def __init__(self, config: UnimerMBartConfig):
687
+ super().__init__()
688
+ self.embed_dim = config.d_model
689
+
690
+ self.self_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation](
691
+ embed_dim=self.embed_dim,
692
+ num_heads=config.encoder_attention_heads,
693
+ dropout=config.attention_dropout,
694
+ config=config,
695
+ )
696
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
697
+ self.dropout = config.dropout
698
+ self.activation_fn = ACT2FN[config.activation_function]
699
+ self.activation_dropout = config.activation_dropout
700
+ self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
701
+ self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
702
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
703
+
704
+ def forward(
705
+ self,
706
+ hidden_states: torch.Tensor,
707
+ attention_mask: torch.Tensor,
708
+ layer_head_mask: torch.Tensor,
709
+ output_attentions: bool = False,
710
+ ) -> torch.Tensor:
711
+ """
712
+ Args:
713
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
714
+ attention_mask (`torch.FloatTensor`): attention mask of size
715
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
716
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
717
+ `(encoder_attention_heads,)`.
718
+ output_attentions (`bool`, *optional*):
719
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
720
+ returned tensors for more detail.
721
+ """
722
+ residual = hidden_states
723
+ hidden_states = self.self_attn_layer_norm(hidden_states)
724
+ hidden_states, attn_weights, _ = self.self_attn(
725
+ hidden_states=hidden_states,
726
+ attention_mask=attention_mask,
727
+ layer_head_mask=layer_head_mask,
728
+ output_attentions=output_attentions,
729
+ )
730
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
731
+ hidden_states = residual + hidden_states
732
+
733
+ residual = hidden_states
734
+ hidden_states = self.final_layer_norm(hidden_states)
735
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
736
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
737
+ hidden_states = self.fc2(hidden_states)
738
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
739
+ hidden_states = residual + hidden_states
740
+
741
+ if hidden_states.dtype == torch.float16 and (
742
+ torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
743
+ ):
744
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
745
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
746
+
747
+ outputs = (hidden_states,)
748
+
749
+ if output_attentions:
750
+ outputs += (attn_weights,)
751
+
752
+ return outputs
753
+
754
+
755
+ class UnimerMBartDecoderLayer(nn.Module):
756
+ def __init__(self, config: UnimerMBartConfig):
757
+ super().__init__()
758
+ self.embed_dim = config.d_model
759
+
760
+ self.self_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation](
761
+ embed_dim=self.embed_dim,
762
+ num_heads=config.decoder_attention_heads,
763
+ dropout=config.attention_dropout,
764
+ is_decoder=True,
765
+ is_causal=True,
766
+ config=config,
767
+ )
768
+ self.dropout = config.dropout
769
+ self.activation_fn = ACT2FN[config.activation_function]
770
+ self.activation_dropout = config.activation_dropout
771
+
772
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
773
+ self.encoder_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation](
774
+ self.embed_dim,
775
+ config.decoder_attention_heads,
776
+ dropout=config.attention_dropout,
777
+ is_decoder=True,
778
+ config=config,
779
+ )
780
+ self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
781
+ self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
782
+ self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
783
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
784
+
785
+ def forward(
786
+ self,
787
+ hidden_states: torch.Tensor,
788
+ attention_mask: Optional[torch.Tensor] = None,
789
+ encoder_hidden_states: Optional[torch.Tensor] = None,
790
+ encoder_attention_mask: Optional[torch.Tensor] = None,
791
+ layer_head_mask: Optional[torch.Tensor] = None,
792
+ cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
793
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
794
+ output_attentions: Optional[bool] = False,
795
+ use_cache: Optional[bool] = True,
796
+ ) -> torch.Tensor:
797
+ """
798
+ Args:
799
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
800
+ attention_mask (`torch.FloatTensor`): attention mask of size
801
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
802
+ encoder_hidden_states (`torch.FloatTensor`):
803
+ cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
804
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
805
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
806
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
807
+ `(encoder_attention_heads,)`.
808
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
809
+ size `(decoder_attention_heads,)`.
810
+ past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
811
+ output_attentions (`bool`, *optional*):
812
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
813
+ returned tensors for more detail.
814
+ """
815
+ residual = hidden_states
816
+ hidden_states = self.self_attn_layer_norm(hidden_states)
817
+
818
+ # Self Attention
819
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
820
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
821
+ # add present self-attn cache to positions 1,2 of present_key_value tuple
822
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
823
+ hidden_states=hidden_states,
824
+ past_key_value=self_attn_past_key_value,
825
+ attention_mask=attention_mask,
826
+ layer_head_mask=layer_head_mask,
827
+ output_attentions=output_attentions,
828
+ )
829
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
830
+ hidden_states = residual + hidden_states
831
+
832
+ # Cross-Attention Block
833
+ cross_attn_present_key_value = None
834
+ cross_attn_weights = None
835
+ if encoder_hidden_states is not None:
836
+ residual = hidden_states
837
+ hidden_states = self.encoder_attn_layer_norm(hidden_states)
838
+
839
+ # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
840
+ cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
841
+ hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
842
+ hidden_states=hidden_states,
843
+ key_value_states=encoder_hidden_states,
844
+ attention_mask=encoder_attention_mask,
845
+ layer_head_mask=cross_attn_layer_head_mask,
846
+ past_key_value=cross_attn_past_key_value,
847
+ output_attentions=output_attentions,
848
+ )
849
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
850
+ hidden_states = residual + hidden_states
851
+
852
+ # add cross-attn to positions 3,4 of present_key_value tuple
853
+ present_key_value = present_key_value + cross_attn_present_key_value
854
+
855
+ # Fully Connected
856
+ residual = hidden_states
857
+ hidden_states = self.final_layer_norm(hidden_states)
858
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
859
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
860
+ hidden_states = self.fc2(hidden_states)
861
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
862
+ hidden_states = residual + hidden_states
863
+
864
+ outputs = (hidden_states,)
865
+
866
+ if output_attentions:
867
+ outputs += (self_attn_weights, cross_attn_weights)
868
+
869
+ if use_cache:
870
+ outputs += (present_key_value,)
871
+
872
+ return outputs
873
+
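For reference, a shape sketch of the per-layer cache tuple assembled above, assuming the eager/SDPA layout (the symbolic sizes are illustrative):

```python
# present_key_value = (self_k, self_v, cross_k, cross_v)
#   self_k  : (bsz, num_heads, generated_len, squeeze_head_dim)  # squeezed K width
#   self_v  : (bsz, num_heads, generated_len, head_dim)
#   cross_k : (bsz, num_heads, encoder_len,   squeeze_head_dim)
#   cross_v : (bsz, num_heads, encoder_len,   head_dim)
# On the next decoding step, past_key_value[:2] feeds self-attention and
# past_key_value[-2:] feeds cross-attention, as sliced at the top of forward().
```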
874
+
875
+ # Copied from transformers.models.bart.modeling_bart.BartClassificationHead with Bart->MBart
876
+ class UnimerMBartClassificationHead(nn.Module):
877
+ """Head for sentence-level classification tasks."""
878
+
879
+ def __init__(
880
+ self,
881
+ input_dim: int,
882
+ inner_dim: int,
883
+ num_classes: int,
884
+ pooler_dropout: float,
885
+ ):
886
+ super().__init__()
887
+ self.dense = nn.Linear(input_dim, inner_dim)
888
+ self.dropout = nn.Dropout(p=pooler_dropout)
889
+ self.out_proj = nn.Linear(inner_dim, num_classes)
890
+
891
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
892
+ hidden_states = self.dropout(hidden_states)
893
+ hidden_states = self.dense(hidden_states)
894
+ hidden_states = torch.tanh(hidden_states)
895
+ hidden_states = self.dropout(hidden_states)
896
+ hidden_states = self.out_proj(hidden_states)
897
+ return hidden_states
898
+
899
+
900
+ class UnimerMBartPreTrainedModel(PreTrainedModel):
901
+ config_class = UnimerMBartConfig
902
+ base_model_prefix = "model"
903
+ supports_gradient_checkpointing = True
904
+ _no_split_modules = ["MBartDecoderLayer", "MBartSqueezeAttention"]
905
+ _supports_flash_attn_2 = True
906
+ _supports_sdpa = True
907
+
908
+ def _init_weights(self, module):
909
+ std = self.config.init_std
910
+ if isinstance(module, nn.Linear):
911
+ module.weight.data.normal_(mean=0.0, std=std)
912
+ if module.bias is not None:
913
+ module.bias.data.zero_()
914
+ elif isinstance(module, nn.Embedding):
915
+ module.weight.data.normal_(mean=0.0, std=std)
916
+ if module.padding_idx is not None:
917
+ module.weight.data[module.padding_idx].zero_()
918
+
919
+ @property
920
+ def dummy_inputs(self):
921
+ pad_token = self.config.pad_token_id
922
+ input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
923
+ dummy_inputs = {
924
+ "attention_mask": input_ids.ne(pad_token),
925
+ "input_ids": input_ids,
926
+ }
927
+ return dummy_inputs
928
+
929
+
930
+ MBART_START_DOCSTRING = r"""
931
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
932
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
933
+ etc.)
934
+
935
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
936
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
937
+ and behavior.
938
+
939
+ Parameters:
940
+ config ([`MBartConfig`]):
941
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
942
+ load the weights associated with the model, only the configuration. Check out the
943
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
944
+ """
945
+
946
+ MBART_GENERATION_EXAMPLE = r"""
947
+ Translation example:
948
+
949
+ ```python
950
+ >>> from transformers import AutoTokenizer, MBartForConditionalGeneration
951
+
952
+ >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
953
+ >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-en-ro")
954
+
955
+ >>> example_english_phrase = "42 is the answer"
956
+ >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
957
+
958
+ >>> # Translate
959
+ >>> generated_ids = model.generate(**inputs, num_beams=4, max_length=5)
960
+ >>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
961
+ '42 este răspuns'
962
+ ```
963
+
964
+ Mask filling example:
965
+
966
+ ```python
967
+ >>> from transformers import AutoTokenizer, MBartForConditionalGeneration
968
+
969
+ >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
970
+ >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
971
+
972
+ >>> # de_DE is the language symbol id <LID> for German
973
+ >>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"
974
+
975
+ >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt")["input_ids"]
976
+ >>> logits = model(input_ids).logits
977
+
978
+ >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
979
+ >>> probs = logits[0, masked_index].softmax(dim=0)
980
+ >>> values, predictions = probs.topk(5)
981
+
982
+ >>> tokenizer.decode(predictions).split()
983
+ ['nett', 'sehr', 'ganz', 'nicht', 'so']
984
+ ```
985
+ """
986
+
987
+ MBART_INPUTS_DOCSTRING = r"""
988
+ Args:
989
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
990
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
991
+ it.
992
+
993
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
994
+ [`PreTrainedTokenizer.__call__`] for details.
995
+
996
+ [What are input IDs?](../glossary#input-ids)
997
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
998
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
999
+
1000
+ - 1 for tokens that are **not masked**,
1001
+ - 0 for tokens that are **masked**.
1002
+
1003
+ [What are attention masks?](../glossary#attention-mask)
1004
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
1005
+ Indices of decoder input sequence tokens in the vocabulary.
1006
+
1007
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1008
+ [`PreTrainedTokenizer.__call__`] for details.
1009
+
1010
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
1011
+
1012
+ MBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
1013
+ varies according to source and target language, *e.g.* 25004 for *en_XX*, and 25003 for *de_DE*. If
1014
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
1015
+ `past_key_values`).
1016
+
1017
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
1018
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
1019
+ for denoising pre-training following the paper.
1020
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
1021
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
1022
+ be used by default.
1023
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
1024
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
1025
+
1026
+ - 1 indicates the head is **not masked**,
1027
+ - 0 indicates the head is **masked**.
1028
+
1029
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
1030
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
1031
+
1032
+ - 1 indicates the head is **not masked**,
1033
+ - 0 indicates the head is **masked**.
1034
+
1035
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
1036
+ Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
1037
+ 1]`:
1038
+
1039
+ - 1 indicates the head is **not masked**,
1040
+ - 0 indicates the head is **masked**.
1041
+
1042
+ encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
1043
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
1044
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
1045
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
1046
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
1047
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
1048
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
1049
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
1050
+
1051
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1052
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
1053
+
1054
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
1055
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
1056
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1057
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1058
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
1059
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
1060
+ than the model's internal embedding lookup matrix.
1061
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
1062
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
1063
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
1064
+ input (see `past_key_values`). This is useful if you want more control over how to convert
1065
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
1066
+
1067
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
1068
+ of `inputs_embeds`.
1069
+ use_cache (`bool`, *optional*):
1070
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1071
+ `past_key_values`).
1072
+ output_attentions (`bool`, *optional*):
1073
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1074
+ tensors for more detail.
1075
+ output_hidden_states (`bool`, *optional*):
1076
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1077
+ more detail.
1078
+ return_dict (`bool`, *optional*):
1079
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1080
+ """
1081
+
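The arguments documented above follow the usual 0/1 mask convention. A self-contained sketch of how a padded batch and its `attention_mask` are typically assembled, and of the additive 4-D form that the mask-preparation helpers used below roughly produce:

```python
import torch

# 1 = attend, 0 = padding, as documented above.
pad_token_id = 1
input_ids = torch.tensor([[5, 7, 9, pad_token_id, pad_token_id]])
attention_mask = input_ids.ne(pad_token_id).long()            # -> tensor([[1, 1, 1, 0, 0]])

# Roughly what the 4-D expansion helpers compute: 0.0 where attention is allowed,
# a very large negative number where it is masked, so the result can simply be
# added to the raw attention scores.
dtype = torch.float32
inverted = 1.0 - attention_mask[:, None, None, :].to(dtype)    # [bsz, 1, 1, src_len]
additive_mask = inverted * torch.finfo(dtype).min
```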
1082
+
1083
+ class UnimerMBartEncoder(UnimerMBartPreTrainedModel):
1084
+ """
1085
+ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
1086
+ [`MBartEncoderLayer`].
1087
+
1088
+ Args:
1089
+ config: MBartConfig
1090
+ embed_tokens (nn.Embedding): output embedding
1091
+ """
1092
+
1093
+ def __init__(self, config: UnimerMBartConfig, embed_tokens: Optional[nn.Embedding] = None):
1094
+ super().__init__(config)
1095
+
1096
+ self.dropout = config.dropout
1097
+ self.layerdrop = config.encoder_layerdrop
1098
+
1099
+ embed_dim = config.d_model
1100
+ self.padding_idx = config.pad_token_id
1101
+ self.max_source_positions = config.max_position_embeddings
1102
+ embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
1103
+
1104
+ self.embed_tokens = UnimerMBartScaledWordEmbedding(
1105
+ config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
1106
+ )
1107
+
1108
+ if embed_tokens is not None:
1109
+ self.embed_tokens.weight = embed_tokens.weight
1110
+
1111
+ self.embed_positions = UnimerMBartLearnedPositionalEmbedding(
1112
+ config.max_position_embeddings,
1113
+ embed_dim,
1114
+ )
1115
+ self.layers = nn.ModuleList([UnimerMBartEncoderLayer(config) for _ in range(config.encoder_layers)])
1116
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1117
+ self._use_sdpa = config._attn_implementation == "sdpa"
1118
+ self.layernorm_embedding = nn.LayerNorm(embed_dim)
1119
+ self.layer_norm = nn.LayerNorm(config.d_model)
1120
+
1121
+ self.gradient_checkpointing = False
1122
+ # Initialize weights and apply final processing
1123
+ self.post_init()
1124
+
1125
+ def _backward_compatibility_gradient_checkpointing(self):
1126
+ # Override to not delete the attribute from the config
1127
+ if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
1128
+ self.gradient_checkpointing_enable()
1129
+
1130
+ def forward(
1131
+ self,
1132
+ input_ids: torch.LongTensor = None,
1133
+ attention_mask: Optional[torch.Tensor] = None,
1134
+ head_mask: Optional[torch.Tensor] = None,
1135
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1136
+ output_attentions: Optional[bool] = None,
1137
+ output_hidden_states: Optional[bool] = None,
1138
+ return_dict: Optional[bool] = None,
1139
+ ) -> Union[Tuple, BaseModelOutput]:
1140
+ r"""
1141
+ Args:
1142
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1143
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
1144
+ provide it.
1145
+
1146
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1147
+ [`PreTrainedTokenizer.__call__`] for details.
1148
+
1149
+ [What are input IDs?](../glossary#input-ids)
1150
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1151
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1152
+
1153
+ - 1 for tokens that are **not masked**,
1154
+ - 0 for tokens that are **masked**.
1155
+
1156
+ [What are attention masks?](../glossary#attention-mask)
1157
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
1158
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
1159
+
1160
+ - 1 indicates the head is **not masked**,
1161
+ - 0 indicates the head is **masked**.
1162
+
1163
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1164
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
1165
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
1166
+ than the model's internal embedding lookup matrix.
1167
+ output_attentions (`bool`, *optional*):
1168
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1169
+ returned tensors for more detail.
1170
+ output_hidden_states (`bool`, *optional*):
1171
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
1172
+ for more detail.
1173
+ return_dict (`bool`, *optional*):
1174
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1175
+ """
1176
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1177
+ output_hidden_states = (
1178
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1179
+ )
1180
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1181
+
1182
+ # retrieve input_ids and inputs_embeds
1183
+ if input_ids is not None and inputs_embeds is not None:
1184
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
1185
+ elif input_ids is not None:
1186
+ input = input_ids
1187
+ input_shape = input.shape
1188
+ input_ids = input_ids.view(-1, input_shape[-1])
1189
+ elif inputs_embeds is not None:
1190
+ input = inputs_embeds[:, :, -1]
1191
+ else:
1192
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
1193
+
1194
+ if inputs_embeds is None:
1195
+ inputs_embeds = self.embed_tokens(input_ids)
1196
+
1197
+ embed_pos = self.embed_positions(input)
1198
+
1199
+ hidden_states = inputs_embeds + embed_pos.to(inputs_embeds.device)
1200
+ hidden_states = self.layernorm_embedding(hidden_states)
1201
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
1202
+
1203
+ # expand attention_mask
1204
+ if attention_mask is not None:
1205
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1206
+ if self._use_flash_attention_2:
1207
+ attention_mask = attention_mask if 0 in attention_mask else None
1208
+ elif self._use_sdpa and head_mask is None and not output_attentions:
1209
+ # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
1210
+ # the manual implementation that requires a 4D causal mask in all cases.
1211
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1212
+ attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
1213
+ else:
1214
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1215
+ attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
1216
+
1217
+ encoder_states = () if output_hidden_states else None
1218
+ all_attentions = () if output_attentions else None
1219
+
1220
+ # check if head_mask has a correct number of layers specified if desired
1221
+ if head_mask is not None:
1222
+ if head_mask.size()[0] != len(self.layers):
1223
+ raise ValueError(
1224
+ f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
1225
+ f" {head_mask.size()[0]}."
1226
+ )
1227
+ for idx, encoder_layer in enumerate(self.layers):
1228
+ if output_hidden_states:
1229
+ encoder_states = encoder_states + (hidden_states,)
1230
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
1231
+ to_drop = False
1232
+ if self.training:
1233
+ dropout_probability = torch.rand([])
1234
+ if dropout_probability < self.layerdrop: # skip the layer
1235
+ to_drop = True
1236
+
1237
+ if to_drop:
1238
+ layer_outputs = (None, None)
1239
+ else:
1240
+ if self.gradient_checkpointing and self.training:
1241
+ layer_outputs = self._gradient_checkpointing_func(
1242
+ encoder_layer.__call__,
1243
+ hidden_states,
1244
+ attention_mask,
1245
+ (head_mask[idx] if head_mask is not None else None),
1246
+ output_attentions,
1247
+ )
1248
+ else:
1249
+ layer_outputs = encoder_layer(
1250
+ hidden_states,
1251
+ attention_mask,
1252
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
1253
+ output_attentions=output_attentions,
1254
+ )
1255
+
1256
+ hidden_states = layer_outputs[0]
1257
+
1258
+ if output_attentions:
1259
+ all_attentions = all_attentions + (layer_outputs[1],)
1260
+
1261
+ hidden_states = self.layer_norm(hidden_states)
1262
+
1263
+ if output_hidden_states:
1264
+ encoder_states = encoder_states + (hidden_states,)
1265
+
1266
+ if not return_dict:
1267
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
1268
+ return BaseModelOutput(
1269
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
1270
+ )
1271
+
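Both the encoder above and the decoder below apply LayerDrop (https://arxiv.org/abs/1909.11556): during training each layer is skipped with probability `layerdrop`, while at evaluation every layer runs. A toy, self-contained illustration of that branch:

```python
import torch
import torch.nn as nn

# Toy illustration of the LayerDrop branch in the forward passes above; not part of the package.
layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(6)])
layerdrop = 0.5
training = True

x = torch.randn(2, 8)
for layer in layers:
    if training and torch.rand([]) < layerdrop:
        continue  # skip this layer entirely for this forward pass
    x = layer(x)
# With training=False every layer runs, so inference is deterministic.
```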
1272
+
1273
+ class UnimerMBartDecoder(UnimerMBartPreTrainedModel):
1274
+ """
1275
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
1276
+
1277
+ Args:
1278
+ config: MBartConfig
1279
+ embed_tokens (nn.Embedding): output embedding
1280
+ """
1281
+
1282
+ def __init__(self, config: UnimerMBartConfig, embed_tokens: Optional[nn.Embedding] = None):
1283
+ super().__init__(config)
1284
+ self.dropout = config.dropout
1285
+ self.layerdrop = config.decoder_layerdrop
1286
+ self.padding_idx = config.pad_token_id
1287
+ self.max_target_positions = config.max_position_embeddings
1288
+ embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
1289
+
1290
+ self.embed_tokens = UnimerMBartScaledWordEmbedding(
1291
+ config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
1292
+ )
1293
+
1294
+ if embed_tokens is not None:
1295
+ self.embed_tokens.weight = embed_tokens.weight
1296
+
1297
+ self.embed_positions = UnimerMBartLearnedPositionalEmbedding(
1298
+ config.max_position_embeddings,
1299
+ config.d_model,
1300
+ )
1301
+ self.layers = nn.ModuleList([UnimerMBartDecoderLayer(config) for _ in range(config.decoder_layers)])
1302
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1303
+ self._use_sdpa = config._attn_implementation == "sdpa"
1304
+ self.layernorm_embedding = nn.LayerNorm(config.d_model)
1305
+ self.layer_norm = nn.LayerNorm(config.d_model)
1306
+
1307
+ self.gradient_checkpointing = False
1308
+ # Initialize weights and apply final processing
1309
+ self.post_init()
1310
+
1311
+ def get_input_embeddings(self):
1312
+ return self.embed_tokens
1313
+
1314
+ def set_input_embeddings(self, value):
1315
+ self.embed_tokens = value
1316
+
1317
+ def forward(
1318
+ self,
1319
+ input_ids: torch.LongTensor = None,
1320
+ attention_mask: Optional[torch.Tensor] = None,
1321
+ count_pred: Optional[torch.FloatTensor] = None,
1322
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1323
+ encoder_attention_mask: Optional[torch.LongTensor] = None,
1324
+ head_mask: Optional[torch.Tensor] = None,
1325
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1326
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
1327
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1328
+ use_cache: Optional[bool] = None,
1329
+ output_attentions: Optional[bool] = None,
1330
+ output_hidden_states: Optional[bool] = None,
1331
+ return_dict: Optional[bool] = None,
1332
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
1333
+ r"""
1334
+ Args:
1335
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1336
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
1337
+ provide it.
1338
+
1339
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1340
+ [`PreTrainedTokenizer.__call__`] for details.
1341
+
1342
+ [What are input IDs?](../glossary#input-ids)
1343
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1344
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1345
+
1346
+ - 1 for tokens that are **not masked**,
1347
+ - 0 for tokens that are **masked**.
1348
+
1349
+ [What are attention masks?](../glossary#attention-mask)
1350
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
1351
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
1352
+ of the decoder.
1353
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
1354
+ Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
1355
+ selected in `[0, 1]`:
1356
+
1357
+ - 1 for tokens that are **not masked**,
1358
+ - 0 for tokens that are **masked**.
1359
+
1360
+ [What are attention masks?](../glossary#attention-mask)
1361
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
1362
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
1363
+
1364
+ - 1 indicates the head is **not masked**,
1365
+ - 0 indicates the head is **masked**.
1366
+
1367
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
1368
+ Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
1369
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
1370
+
1371
+ - 1 indicates the head is **not masked**,
1372
+ - 0 indicates the head is **masked**.
1373
+
1374
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
1375
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1376
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
1377
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
1378
+
1379
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
1380
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
1381
+
1382
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
1383
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
1384
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1385
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1386
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
1387
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
1388
+ than the model's internal embedding lookup matrix.
1389
+ output_attentions (`bool`, *optional*):
1390
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1391
+ returned tensors for more detail.
1392
+ output_hidden_states (`bool`, *optional*):
1393
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
1394
+ for more detail.
1395
+ return_dict (`bool`, *optional*):
1396
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1397
+ """
1398
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1399
+ output_hidden_states = (
1400
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1401
+ )
1402
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1403
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1404
+
1405
+ # retrieve input_ids and inputs_embeds
1406
+ if input_ids is not None and inputs_embeds is not None:
1407
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
1408
+ elif input_ids is not None:
1409
+ input = input_ids
1410
+ input_shape = input.size()
1411
+ input_ids = input_ids.view(-1, input_shape[-1])
1412
+ elif inputs_embeds is not None:
1413
+ input_shape = inputs_embeds.size()[:-1]
1414
+ input = inputs_embeds[:, :, -1]
1415
+ else:
1416
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
1417
+
1418
+ # past_key_values_length
1419
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
1420
+
1421
+ if inputs_embeds is None:
1422
+ inputs_embeds = self.embed_tokens(input_ids)
1423
+
1424
+ if self._use_flash_attention_2:
1425
+ # 2d mask is passed through the layers
1426
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
1427
+ elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
1428
+ # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
1429
+ # the manual implementation that requires a 4D causal mask in all cases.
1430
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
1431
+ attention_mask,
1432
+ input_shape,
1433
+ inputs_embeds,
1434
+ past_key_values_length,
1435
+ )
1436
+ else:
1437
+ # 4d mask is passed through the layers
1438
+ attention_mask = _prepare_4d_causal_attention_mask(
1439
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
1440
+ )
1441
+
1442
+ # expand encoder attention mask
1443
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
1444
+ if self._use_flash_attention_2:
1445
+ encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
1446
+ elif self._use_sdpa and cross_attn_head_mask is None and not output_attentions:
1447
+ # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
1448
+ # the manual implementation that requires a 4D causal mask in all cases.
1449
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1450
+ encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
1451
+ encoder_attention_mask,
1452
+ inputs_embeds.dtype,
1453
+ tgt_len=input_shape[-1],
1454
+ )
1455
+ else:
1456
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1457
+ encoder_attention_mask = _prepare_4d_attention_mask(
1458
+ encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
1459
+ )
1460
+
1461
+ # embed positions
1462
+ positions = self.embed_positions(input, past_key_values_length)
1463
+
1464
+ hidden_states = inputs_embeds + positions.to(inputs_embeds.device)
1465
+
1466
+ # TODO: add counting context weight to hidden_states
1467
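+ # Note: `counting_context_weight` is not created in this class's __init__; the branch below only runs
+ # when a caller supplies `count_pred` and has attached such a projection (UnimerMBartForCausalLM below
+ # always passes count_pred=None, so the branch is inactive within this file).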
+ if count_pred is not None:
1468
+ count_context_weight = self.counting_context_weight(count_pred)
1469
+ hidden_states = hidden_states + 0.5 * count_context_weight.unsqueeze(1)
1470
+
1471
+ hidden_states = self.layernorm_embedding(hidden_states)
1472
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
1473
+
1474
+ if self.gradient_checkpointing and self.training:
1475
+ if use_cache:
1476
+ logger.warning_once(
1477
+ "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
1478
+ )
1479
+ use_cache = False
1480
+
1481
+ # decoder layers
1482
+ all_hidden_states = () if output_hidden_states else None
1483
+ all_self_attns = () if output_attentions else None
1484
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
1485
+ next_decoder_cache = () if use_cache else None
1486
+
1487
+ # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
1488
+ for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
1489
+ if attn_mask is not None:
1490
+ if attn_mask.size()[0] != len(self.layers):
1491
+ raise ValueError(
1492
+ f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
1493
+ f" {attn_mask.size()[0]}."
1494
+ )
1495
+ for idx, decoder_layer in enumerate(self.layers):
1496
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
1497
+ if output_hidden_states:
1498
+ all_hidden_states += (hidden_states,)
1499
+ if self.training:
1500
+ dropout_probability = torch.rand([])
1501
+ if dropout_probability < self.layerdrop:
1502
+ continue
1503
+
1504
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
1505
+
1506
+ if self.gradient_checkpointing and self.training:
1507
+ layer_outputs = self._gradient_checkpointing_func(
1508
+ decoder_layer.__call__,
1509
+ hidden_states,
1510
+ attention_mask,
1511
+ encoder_hidden_states,
1512
+ encoder_attention_mask,
1513
+ head_mask[idx] if head_mask is not None else None,
1514
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
1515
+ None,
1516
+ output_attentions,
1517
+ use_cache,
1518
+ )
1519
+ else:
1520
+ layer_outputs = decoder_layer(
1521
+ hidden_states,
1522
+ attention_mask=attention_mask,
1523
+ encoder_hidden_states=encoder_hidden_states,
1524
+ encoder_attention_mask=encoder_attention_mask,
1525
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
1526
+ cross_attn_layer_head_mask=(
1527
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
1528
+ ),
1529
+ past_key_value=past_key_value,
1530
+ output_attentions=output_attentions,
1531
+ use_cache=use_cache,
1532
+ )
1533
+ hidden_states = layer_outputs[0]
1534
+
1535
+ if use_cache:
1536
+ next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
1537
+
1538
+ if output_attentions:
1539
+ all_self_attns += (layer_outputs[1],)
1540
+
1541
+ if encoder_hidden_states is not None:
1542
+ all_cross_attentions += (layer_outputs[2],)
1543
+
1544
+ hidden_states = self.layer_norm(hidden_states)
1545
+
1546
+ # add hidden states from the last decoder layer
1547
+ if output_hidden_states:
1548
+ all_hidden_states += (hidden_states,)
1549
+
1550
+ next_cache = next_decoder_cache if use_cache else None
1551
+ if not return_dict:
1552
+ return tuple(
1553
+ v
1554
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
1555
+ if v is not None
1556
+ )
1557
+ return BaseModelOutputWithPastAndCrossAttentions(
1558
+ last_hidden_state=hidden_states,
1559
+ past_key_values=next_cache,
1560
+ hidden_states=all_hidden_states,
1561
+ attentions=all_self_attns,
1562
+ cross_attentions=all_cross_attentions,
1563
+ )
1564
+
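The decoder's main departure from the stock MBart decoder is the optional `count_pred` injection in its forward pass. A minimal sketch of what that branch computes, assuming `counting_context_weight` is a linear projection from a symbol-count vector to `d_model`; the projection itself is not defined in this file:

```python
import torch
import torch.nn as nn

bsz, seq_len, d_model, num_symbols = 2, 12, 64, 10

# Hypothetical projection; the real module would have to be attached to the decoder
# before `count_pred` is passed in (it is not created in __init__ above).
counting_context_weight = nn.Linear(num_symbols, d_model)

count_pred = torch.rand(bsz, num_symbols)           # per-sample symbol-count prediction
hidden_states = torch.randn(bsz, seq_len, d_model)  # token embeddings + positions

count_context = counting_context_weight(count_pred)               # (bsz, d_model)
hidden_states = hidden_states + 0.5 * count_context.unsqueeze(1)  # broadcast over the sequence
```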
1565
+
1566
+ @add_start_docstrings(
1567
+ "The bare MBART Model outputting raw hidden-states without any specific head on top.",
1568
+ MBART_START_DOCSTRING,
1569
+ )
1570
+ class UnimerMBartModel(UnimerMBartPreTrainedModel):
1571
+ _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
1572
+
1573
+ def __init__(self, config: UnimerMBartConfig):
1574
+ super().__init__(config)
1575
+
1576
+ padding_idx, vocab_size = config.pad_token_id, config.vocab_size
1577
+ self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
1578
+
1579
+ self.encoder = UnimerMBartEncoder(config, self.shared)
1580
+ self.decoder = UnimerMBartDecoder(config, self.shared)
1581
+
1582
+ # Initialize weights and apply final processing
1583
+ self.post_init()
1584
+
1585
+ def get_input_embeddings(self):
1586
+ return self.shared
1587
+
1588
+ def set_input_embeddings(self, value):
1589
+ self.shared = value
1590
+ self.encoder.embed_tokens = self.shared
1591
+ self.decoder.embed_tokens = self.shared
1592
+
1593
+ def get_encoder(self):
1594
+ return self.encoder
1595
+
1596
+ def get_decoder(self):
1597
+ return self.decoder
1598
+
1599
+ def _tie_weights(self):
1600
+ if self.config.tie_word_embeddings:
1601
+ self._tie_or_clone_weights(self.encoder.embed_tokens, self.get_input_embeddings())
1602
+ self._tie_or_clone_weights(self.decoder.embed_tokens, self.get_input_embeddings())
1603
+
1604
+ @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
1605
+ @add_code_sample_docstrings(
1606
+ checkpoint=_CHECKPOINT_FOR_DOC,
1607
+ output_type=Seq2SeqModelOutput,
1608
+ config_class=_CONFIG_FOR_DOC,
1609
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
1610
+ )
1611
+ def forward(
1612
+ self,
1613
+ input_ids: torch.LongTensor = None,
1614
+ attention_mask: Optional[torch.Tensor] = None,
1615
+ decoder_input_ids: Optional[torch.LongTensor] = None,
1616
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
1617
+ head_mask: Optional[torch.Tensor] = None,
1618
+ decoder_head_mask: Optional[torch.Tensor] = None,
1619
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1620
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
1621
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
1622
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1623
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1624
+ use_cache: Optional[bool] = None,
1625
+ output_attentions: Optional[bool] = None,
1626
+ output_hidden_states: Optional[bool] = None,
1627
+ return_dict: Optional[bool] = None,
1628
+ ) -> Union[Seq2SeqModelOutput, Tuple[torch.FloatTensor]]:
1629
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1630
+ output_hidden_states = (
1631
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1632
+ )
1633
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1634
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1635
+
1636
+ # different to other models, MBart automatically creates decoder_input_ids from
1637
+ # input_ids if no decoder_input_ids are provided
1638
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
1639
+ decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)
1640
+
1641
+ if encoder_outputs is None:
1642
+ encoder_outputs = self.encoder(
1643
+ input_ids=input_ids,
1644
+ attention_mask=attention_mask,
1645
+ head_mask=head_mask,
1646
+ inputs_embeds=inputs_embeds,
1647
+ output_attentions=output_attentions,
1648
+ output_hidden_states=output_hidden_states,
1649
+ return_dict=return_dict,
1650
+ )
1651
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
1652
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
1653
+ encoder_outputs = BaseModelOutput(
1654
+ last_hidden_state=encoder_outputs[0],
1655
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
1656
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
1657
+ )
1658
+
1659
+ # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
1660
+ decoder_outputs = self.decoder(
1661
+ input_ids=decoder_input_ids,
1662
+ attention_mask=decoder_attention_mask,
1663
+ encoder_hidden_states=encoder_outputs[0],
1664
+ encoder_attention_mask=attention_mask,
1665
+ head_mask=decoder_head_mask,
1666
+ cross_attn_head_mask=cross_attn_head_mask,
1667
+ past_key_values=past_key_values,
1668
+ inputs_embeds=decoder_inputs_embeds,
1669
+ use_cache=use_cache,
1670
+ output_attentions=output_attentions,
1671
+ output_hidden_states=output_hidden_states,
1672
+ return_dict=return_dict,
1673
+ )
1674
+
1675
+ if not return_dict:
1676
+ return decoder_outputs + encoder_outputs
1677
+
1678
+ return Seq2SeqModelOutput(
1679
+ last_hidden_state=decoder_outputs.last_hidden_state,
1680
+ past_key_values=decoder_outputs.past_key_values,
1681
+ decoder_hidden_states=decoder_outputs.hidden_states,
1682
+ decoder_attentions=decoder_outputs.attentions,
1683
+ cross_attentions=decoder_outputs.cross_attentions,
1684
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
1685
+ encoder_hidden_states=encoder_outputs.hidden_states,
1686
+ encoder_attentions=encoder_outputs.attentions,
1687
+ )
1688
+
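`UnimerMBartModel.forward` above derives `decoder_input_ids` from `input_ids` via `shift_tokens_right` when none are supplied. Assuming the stock MBart convention (the helper is defined earlier in this file and is not shown here), the shift wraps the last non-pad token, the language id in MBart, around to position 0. A worked example:

```python
import torch

pad = 1
# "tokens ... lang_id <pad> <pad>": MBart places the language id (or eos) last.
input_ids = torch.tensor([[47, 58, 93, 250004, pad, pad]])

shifted = input_ids.clone()
index_of_last = (shifted.ne(pad).sum(dim=1) - 1).unsqueeze(-1)   # position of the last non-pad token
start_tokens = shifted.gather(1, index_of_last).squeeze(-1)
shifted[:, 1:] = shifted[:, :-1].clone()                         # shift everything one step right
shifted[:, 0] = start_tokens                                     # start decoding from the language id
# shifted -> tensor([[250004, 47, 58, 93, 250004, 1]])
```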
1689
+
1690
+ @add_start_docstrings(
1691
+ "The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models.",
1692
+ MBART_START_DOCSTRING,
1693
+ )
1694
+ class UnimerMBartForConditionalGeneration(UnimerMBartPreTrainedModel, GenerationMixin):
1695
+ base_model_prefix = "model"
1696
+ _keys_to_ignore_on_load_missing = ["final_logits_bias"]
1697
+ _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"]
1698
+
1699
+ def __init__(self, config: UnimerMBartConfig):
1700
+ super().__init__(config)
1701
+ self.model = UnimerMBartModel(config)
1702
+ self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
1703
+ self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
1704
+
1705
+ # Initialize weights and apply final processing
1706
+ self.post_init()
1707
+
1708
+ def get_encoder(self):
1709
+ return self.model.get_encoder()
1710
+
1711
+ def get_decoder(self):
1712
+ return self.model.get_decoder()
1713
+
1714
+ def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
1715
+ new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
1716
+ self._resize_final_logits_bias(new_embeddings.weight.shape[0])
1717
+ return new_embeddings
1718
+
1719
+ def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
1720
+ old_num_tokens = self.final_logits_bias.shape[-1]
1721
+ if new_num_tokens <= old_num_tokens:
1722
+ new_bias = self.final_logits_bias[:, :new_num_tokens]
1723
+ else:
1724
+ extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
1725
+ new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
1726
+ self.register_buffer("final_logits_bias", new_bias)
1727
+
1728
+ def get_output_embeddings(self):
1729
+ return self.lm_head
1730
+
1731
+ def set_output_embeddings(self, new_embeddings):
1732
+ self.lm_head = new_embeddings
1733
+
1734
+ @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
1735
+ @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
1736
+ @add_end_docstrings(MBART_GENERATION_EXAMPLE)
1737
+ def forward(
1738
+ self,
1739
+ input_ids: torch.LongTensor = None,
1740
+ attention_mask: Optional[torch.Tensor] = None,
1741
+ decoder_input_ids: Optional[torch.LongTensor] = None,
1742
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
1743
+ head_mask: Optional[torch.Tensor] = None,
1744
+ decoder_head_mask: Optional[torch.Tensor] = None,
1745
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1746
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
1747
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
1748
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1749
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1750
+ labels: Optional[torch.LongTensor] = None,
1751
+ use_cache: Optional[bool] = None,
1752
+ output_attentions: Optional[bool] = None,
1753
+ output_hidden_states: Optional[bool] = None,
1754
+ return_dict: Optional[bool] = None,
1755
+ ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
1756
+ r"""
1757
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1758
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1759
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1760
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1761
+
1762
+ Returns:
1763
+
1764
+ """
1765
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1766
+
1767
+ if labels is not None:
1768
+ if use_cache:
1769
+ logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
1770
+ use_cache = False
1771
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
1772
+ decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)
1773
+
1774
+ outputs = self.model(
1775
+ input_ids,
1776
+ attention_mask=attention_mask,
1777
+ decoder_input_ids=decoder_input_ids,
1778
+ encoder_outputs=encoder_outputs,
1779
+ decoder_attention_mask=decoder_attention_mask,
1780
+ head_mask=head_mask,
1781
+ decoder_head_mask=decoder_head_mask,
1782
+ cross_attn_head_mask=cross_attn_head_mask,
1783
+ past_key_values=past_key_values,
1784
+ inputs_embeds=inputs_embeds,
1785
+ decoder_inputs_embeds=decoder_inputs_embeds,
1786
+ use_cache=use_cache,
1787
+ output_attentions=output_attentions,
1788
+ output_hidden_states=output_hidden_states,
1789
+ return_dict=return_dict,
1790
+ )
1791
+ lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
1792
+
1793
+ masked_lm_loss = None
1794
+ if labels is not None:
1795
+ loss_fct = CrossEntropyLoss()
1796
+ masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
1797
+
1798
+ if not return_dict:
1799
+ output = (lm_logits,) + outputs[1:]
1800
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1801
+
1802
+ return Seq2SeqLMOutput(
1803
+ loss=masked_lm_loss,
1804
+ logits=lm_logits,
1805
+ past_key_values=outputs.past_key_values,
1806
+ decoder_hidden_states=outputs.decoder_hidden_states,
1807
+ decoder_attentions=outputs.decoder_attentions,
1808
+ cross_attentions=outputs.cross_attentions,
1809
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1810
+ encoder_hidden_states=outputs.encoder_hidden_states,
1811
+ encoder_attentions=outputs.encoder_attentions,
1812
+ )
1813
+
1814
+ def prepare_inputs_for_generation(
1815
+ self,
1816
+ decoder_input_ids,
1817
+ past_key_values=None,
1818
+ attention_mask=None,
1819
+ head_mask=None,
1820
+ decoder_head_mask=None,
1821
+ cross_attn_head_mask=None,
1822
+ use_cache=None,
1823
+ encoder_outputs=None,
1824
+ **kwargs,
1825
+ ):
1826
+ # cut decoder_input_ids if past is used
1827
+ if past_key_values is not None:
1828
+ past_length = past_key_values[0][0].shape[2]
1829
+
1830
+ # Some generation methods already pass only the last input ID
1831
+ if decoder_input_ids.shape[1] > past_length:
1832
+ remove_prefix_length = past_length
1833
+ else:
1834
+ # Default to old behavior: keep only final ID
1835
+ remove_prefix_length = decoder_input_ids.shape[1] - 1
1836
+
1837
+ decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
1838
+
1839
+ return {
1840
+ "input_ids": None, # encoder_outputs is defined. input_ids not needed
1841
+ "encoder_outputs": encoder_outputs,
1842
+ "past_key_values": past_key_values,
1843
+ "decoder_input_ids": decoder_input_ids,
1844
+ "attention_mask": attention_mask,
1845
+ "head_mask": head_mask,
1846
+ "decoder_head_mask": decoder_head_mask,
1847
+ "cross_attn_head_mask": cross_attn_head_mask,
1848
+ "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
1849
+ }
1850
+
1851
+ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
1852
+ return shift_tokens_right(labels, self.config.pad_token_id)
1853
+
1854
+ @staticmethod
1855
+ def _reorder_cache(past_key_values, beam_idx):
1856
+ reordered_past = ()
1857
+ for layer_past in past_key_values:
1858
+ # cached cross_attention states don't have to be reordered -> they are always the same
1859
+ reordered_past += (
1860
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
1861
+ + layer_past[2:],
1862
+ )
1863
+ return reordered_past
1864
+
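`_reorder_cache` above realigns the cached self-attention key/value tensors with the surviving beams after each beam-search step, while the cross-attention cache is left as is. A self-contained illustration on dummy cache tensors:

```python
import torch

# Dummy cache for one layer: (self-attn key, self-attn value, cross-attn key, cross-attn value),
# each of shape (batch_size * num_beams, num_heads, seq_len, head_dim).
layer_past = tuple(torch.randn(4, 2, 5, 8) for _ in range(4))
beam_idx = torch.tensor([2, 2, 0, 3])  # which previous beam each slot should now continue

reordered = (
    tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2])
    + layer_past[2:]  # cross-attention cache depends only on the encoder, so it is left untouched
)
```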
1865
+
1866
+ @add_start_docstrings(
1867
+ """
1868
+ MBart model with a sequence classification head on top (a linear layer on top of the pooled output), e.g. for GLUE
1869
+ tasks.
1870
+ """,
1871
+ MBART_START_DOCSTRING,
1872
+ )
1873
+ class UnimerMBartForSequenceClassification(UnimerMBartPreTrainedModel):
1874
+ _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]
1875
+
1876
+ def __init__(self, config: UnimerMBartConfig, **kwargs):
1877
+ super().__init__(config, **kwargs)
1878
+ self.model = UnimerMBartModel(config)
1879
+ self.classification_head = UnimerMBartClassificationHead(
1880
+ config.d_model,
1881
+ config.d_model,
1882
+ config.num_labels,
1883
+ config.classifier_dropout,
1884
+ )
1885
+
1886
+ # Initialize weights and apply final processing
1887
+ self.post_init()
1888
+
1889
+ @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
1890
+ @add_code_sample_docstrings(
1891
+ checkpoint=_CHECKPOINT_FOR_DOC,
1892
+ output_type=Seq2SeqSequenceClassifierOutput,
1893
+ config_class=_CONFIG_FOR_DOC,
1894
+ )
1895
+ # Copied from transformers.models.bart.modeling_bart.BartForSequenceClassification.forward
1896
+ def forward(
1897
+ self,
1898
+ input_ids: torch.LongTensor = None,
1899
+ attention_mask: Optional[torch.Tensor] = None,
1900
+ decoder_input_ids: Optional[torch.LongTensor] = None,
1901
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
1902
+ head_mask: Optional[torch.Tensor] = None,
1903
+ decoder_head_mask: Optional[torch.Tensor] = None,
1904
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1905
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
1906
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1907
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1908
+ labels: Optional[torch.LongTensor] = None,
1909
+ use_cache: Optional[bool] = None,
1910
+ output_attentions: Optional[bool] = None,
1911
+ output_hidden_states: Optional[bool] = None,
1912
+ return_dict: Optional[bool] = None,
1913
+ ) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
1914
+ r"""
1915
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1916
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1917
+ config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1918
+ """
1919
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1920
+ if labels is not None:
1921
+ use_cache = False
1922
+
1923
+ if input_ids is None and inputs_embeds is not None:
1924
+ raise NotImplementedError(
1925
+ f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
1926
+ )
1927
+
1928
+ outputs = self.model(
1929
+ input_ids,
1930
+ attention_mask=attention_mask,
1931
+ decoder_input_ids=decoder_input_ids,
1932
+ decoder_attention_mask=decoder_attention_mask,
1933
+ head_mask=head_mask,
1934
+ decoder_head_mask=decoder_head_mask,
1935
+ cross_attn_head_mask=cross_attn_head_mask,
1936
+ encoder_outputs=encoder_outputs,
1937
+ inputs_embeds=inputs_embeds,
1938
+ decoder_inputs_embeds=decoder_inputs_embeds,
1939
+ use_cache=use_cache,
1940
+ output_attentions=output_attentions,
1941
+ output_hidden_states=output_hidden_states,
1942
+ return_dict=return_dict,
1943
+ )
1944
+ hidden_states = outputs[0] # last hidden state
1945
+
1946
+ eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device)
1947
+
1948
+ if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
1949
+ raise ValueError("All examples must have the same number of <eos> tokens.")
1950
+ sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
1951
+ :, -1, :
1952
+ ]
1953
+ logits = self.classification_head(sentence_representation)
1954
+
1955
+ loss = None
1956
+ if labels is not None:
1957
+ labels = labels.to(logits.device)
1958
+ if self.config.problem_type is None:
1959
+ if self.config.num_labels == 1:
1960
+ self.config.problem_type = "regression"
1961
+ elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1962
+ self.config.problem_type = "single_label_classification"
1963
+ else:
1964
+ self.config.problem_type = "multi_label_classification"
1965
+
1966
+ if self.config.problem_type == "regression":
1967
+ loss_fct = MSELoss()
1968
+ if self.config.num_labels == 1:
1969
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1970
+ else:
1971
+ loss = loss_fct(logits, labels)
1972
+ elif self.config.problem_type == "single_label_classification":
1973
+ loss_fct = CrossEntropyLoss()
1974
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
1975
+ elif self.config.problem_type == "multi_label_classification":
1976
+ loss_fct = BCEWithLogitsLoss()
1977
+ loss = loss_fct(logits, labels)
1978
+ if not return_dict:
1979
+ output = (logits,) + outputs[1:]
1980
+ return ((loss,) + output) if loss is not None else output
1981
+
1982
+ return Seq2SeqSequenceClassifierOutput(
1983
+ loss=loss,
1984
+ logits=logits,
1985
+ past_key_values=outputs.past_key_values,
1986
+ decoder_hidden_states=outputs.decoder_hidden_states,
1987
+ decoder_attentions=outputs.decoder_attentions,
1988
+ cross_attentions=outputs.cross_attentions,
1989
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1990
+ encoder_hidden_states=outputs.encoder_hidden_states,
1991
+ encoder_attentions=outputs.encoder_attentions,
1992
+ )
1993
+
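The sequence-classification head above pools the decoder output at the last `<eos>` position of every sequence (all sequences must contain the same number of `<eos>` tokens). A self-contained sketch of that pooling step:

```python
import torch

eos_token_id = 2
input_ids = torch.tensor([[5, 7, 9, 11, eos_token_id],
                          [4, 6, 8, eos_token_id, 1]])
hidden_states = torch.randn(2, 5, 16)   # decoder last hidden state: (bsz, seq_len, hidden)

eos_mask = input_ids.eq(eos_token_id)
# Every sequence here contains exactly one <eos>, satisfying the check in the forward pass above.
sentence_representation = hidden_states[eos_mask, :].view(
    hidden_states.size(0), -1, hidden_states.size(-1)
)[:, -1, :]
# sentence_representation: (bsz, hidden), fed to the classification head.
```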
1994
+
1995
+ @add_start_docstrings(
1996
+ """
1997
+ MBART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
1998
+ layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1999
+ """,
2000
+ MBART_START_DOCSTRING,
2001
+ )
2002
+ class UnimerMBartForQuestionAnswering(UnimerMBartPreTrainedModel):
2003
+ _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]
2004
+
2005
+ def __init__(self, config):
2006
+ super().__init__(config)
2007
+
2008
+ config.num_labels = 2
2009
+ self.num_labels = config.num_labels
2010
+
2011
+ self.model = UnimerMBartModel(config)
2012
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
2013
+
2014
+ # Initialize weights and apply final processing
2015
+ self.post_init()
2016
+
2017
+ @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
2018
+ @add_code_sample_docstrings(
2019
+ checkpoint=_CHECKPOINT_FOR_DOC,
2020
+ output_type=Seq2SeqQuestionAnsweringModelOutput,
2021
+ config_class=_CONFIG_FOR_DOC,
2022
+ )
2023
+ # Copied from transformers.models.bart.modeling_bart.BartForQuestionAnswering.forward
2024
+ def forward(
2025
+ self,
2026
+ input_ids: torch.Tensor = None,
2027
+ attention_mask: Optional[torch.Tensor] = None,
2028
+ decoder_input_ids: Optional[torch.LongTensor] = None,
2029
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
2030
+ head_mask: Optional[torch.Tensor] = None,
2031
+ decoder_head_mask: Optional[torch.Tensor] = None,
2032
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
2033
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
2034
+ start_positions: Optional[torch.LongTensor] = None,
2035
+ end_positions: Optional[torch.LongTensor] = None,
2036
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2037
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
2038
+ use_cache: Optional[bool] = None,
2039
+ output_attentions: Optional[bool] = None,
2040
+ output_hidden_states: Optional[bool] = None,
2041
+ return_dict: Optional[bool] = None,
2042
+ ) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]:
2043
+ r"""
2044
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
2045
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
2046
+ Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the sequence
2047
+ are not taken into account for computing the loss.
2048
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
2049
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
2050
+ Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the sequence
2051
+ are not taken into account for computing the loss.
2052
+ """
2053
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2054
+ if start_positions is not None and end_positions is not None:
2055
+ use_cache = False
2056
+
2057
+ outputs = self.model(
2058
+ input_ids,
2059
+ attention_mask=attention_mask,
2060
+ decoder_input_ids=decoder_input_ids,
2061
+ decoder_attention_mask=decoder_attention_mask,
2062
+ head_mask=head_mask,
2063
+ decoder_head_mask=decoder_head_mask,
2064
+ cross_attn_head_mask=cross_attn_head_mask,
2065
+ encoder_outputs=encoder_outputs,
2066
+ inputs_embeds=inputs_embeds,
2067
+ decoder_inputs_embeds=decoder_inputs_embeds,
2068
+ use_cache=use_cache,
2069
+ output_attentions=output_attentions,
2070
+ output_hidden_states=output_hidden_states,
2071
+ return_dict=return_dict,
2072
+ )
2073
+
2074
+ sequence_output = outputs[0]
2075
+
2076
+ logits = self.qa_outputs(sequence_output)
2077
+ start_logits, end_logits = logits.split(1, dim=-1)
2078
+ start_logits = start_logits.squeeze(-1).contiguous()
2079
+ end_logits = end_logits.squeeze(-1).contiguous()
2080
+
2081
+ total_loss = None
2082
+ if start_positions is not None and end_positions is not None:
2083
+ # On multi-GPU, start/end positions may carry an extra dimension from splitting; squeeze it away
2084
+ if len(start_positions.size()) > 1:
2085
+ start_positions = start_positions.squeeze(-1)
2086
+ if len(end_positions.size()) > 1:
2087
+ end_positions = end_positions.squeeze(-1)
2088
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
2089
+ ignored_index = start_logits.size(1)
2090
+ start_positions = start_positions.clamp(0, ignored_index)
2091
+ end_positions = end_positions.clamp(0, ignored_index)
2092
+
2093
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
2094
+ start_loss = loss_fct(start_logits, start_positions)
2095
+ end_loss = loss_fct(end_logits, end_positions)
2096
+ total_loss = (start_loss + end_loss) / 2
2097
+
2098
+ if not return_dict:
2099
+ output = (
2100
+ start_logits,
2101
+ end_logits,
2102
+ ) + outputs[1:]
2103
+ return ((total_loss,) + output) if total_loss is not None else output
2104
+
2105
+ return Seq2SeqQuestionAnsweringModelOutput(
2106
+ loss=total_loss,
2107
+ start_logits=start_logits,
2108
+ end_logits=end_logits,
2109
+ past_key_values=outputs.past_key_values,
2110
+ decoder_hidden_states=outputs.decoder_hidden_states,
2111
+ decoder_attentions=outputs.decoder_attentions,
2112
+ cross_attentions=outputs.cross_attentions,
2113
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
2114
+ encoder_hidden_states=outputs.encoder_hidden_states,
2115
+ encoder_attentions=outputs.encoder_attentions,
2116
+ )
2117
+
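The question-answering head above splits a two-unit projection into start and end logits and averages two cross-entropy losses, clamping out-of-range positions to an ignored index. A compact sketch of that loss computation on dummy tensors:

```python
import torch
from torch.nn import CrossEntropyLoss

bsz, seq_len = 2, 8
start_logits = torch.randn(bsz, seq_len)
end_logits = torch.randn(bsz, seq_len)
start_positions = torch.tensor([1, 9])   # 9 lies outside the sequence on purpose
end_positions = torch.tensor([3, 9])

# Positions outside the sequence are clamped to `ignored_index` and then skipped
# by the loss, exactly as in the forward pass above.
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2
```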
2118
+
2119
+ # Copied from transformers.models.bart.modeling_bart.BartDecoderWrapper with Bart->MBart
2120
+ class UnimerMBartDecoderWrapper(UnimerMBartPreTrainedModel):
2121
+ """
2122
+ This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
2123
+ used in combination with the [`EncoderDecoderModel`] framework.
2124
+ """
2125
+
2126
+ def __init__(self, config):
2127
+ super().__init__(config)
2128
+ self.decoder = UnimerMBartDecoder(config)
2129
+
2130
+ def forward(self, *args, **kwargs):
2131
+ return self.decoder(*args, **kwargs)
2132
+
2133
+
2134
+ # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->MBart, facebook/bart-base->facebook/mbart-large-cc25
2135
+ class UnimerMBartForCausalLM(UnimerMBartPreTrainedModel, GenerationMixin):
2136
+ _tied_weights_keys = ["lm_head.weight"]
2137
+
2138
+ def __init__(self, config):
2139
+ config = copy.deepcopy(config)
2140
+ config.is_decoder = True
2141
+ config.is_encoder_decoder = False
2142
+ super().__init__(config)
2143
+ self.model = UnimerMBartDecoderWrapper(config)
2144
+
2145
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
2146
+
2147
+ # Initialize weights and apply final processing
2148
+ self.post_init()
2149
+
2150
+ def get_input_embeddings(self):
2151
+ return self.model.decoder.embed_tokens
2152
+
2153
+ def set_input_embeddings(self, value):
2154
+ self.model.decoder.embed_tokens = value
2155
+
2156
+ def get_output_embeddings(self):
2157
+ return self.lm_head
2158
+
2159
+ def set_output_embeddings(self, new_embeddings):
2160
+ self.lm_head = new_embeddings
2161
+
2162
+ def set_decoder(self, decoder):
2163
+ self.model.decoder = decoder
2164
+
2165
+ def get_decoder(self):
2166
+ return self.model.decoder
2167
+
2168
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentionsAndCounting, config_class=_CONFIG_FOR_DOC)
2169
+ def forward(
2170
+ self,
2171
+ input_ids: torch.LongTensor = None,
2172
+ attention_mask: Optional[torch.Tensor] = None,
2173
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
2174
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
2175
+ head_mask: Optional[torch.Tensor] = None,
2176
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
2177
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
2178
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2179
+ labels: Optional[torch.LongTensor] = None,
2180
+ use_cache: Optional[bool] = None,
2181
+ output_attentions: Optional[bool] = None,
2182
+ output_hidden_states: Optional[bool] = None,
2183
+ return_dict: Optional[bool] = None,
2184
+ count_gt: Optional[torch.LongTensor] = None,
2185
+ ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
2186
+ r"""
2187
+ Args:
2188
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
2189
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
2190
+ provide it.
2191
+
2192
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
2193
+ [`PreTrainedTokenizer.__call__`] for details.
2194
+
2195
+ [What are input IDs?](../glossary#input-ids)
2196
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
2197
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
2198
+
2199
+ - 1 for tokens that are **not masked**,
2200
+ - 0 for tokens that are **masked**.
2201
+
2202
+ [What are attention masks?](../glossary#attention-mask)
2203
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
2204
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
2205
+ if the model is configured as a decoder.
2206
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
2207
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
2208
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
2209
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
2210
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
2211
+
2212
+ - 1 indicates the head is **not masked**,
2213
+ - 0 indicates the head is **masked**.
2214
+
2215
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
2216
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
2217
+
2218
+ - 1 indicates the head is **not masked**,
2219
+ - 0 indicates the head is **masked**.
2220
+
2221
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
2222
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
2223
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
2224
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
2225
+ tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
2226
+
2227
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
2228
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
2229
+
2230
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
2231
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
2232
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
2233
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
2234
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
2235
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
2236
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
2237
+ use_cache (`bool`, *optional*):
2238
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
2239
+ (see `past_key_values`).
2240
+
2241
+ - 1 for tokens that are **not masked**,
2242
+ - 0 for tokens that are **masked**.
2243
+ output_attentions (`bool`, *optional*):
2244
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
2245
+ returned tensors for more detail.
2246
+ output_hidden_states (`bool`, *optional*):
2247
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
2248
+ for more detail.
2249
+ return_dict (`bool`, *optional*):
2250
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
2251
+
2252
+ Returns:
2253
+
2254
+ Example:
2255
+
2256
+ ```python
2257
+ >>> from transformers import AutoTokenizer, MBartForCausalLM
2258
+
2259
+ >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
2260
+ >>> model = MBartForCausalLM.from_pretrained("facebook/mbart-large-cc25", add_cross_attention=False)
2261
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
2262
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
2263
+ >>> outputs = model(**inputs)
2264
+
2265
+ >>> logits = outputs.logits
2266
+ >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
2267
+ >>> list(logits.shape) == expected_shape
2268
+ True
2269
+ ```"""
+ 
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ 
+         count_pred = None
+ 
+         # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+         outputs = self.model.decoder(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             count_pred=count_pred,
+             encoder_hidden_states=encoder_hidden_states,
+             encoder_attention_mask=encoder_attention_mask,
+             head_mask=head_mask,
+             cross_attn_head_mask=cross_attn_head_mask,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+ 
+         logits = self.lm_head(outputs[0])
+ 
+         loss = None
+         if labels is not None:
+             labels = labels.to(logits.device)
+             loss_fct = CrossEntropyLoss()
+             loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
+ 
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return (loss,) + output if loss is not None else output
+ 
+         return CausalLMOutputWithCrossAttentionsAndCounting(
+             loss=loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+             cross_attentions=outputs.cross_attentions,
+             counting=count_pred,
+         )
+ 
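The loss computed above relies on `torch.nn.CrossEntropyLoss` ignoring positions labelled `-100` (its default `ignore_index`), exactly as described in the `labels` docstring. A minimal, self-contained sketch with toy tensors; the shapes and label values are illustrative only and not taken from the model:

```python
import torch
from torch.nn import CrossEntropyLoss

# Toy dimensions: batch=1, seq_len=4, vocab_size=10 (illustrative only).
logits = torch.randn(1, 4, 10)
labels = torch.tensor([[3, 7, -100, 2]])  # the third position is excluded from the loss

# Same flattening as in the forward pass above: (batch * seq_len, vocab_size) vs. (batch * seq_len,).
loss = CrossEntropyLoss()(logits.view(-1, 10), labels.view(-1))
print(loss.item())
```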
+     def prepare_inputs_for_generation(
+         self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
+     ):
+         # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+         if attention_mask is None:
+             attention_mask = input_ids.new_ones(input_ids.shape)
+ 
+         if past_key_values:
+             past_length = past_key_values[0][0].shape[2]
+ 
+             # Some generation methods already pass only the last input ID
+             if input_ids.shape[1] > past_length:
+                 remove_prefix_length = past_length
+             else:
+                 # Default to old behavior: keep only final ID
+                 remove_prefix_length = input_ids.shape[1] - 1
+ 
+             input_ids = input_ids[:, remove_prefix_length:]
+         # first step, decoder_cached_states are empty
+         return {
+             "input_ids": input_ids,  # encoder_outputs is defined. input_ids not needed
+             "attention_mask": attention_mask,
+             "past_key_values": past_key_values,
+             "use_cache": use_cache,
+         }
+ 
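The cache branch above only matters once `past_key_values` exists: from then on the decoder is fed only the tokens the cache has not yet seen, which during ordinary greedy or beam decoding is just the newest one. A small sketch of that trimming logic in isolation (the helper name and the dummy token IDs are made up for illustration):

```python
import torch

def trim_input_ids(input_ids: torch.Tensor, past_length: int) -> torch.Tensor:
    # Mirrors the branch above: drop the prefix already covered by past_key_values.
    if input_ids.shape[1] > past_length:
        remove_prefix_length = past_length
    else:
        # keep only the final ID when the generator already trimmed the rest
        remove_prefix_length = input_ids.shape[1] - 1
    return input_ids[:, remove_prefix_length:]

input_ids = torch.tensor([[0, 5, 17, 42]])               # 4 tokens decoded so far
print(trim_input_ids(input_ids, past_length=3))          # tensor([[42]]) -> only the new token
print(trim_input_ids(input_ids[:, -1:], past_length=4))  # already trimmed -> stays tensor([[42]])
```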
+     @staticmethod
+     def _reorder_cache(past_key_values, beam_idx):
+         reordered_past = ()
+         for layer_past in past_key_values:
+             reordered_past += (
+                 tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+             )
+         return reordered_past
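`_reorder_cache` is what beam search uses to keep the key/value cache aligned with the surviving beams after each step. A standalone sketch with dummy cache tensors, assuming the usual `(batch, heads, seq, head_dim)` layout; a real decoder cache would also carry cross-attention states per layer:

```python
import torch

def reorder_cache(past_key_values, beam_idx):
    # Same index_select-over-the-batch-dimension logic as the static method above.
    return tuple(
        tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
        for layer_past in past_key_values
    )

num_layers, beams, heads, seq, head_dim = 2, 4, 2, 5, 8
past = tuple(
    (torch.randn(beams, heads, seq, head_dim), torch.randn(beams, heads, seq, head_dim))
    for _ in range(num_layers)
)
beam_idx = torch.tensor([2, 2, 0, 1])  # beams 0 and 1 both continue former hypothesis 2

reordered = reorder_cache(past, beam_idx)
assert torch.equal(reordered[0][0][0], past[0][0][2])  # beam 0 now holds hypothesis 2's keys
```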