liger-kernel 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. liger_kernel/chunked_loss/functional.py +2 -0
  2. liger_kernel/chunked_loss/fused_linear_distillation.py +17 -2
  3. liger_kernel/chunked_loss/fused_linear_ppo.py +346 -0
  4. liger_kernel/chunked_loss/grpo_loss.py +134 -60
  5. liger_kernel/chunked_loss/jsd_loss.py +12 -7
  6. liger_kernel/ops/cross_entropy.py +3 -2
  7. liger_kernel/ops/dyt.py +225 -0
  8. liger_kernel/ops/fused_linear_jsd.py +2 -1
  9. liger_kernel/ops/jsd.py +32 -12
  10. liger_kernel/ops/kl_div.py +15 -8
  11. liger_kernel/ops/layer_norm.py +14 -1
  12. liger_kernel/ops/rms_norm.py +12 -1
  13. liger_kernel/transformers/__init__.py +133 -15
  14. liger_kernel/transformers/dyt.py +20 -0
  15. liger_kernel/transformers/functional.py +5 -0
  16. liger_kernel/transformers/gema3_rms.py +8 -0
  17. liger_kernel/transformers/model/gemma.py +17 -20
  18. liger_kernel/transformers/model/gemma2.py +17 -21
  19. liger_kernel/transformers/model/gemma3.py +335 -0
  20. liger_kernel/transformers/model/llama.py +17 -19
  21. liger_kernel/transformers/model/llava.py +369 -0
  22. liger_kernel/transformers/model/loss_utils.py +64 -0
  23. liger_kernel/transformers/model/mistral.py +28 -25
  24. liger_kernel/transformers/model/mixtral.py +20 -26
  25. liger_kernel/transformers/model/mllama.py +17 -19
  26. liger_kernel/transformers/model/olmo2.py +17 -20
  27. liger_kernel/transformers/model/paligemma.py +397 -0
  28. liger_kernel/transformers/model/phi3.py +17 -19
  29. liger_kernel/transformers/model/qwen2.py +17 -19
  30. liger_kernel/transformers/model/qwen2_5_vl.py +9 -10
  31. liger_kernel/transformers/model/qwen2_vl.py +9 -10
  32. liger_kernel/transformers/monkey_patch.py +392 -13
  33. {liger_kernel-0.5.5.dist-info → liger_kernel-0.5.7.dist-info}/METADATA +11 -6
  34. {liger_kernel-0.5.5.dist-info → liger_kernel-0.5.7.dist-info}/RECORD +38 -31
  35. {liger_kernel-0.5.5.dist-info → liger_kernel-0.5.7.dist-info}/WHEEL +1 -1
  36. liger_kernel/chunked_loss/fused_linear_rlhf.py +0 -240
  37. {liger_kernel-0.5.5.dist-info → liger_kernel-0.5.7.dist-info/licenses}/LICENSE +0 -0
  38. {liger_kernel-0.5.5.dist-info → liger_kernel-0.5.7.dist-info/licenses}/NOTICE +0 -0
  39. {liger_kernel-0.5.5.dist-info → liger_kernel-0.5.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,335 @@
1
+ from typing import List
2
+ from typing import Optional
3
+ from typing import Tuple
4
+ from typing import Union
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from transformers.cache_utils import Cache
10
+ from transformers.cache_utils import HybridCache
11
+ from transformers.modeling_outputs import CausalLMOutputWithPast
12
+ from transformers.models.gemma3.modeling_gemma3 import _CONFIG_FOR_DOC
13
+ from transformers.models.gemma3.modeling_gemma3 import GEMMA3_INPUTS_DOCSTRING
14
+ from transformers.models.gemma3.modeling_gemma3 import Gemma3CausalLMOutputWithPast
15
+ from transformers.utils import add_start_docstrings_to_model_forward
16
+ from transformers.utils import is_torchdynamo_compiling
17
+ from transformers.utils import logging
18
+ from transformers.utils import replace_return_docstrings
19
+ from transformers.utils.deprecation import deprecate_kwarg
20
+
21
+ from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
22
+ from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
28
+ @add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
29
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
30
+ def causal_forward(
31
+ self,
32
+ input_ids: torch.LongTensor = None,
33
+ attention_mask: Optional[torch.Tensor] = None,
34
+ position_ids: Optional[torch.LongTensor] = None,
35
+ past_key_values: Optional[HybridCache] = None,
36
+ inputs_embeds: Optional[torch.FloatTensor] = None,
37
+ labels: Optional[torch.LongTensor] = None,
38
+ use_cache: Optional[bool] = None,
39
+ output_attentions: Optional[bool] = None,
40
+ output_hidden_states: Optional[bool] = None,
41
+ return_dict: Optional[bool] = None,
42
+ cache_position: Optional[torch.LongTensor] = None,
43
+ logits_to_keep: Union[int, torch.Tensor] = 0,
44
+ **loss_kwargs,
45
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
46
+ r"""
47
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
48
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
49
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
50
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
51
+
52
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
53
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
54
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
55
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
56
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
57
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
58
+
59
+ Returns:
60
+
61
+ Example:
62
+
63
+ ```python
64
+ >>> from transformers import AutoTokenizer, Gemma3ForCausalLM
65
+
66
+ >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
67
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
68
+
69
+ >>> prompt = "What is your favorite condiment?"
70
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
71
+
72
+ >>> # Generate
73
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
74
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
75
+ "What is your favorite condiment?"
76
+ ```"""
77
+
78
+ if self.training and self.config._attn_implementation != "eager":
79
+ logger.warning_once(
80
+ "It is strongly recommended to train Gemma3 models with the `eager` attention implementation "
81
+ f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
82
+ )
83
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
84
+ output_hidden_states = (
85
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
86
+ )
87
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
88
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
89
+ outputs = self.model(
90
+ input_ids=input_ids,
91
+ attention_mask=attention_mask,
92
+ position_ids=position_ids,
93
+ past_key_values=past_key_values,
94
+ inputs_embeds=inputs_embeds,
95
+ use_cache=use_cache,
96
+ output_attentions=output_attentions,
97
+ output_hidden_states=output_hidden_states,
98
+ return_dict=return_dict,
99
+ cache_position=cache_position,
100
+ **loss_kwargs,
101
+ )
102
+
103
+ hidden_states = outputs[0]
104
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
105
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
106
+ kept_hidden_states = hidden_states[:, slice_indices, :]
107
+ loss = None
108
+ logits = None
109
+ if self.training and (labels is not None):
110
+ loss = LigerForCausalLMLoss(
111
+ hidden_states=kept_hidden_states,
112
+ lm_head_weight=self.lm_head.weight,
113
+ labels=labels,
114
+ hidden_size=self.config.hidden_size,
115
+ final_logit_softcapping=self.config.final_logit_softcapping,
116
+ **loss_kwargs,
117
+ )
118
+
119
+ else:
120
+ logits = self.lm_head(kept_hidden_states)
121
+ if self.config.final_logit_softcapping is not None:
122
+ logits = logits / self.config.final_logit_softcapping
123
+ logits = torch.tanh(logits)
124
+ logits = logits * self.config.final_logit_softcapping
125
+ if labels is not None:
126
+ loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
127
+
128
+ if not return_dict:
129
+ output = (logits,) + outputs[1:]
130
+ return (loss,) + output if loss is not None else output
131
+
132
+ return CausalLMOutputWithPast(
133
+ loss=loss,
134
+ logits=logits,
135
+ past_key_values=outputs.past_key_values,
136
+ hidden_states=outputs.hidden_states,
137
+ attentions=outputs.attentions,
138
+ )
139
+
140
+
141
+ @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
142
+ @add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
143
+ @replace_return_docstrings(output_type=Gemma3CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
144
+ def multimodal_forward(
145
+ self,
146
+ input_ids: torch.LongTensor = None,
147
+ pixel_values: torch.FloatTensor = None,
148
+ attention_mask: Optional[torch.Tensor] = None,
149
+ position_ids: Optional[torch.LongTensor] = None,
150
+ past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
151
+ token_type_ids: Optional[torch.LongTensor] = None,
152
+ cache_position: Optional[torch.LongTensor] = None,
153
+ inputs_embeds: Optional[torch.FloatTensor] = None,
154
+ labels: Optional[torch.LongTensor] = None,
155
+ use_cache: Optional[bool] = None,
156
+ output_attentions: Optional[bool] = None,
157
+ output_hidden_states: Optional[bool] = None,
158
+ return_dict: Optional[bool] = None,
159
+ logits_to_keep: Union[int, torch.Tensor] = 0,
160
+ **lm_kwargs,
161
+ ) -> Union[Tuple, Gemma3CausalLMOutputWithPast]:
162
+ r"""
163
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
164
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
165
+ config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
166
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
167
+
168
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
169
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
170
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
171
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
172
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
173
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
174
+
175
+ Returns:
176
+
177
+ Example:
178
+
179
+ ```python
180
+ >>> from PIL import Image
181
+ >>> import requests
182
+ >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
183
+
184
+ >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/Gemma3-test-224px-hf")
185
+ >>> processor = AutoProcessor.from_pretrained("google/Gemma3-test-224px-hf")
186
+
187
+ >>> prompt = "answer en Where is the cow standing?"
188
+ >>> url = "https://huggingface.co/gv-hf/Gemma3-test-224px-hf/resolve/main/cow_beach_1.png"
189
+ >>> image = Image.open(requests.get(url, stream=True).raw)
190
+
191
+ >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
192
+
193
+ >>> # Generate
194
+ >>> generate_ids = model.generate(**inputs, max_length=30)
195
+ >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
196
+ "answer en Where is the cow standing?\nbeach"
197
+ ```"""
198
+
199
+ if (input_ids is None) ^ (inputs_embeds is not None):
200
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
201
+
202
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
203
+ output_hidden_states = (
204
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
205
+ )
206
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
207
+
208
+ is_training = token_type_ids is not None and labels is not None
209
+
210
+ # Replace image id woth PAD if the image token if OOV, to avoid index-errors
211
+ if input_ids is not None and self.config.image_token_index >= self.vocab_size:
212
+ special_image_mask = input_ids == self.config.image_token_index
213
+ llm_input_ids = input_ids.clone()
214
+ llm_input_ids[special_image_mask] = 0
215
+ else:
216
+ llm_input_ids = input_ids
217
+
218
+ if inputs_embeds is None:
219
+ inputs_embeds = self.get_input_embeddings()(llm_input_ids)
220
+
221
+ if cache_position is None:
222
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
223
+ cache_position = torch.arange(
224
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
225
+ )
226
+
227
+ if position_ids is None:
228
+ position_ids = cache_position.unsqueeze(0) + 1 # Gemma3 positions are 1-indexed
229
+
230
+ # Merge text and images
231
+ if pixel_values is not None:
232
+ image_features = self.get_image_features(pixel_values)
233
+
234
+ if input_ids is None:
235
+ special_image_mask = inputs_embeds == self.get_input_embeddings()(
236
+ torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device)
237
+ )
238
+ else:
239
+ special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
240
+ special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
241
+
242
+ if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
243
+ image_tokens_in_text = (special_image_mask).sum(dim=1).sum(dim=0)[0]
244
+ raise ValueError(
245
+ f"Number of images does not match number of special image tokens in the input text. "
246
+ f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} "
247
+ "tokens from image embeddings."
248
+ )
249
+ image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
250
+ inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
251
+
252
+ # mask out pad-token-ids in labels for BC
253
+ if labels is not None and self.pad_token_id in labels:
254
+ logger.warning_once(
255
+ "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. "
256
+ "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
257
+ )
258
+ labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels)
259
+
260
+ causal_mask = self._update_causal_mask(
261
+ attention_mask, token_type_ids, past_key_values, cache_position, inputs_embeds, is_training
262
+ )
263
+ outputs = self.language_model.model(
264
+ attention_mask=causal_mask,
265
+ position_ids=position_ids,
266
+ past_key_values=past_key_values,
267
+ inputs_embeds=inputs_embeds,
268
+ use_cache=use_cache,
269
+ output_attentions=output_attentions,
270
+ output_hidden_states=output_hidden_states,
271
+ return_dict=return_dict,
272
+ cache_position=cache_position,
273
+ logits_to_keep=logits_to_keep,
274
+ **lm_kwargs,
275
+ )
276
+
277
+ hidden_states = outputs[0]
278
+ loss = None
279
+ logits = None
280
+
281
+ if self.training and (labels is not None):
282
+ shift_hidden_states = hidden_states[..., :-1, :]
283
+ shift_labels = labels[..., 1:]
284
+
285
+ hidden_device = shift_hidden_states.device
286
+ if attention_mask is not None:
287
+ # we use the input attention mask to shift the hidden_states and labels, because it is 2D.
288
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
289
+ shift_attention_mask = attention_mask[:, -shift_hidden_states.shape[1] :].to(hidden_device)
290
+ shift_hidden_states = shift_hidden_states[shift_attention_mask.to(hidden_device) != 0].contiguous()
291
+ shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
292
+ else:
293
+ shift_hidden_states = shift_hidden_states.contiguous()
294
+ shift_labels = shift_labels.contiguous()
295
+
296
+ # Flatten hidden state
297
+ shift_hidden_states = shift_hidden_states.view(-1, self.config.text_config.hidden_size)
298
+ shift_labels = shift_labels.view(-1).to(hidden_device)
299
+
300
+ lce = LigerFusedLinearCrossEntropyLoss()
301
+ loss = lce(self.language_model.lm_head.weight, shift_hidden_states, shift_labels)
302
+ else:
303
+ logits = self.language_model.lm_head(hidden_states)
304
+ if labels is not None:
305
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
306
+ logits = logits.float()
307
+ shift_logits = logits[..., :-1, :]
308
+ shift_labels = labels[..., 1:]
309
+ if attention_mask is not None:
310
+ # we use the input attention mask to shift the logits and labels, because it is 2D.
311
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
312
+ shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
313
+ shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
314
+ shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
315
+ else:
316
+ shift_logits = shift_logits.contiguous()
317
+ shift_labels = shift_labels.contiguous()
318
+ # Flatten the tokens
319
+ loss_fct = nn.CrossEntropyLoss()
320
+
321
+ flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
322
+ flat_labels = shift_labels.view(-1).to(shift_logits.device)
323
+ loss = loss_fct(flat_logits, flat_labels)
324
+ if not return_dict:
325
+ output = (logits,) + outputs[1:]
326
+ return (loss,) + output if loss is not None else output
327
+
328
+ return Gemma3CausalLMOutputWithPast(
329
+ loss=loss,
330
+ logits=logits,
331
+ past_key_values=outputs.past_key_values,
332
+ hidden_states=outputs.hidden_states,
333
+ attentions=outputs.attentions,
334
+ image_hidden_states=image_features if pixel_values is not None else None,
335
+ )
@@ -13,8 +13,10 @@ from transformers.models.llama.modeling_llama import _CONFIG_FOR_DOC
13
13
  from transformers.models.llama.modeling_llama import LLAMA_INPUTS_DOCSTRING
14
14
  from transformers.utils import add_start_docstrings_to_model_forward
15
15
  from transformers.utils import replace_return_docstrings
16
+ from transformers.utils.deprecation import deprecate_kwarg
16
17
 
17
18
  from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
19
+ from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
18
20
 
19
21
  if TYPE_CHECKING:
20
22
  from transformers.cache_utils import Cache
@@ -134,6 +136,7 @@ def lce_forward_deprecated(
134
136
  )
135
137
 
136
138
 
139
+ @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
137
140
  @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
138
141
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
139
142
  def lce_forward(
@@ -149,7 +152,7 @@ def lce_forward(
149
152
  output_hidden_states: Optional[bool] = None,
150
153
  return_dict: Optional[bool] = None,
151
154
  cache_position: Optional[torch.LongTensor] = None,
152
- num_logits_to_keep: int = 0,
155
+ logits_to_keep: Union[int, torch.Tensor] = 0,
153
156
  **loss_kwargs,
154
157
  ) -> Union[Tuple, CausalLMOutputWithPast]:
155
158
  r"""
@@ -159,10 +162,12 @@ def lce_forward(
159
162
  config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
160
163
  (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
161
164
 
162
- num_logits_to_keep (`int`, *optional*):
163
- Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
165
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
166
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
164
167
  `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
165
168
  token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
169
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
170
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
166
171
 
167
172
  Returns:
168
173
 
@@ -212,24 +217,17 @@ def lce_forward(
212
217
  loss = None
213
218
  # if in training mode, don't materialize logits
214
219
  if self.training and (labels is not None):
215
- # We do the same thing as ForCausalLMLoss but using Liger FLCE
216
-
217
- shift_hidden_states = hidden_states[..., :-1, :].contiguous()
218
- shift_labels = labels[..., 1:].contiguous()
219
-
220
- # flatten tokens
221
- shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
222
- shift_labels = shift_labels.view(-1)
223
-
224
- reduction = "sum" if "num_items_in_batch" in loss_kwargs else "mean"
225
- lce = LigerFusedLinearCrossEntropyLoss(reduction=reduction)
226
-
227
- loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
228
- if reduction == "sum":
229
- loss /= loss_kwargs["num_items_in_batch"]
220
+ loss = LigerForCausalLMLoss(
221
+ hidden_states=hidden_states,
222
+ lm_head_weight=self.lm_head.weight,
223
+ labels=labels,
224
+ hidden_size=self.config.hidden_size,
225
+ **loss_kwargs,
226
+ )
230
227
 
231
228
  else: # if in inference mode materialize logits
232
- logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
229
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
230
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
233
231
  if labels is not None:
234
232
  loss = self.loss_function(
235
233
  logits=logits,