PyPI - liger-kernel-nightly - Versions diffs - 0.5.2.dev20241218221959__tar.gz → 0.5.2.dev20241220004933__tar.gz - Mend

liger-kernel-nightly 0.5.2.dev20241218221959.tar.gz → 0.5.2.dev20241220004933.tar.gz

Files changed (199) hide show

{liger_kernel_nightly-0.5.2.dev20241218221959 → liger_kernel_nightly-0.5.2.dev20241220004933}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.2.dev20241218221959
+Version: 0.5.2.dev20241220004933
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.2.dev20241218221959 → liger_kernel_nightly-0.5.2.dev20241220004933}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "liger_kernel_nightly"
-version = "0.5.2.dev20241218221959"
+version = "0.5.2.dev20241220004933"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

{liger_kernel_nightly-0.5.2.dev20241218221959 → liger_kernel_nightly-0.5.2.dev20241220004933}/src/liger_kernel/chunked_loss/cpo_loss.py RENAMED Viewed

@@ -47,7 +47,6 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
         alpha=1.0,
         compute_nll_loss=True,
         compiled=True,
-        is_encoder_decoder=False,
     ):
         return LigerFusedLinearPreferenceBase.forward(
             ctx,
@@ -61,13 +60,12 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             compiled=compiled,
-            is_encoder_decoder=is_encoder_decoder,
         )
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None, None, None
+        return *grads, None, None, None, None, None
 class LigerFusedLinearCPOLoss(torch.nn.Module):
@@ -82,16 +80,11 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
         alpha: float = 1.0,
         compute_nll_loss: bool = True,
         compiled: bool = True,
-        is_encoder_decoder: bool = False,
     ):
         """
         Args:
             ignore_index (int): Index to ignore in the loss.
             beta (float): Weight for the odds ratio loss.
-            alpha (float): Weight for the NLL loss.
-            compute_nll_loss (bool): Whether to compute NLL loss.
-            compiled (bool): Whether to compile the loss function.
-            is_encoder_decoder (bool): Whether the model is an encoder-decoder model.
         """
         super().__init__()
         self.ignore_index = ignore_index
@@ -99,7 +92,6 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
         self.alpha = alpha
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
-        self.is_encoder_decoder = is_encoder_decoder
     def forward(self, lin_weight, _input, target, bias=None):
         return LigerFusedLinearCPOFunction.apply(
@@ -112,5 +104,4 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
             self.alpha,
             self.compute_nll_loss,
             self.compiled,
-            self.is_encoder_decoder,
         )

{liger_kernel_nightly-0.5.2.dev20241218221959 → liger_kernel_nightly-0.5.2.dev20241220004933}/src/liger_kernel/chunked_loss/dpo_loss.py RENAMED Viewed

@@ -64,10 +64,9 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         ref_bias=None,
         ignore_index=-100,
         beta=0.1,
-        compute_nll_loss=True,
+        compute_nll_loss=False,
         compiled=True,
         use_ref_model=True,
-        is_encoder_decoder=False,
     ):
         return LigerFusedLinearPreferenceBase.forward(
             ctx=ctx,
@@ -84,13 +83,12 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
             ref_input=ref_input,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
-            is_encoder_decoder=is_encoder_decoder,
         )
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None, None, None, None, None, None
+        return *grads, None, None, None, None, None, None, None, None
 class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -102,10 +100,9 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         self,
         ignore_index: int = -100,
         beta: float = 0.1,
-        compute_nll_loss: bool = True,
+        compute_nll_loss: bool = False,
         compiled: bool = True,
         use_ref_model: bool = False,
-        is_encoder_decoder: bool = False,
     ):
         """
         Args:
@@ -114,7 +111,6 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
             compute_nll_loss (bool): Whether to compute the NLL loss.
             compiled (bool): Whether to use the torch compiled kernel.
             use_ref_model (bool): Whether to use a reference model for the DPO loss.
-            is_encoder_decoder (bool): Whether the model is an encoder-decoder model.
         """
         super().__init__()
         self.ignore_index = ignore_index
@@ -122,7 +118,6 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
         self.use_ref_model = use_ref_model
-        self.is_encoder_decoder = is_encoder_decoder
     def forward(
         self,
@@ -147,5 +142,4 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
             self.compute_nll_loss,
             self.compiled,
             self.use_ref_model,
-            self.is_encoder_decoder,
         )

{liger_kernel_nightly-0.5.2.dev20241218221959 → liger_kernel_nightly-0.5.2.dev20241220004933}/src/liger_kernel/chunked_loss/fused_linear_preference.py RENAMED Viewed

@@ -26,7 +26,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         ignore_index=-100,
         alpha=1.0,
         beta=0.1,
-        is_encoder_decoder=False,
         compute_nll_loss=True,
         compiled=True,
         use_ref_model=False,
@@ -57,7 +56,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             ignore_index (int): Index to ignore for loss computation.
             alpha (float): Weight for the NLL loss.
             beta (float): Weight for the preference loss.
-            is_encoder_decoder (bool): Whether the model is an encoder-decoder model.
             compute_nll_loss (bool): Whether to compute NLL loss.
             compiled (bool): Whether to use torch compile for chunk accumulation.
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
@@ -96,7 +94,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             use_ref_model=use_ref_model,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
-            is_encoder_decoder=is_encoder_decoder,
             **loss_kwargs,
         )
@@ -285,48 +282,33 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         bias=None,
         ignore_index=-100,
         compute_nll_loss=True,
-        is_encoder_decoder=False,
     ):
-        # Calculate logits and log probabilities
+        len_chosen_chunk = target_chunk.shape[0] // 2
         logits_chunk = input_chunk @ weight.t()
         if bias is not None:
-            logits_chunk += bias
+            logits_chunk = logits_chunk + bias
         log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
-        # Split chunk into chosen and rejected portions
-        len_chosen_chunk = target_chunk.shape[0] // 2
-        # Handle sequence shifting for non-encoder-decoder models
-        if not is_encoder_decoder:
-            logits_chunk = logits_chunk[:, :-1]
-            log_probs_chunk = log_probs_chunk[:, :-1]
-            target_chunk = target_chunk[:, 1:]
-        # Calculate NLL loss for chosen sequences
         chosen_nll_loss = 0.0
         if compute_nll_loss:
-            chosen_probs = log_probs_chunk[:len_chosen_chunk]
-            chosen_targets = target_chunk[:len_chosen_chunk]
             chosen_nll_loss = F.nll_loss(
-                chosen_probs.reshape(-1, chosen_probs.shape[-1]),
-                chosen_targets.reshape(-1),
+                log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
+                target_chunk[:len_chosen_chunk].view(-1),
                 reduction="sum",
                 ignore_index=ignore_index,
             )
-        # Calculate per-token log probabilities
         loss_mask = target_chunk != ignore_index
         label_chunk = torch.where(loss_mask, target_chunk, 0)
         per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
             -1
         )
         average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
-        # Split results for chosen and rejected
-        chosen_logps, rejected_logps = (
-            average_log_prob[:len_chosen_chunk],
-            average_log_prob[len_chosen_chunk:],
-        )
+        chosen_logps = average_log_prob[:len_chosen_chunk]
+        rejected_logps = average_log_prob[len_chosen_chunk:]
         chosen_logits = logits_chunk[:len_chosen_chunk]
         rejected_logits = logits_chunk[len_chosen_chunk:]
@@ -349,7 +331,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         ignore_index=-100,
         alpha=1.0,
         beta=0.1,
-        is_encoder_decoder=False,
         compute_nll_loss=True,
         use_ref_model=False,
         ref_input_chunk=None,
@@ -369,7 +350,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             ignore_index (int): Index to ignore for loss computation.
             alpha (float): Weight for the NLL loss.
             beta (float): Weight for the preference loss.
-            is_encoder_decoder (bool): Whether the model is an encoder-decoder model.
             compute_nll_loss (bool): Whether to compute NLL loss.
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
@@ -389,43 +369,33 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             bias=bias,
             ignore_index=ignore_index,
             compute_nll_loss=compute_nll_loss,
-            is_encoder_decoder=is_encoder_decoder,
         )
-        if not is_encoder_decoder:
-            chosen_nll_loss = (
-                chosen_nll_loss
-                / (full_target[: full_target.shape[0] // 2, 1:] != ignore_index).sum()
-            )
-            chosen_logits_mean = chosen_logits.sum() / (
-                full_target.shape[0] // 2 * (input_chunk.shape[1] - 1) * weight.shape[0]
-            )
-            rejected_logits_mean = rejected_logits.sum() / (
-                full_target.shape[0] // 2 * (input_chunk.shape[1] - 1) * weight.shape[0]
-            )
-        else:
-            chosen_nll_loss = (
-                chosen_nll_loss
-                / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
-            )
-            chosen_logits_mean = chosen_logits.sum() / (
-                full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
-            )
-            rejected_logits_mean = rejected_logits.sum() / (
-                full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
-            )
+        chosen_nll_loss = (
+            chosen_nll_loss
+            / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
+        )
+        chosen_logits_mean = chosen_logits.sum() / (
+            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+        )
+        rejected_logits_mean = rejected_logits.sum() / (
+            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+        )
         if use_ref_model:
             with torch.no_grad():
-                (ref_chosen_logps, ref_rejected_logps, _, _, _) = (
-                    LigerFusedLinearPreferenceBase.chunk_forward(
-                        ref_input_chunk,
-                        ref_weight,
-                        target_chunk,
-                        ref_bias,
-                        ignore_index=ignore_index,
-                        compute_nll_loss=False,  # We don't need NLL loss for the reference model
-                        is_encoder_decoder=is_encoder_decoder,  # assume the ref model is the same family
-                    )
+                (
+                    ref_chosen_logps,
+                    ref_rejected_logps,
+                    ref_chosen_logits,
+                    ref_rejected_logits,
+                    ref_chosen_nll_loss,
+                ) = LigerFusedLinearPreferenceBase.chunk_forward(
+                    ref_input_chunk,
+                    ref_weight,
+                    target_chunk,
+                    ref_bias,
+                    ignore_index=ignore_index,
+                    compute_nll_loss=False,  # We don't need NLL loss for the reference model
                 )
             loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
             loss_kwargs["ref_rejected_logps"] = ref_rejected_logps

{liger_kernel_nightly-0.5.2.dev20241218221959 → liger_kernel_nightly-0.5.2.dev20241220004933}/src/liger_kernel/chunked_loss/orpo_loss.py RENAMED Viewed

@@ -57,7 +57,6 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
         beta=0.1,
         compute_nll_loss=True,
         compiled=True,
-        is_encoder_decoder=False,
     ):
         return LigerFusedLinearPreferenceBase.forward(
             ctx=ctx,
@@ -70,13 +69,12 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             compiled=compiled,
-            is_encoder_decoder=is_encoder_decoder,
         )
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None, None
+        return *grads, None, None, None, None
 class LigerFusedLinearORPOLoss(torch.nn.Module):
@@ -90,22 +88,17 @@ class LigerFusedLinearORPOLoss(torch.nn.Module):
         beta: float = 0.1,
         compute_nll_loss: bool = True,
         compiled: bool = True,
-        is_encoder_decoder: bool = False,
     ):
         """
         Args:
             ignore_index (int): Index to ignore in the loss.
             beta (float): Weight for the odds ratio loss.
-            compute_nll_loss (bool): Whether to compute NLL loss.
-            compiled (bool): Whether to compile the loss function.
-            is_encoder_decoder (bool): Whether the model is an encoder-decoder model.
         """
         super().__init__()
         self.ignore_index = ignore_index
         self.beta = beta
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
-        self.is_encoder_decoder = is_encoder_decoder
     def forward(self, lin_weight, _input, target, bias=None):
         return LigerFusedLinearORPOFunction.apply(
@@ -117,5 +110,4 @@ class LigerFusedLinearORPOLoss(torch.nn.Module):
             self.beta,
             self.compute_nll_loss,
             self.compiled,
-            self.is_encoder_decoder,
         )

{liger_kernel_nightly-0.5.2.dev20241218221959 → liger_kernel_nightly-0.5.2.dev20241220004933}/src/liger_kernel_nightly.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.2.dev20241218221959
+Version: 0.5.2.dev20241220004933
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.2.dev20241218221959 → liger_kernel_nightly-0.5.2.dev20241220004933}/test/chunked_loss/test_dpo_loss.py RENAMED Viewed

@@ -23,10 +23,17 @@ class HFDPOLoss(HFAlignmentLoss):
     """
     def __init__(
-        self, ignore_index: int = -100, beta: float = 0.1, use_ref_model: bool = True
+        self,
+        ignore_index: int = -100,
+        beta: float = 0.1,
+        use_ref_model: bool = True,
+        compute_nll_loss: bool = False,
     ):
         super().__init__(
-            beta=beta, ignore_index=ignore_index, use_ref_model=use_ref_model
+            beta=beta,
+            ignore_index=ignore_index,
+            use_ref_model=use_ref_model,
+            compute_nll_loss=compute_nll_loss,
         )
     def alignment_loss(
@@ -61,6 +68,7 @@ class TorchLMHeadDPO(torch.nn.Module):
         dtype: torch.dtype,
         bias: bool = False,
         ref_bias: bool = False,
+        compute_nll_loss: bool = False,
         ignore_index: int = -100,
         beta: float = 0.1,
     ):
@@ -72,7 +80,10 @@ class TorchLMHeadDPO(torch.nn.Module):
             in_features=H, out_features=V, bias=ref_bias, dtype=dtype
         )
         self.dpo_loss = HFDPOLoss(
-            ignore_index=ignore_index, beta=beta, use_ref_model=True
+            ignore_index=ignore_index,
+            beta=beta,
+            use_ref_model=True,
+            compute_nll_loss=compute_nll_loss,
         ).get_batch_loss_metrics
     def forward(self, x, ref_x, y):
@@ -95,6 +106,7 @@ class LigerLMHeadDPO(torch.nn.Module):
         dtype: torch.dtype,
         bias: bool = False,
         ref_bias: bool = False,
+        compute_nll_loss: bool = False,
         ignore_index: int = -100,
         beta: float = 0.1,
     ):
@@ -106,7 +118,10 @@ class LigerLMHeadDPO(torch.nn.Module):
             in_features=H, out_features=V, bias=ref_bias, dtype=dtype
         )
         self.dpo_loss = LigerFusedLinearDPOLoss(
-            ignore_index=ignore_index, beta=beta, use_ref_model=True
+            ignore_index=ignore_index,
+            beta=beta,
+            use_ref_model=True,
+            compute_nll_loss=compute_nll_loss,
         )
     def forward(self, x, ref_x, y):
@@ -132,14 +147,27 @@ class LigerLMHeadDPO(torch.nn.Module):
     "scalar, dtype, atol, rtol",
     [
         (1.0, torch.bfloat16, 5e-2, 5e-1),
-        (1.0, torch.float32, 2e-2, 5e-1),
+        (1.0, torch.float32, 1e-5, 5e-4),
     ],
 )
 @pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.parametrize("ref_bias", [True, False])
+@pytest.mark.parametrize("compute_nll_loss", [True, False])
 @pytest.mark.parametrize("ignore_index, beta", [(-100, 0.1), (42, 0.2)])
 def test_correctness(
-    B, T, H, V, scalar, dtype, atol, rtol, bias, ref_bias, ignore_index, beta
+    B,
+    T,
+    H,
+    V,
+    scalar,
+    dtype,
+    atol,
+    rtol,
+    bias,
+    ref_bias,
+    compute_nll_loss,
+    ignore_index,
+    beta,
 ):
     B = 2 * B  # dpo loss requires B to be even
@@ -149,6 +177,7 @@ def test_correctness(
         dtype=dtype,
         bias=bias,
         ref_bias=ref_bias,
+        compute_nll_loss=compute_nll_loss,
         ignore_index=ignore_index,
         beta=beta,
     )
@@ -158,6 +187,7 @@ def test_correctness(
         dtype=dtype,
         bias=bias,
         ref_bias=ref_bias,
+        compute_nll_loss=compute_nll_loss,
         ignore_index=ignore_index,
         beta=beta,
     )
@@ -251,7 +281,10 @@ def test_correctness(
 )
 @pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.parametrize("ref_bias", [True, False])
-def test_correctness_functional(B, T, H, V, scalar, dtype, atol, rtol, bias, ref_bias):
+@pytest.mark.parametrize("compute_nll_loss", [True, False])
+def test_correctness_functional(
+    B, T, H, V, scalar, dtype, atol, rtol, bias, ref_bias, compute_nll_loss
+):
     B = 2 * B
     _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar
@@ -290,10 +323,28 @@ def test_correctness_functional(B, T, H, V, scalar, dtype, atol, rtol, bias, ref
     ref_bias2 = _ref_bias.detach().clone().requires_grad_(True) if ref_bias else None
     loss1, aggregated_aux_outputs1 = LigerFusedLinearDPOFunction.apply(
-        input1, weight1, target, bias1, ref_input, ref_weight1, ref_bias1
+        input1,
+        weight1,
+        target,
+        bias1,
+        ref_input,
+        ref_weight1,
+        ref_bias1,
+        -100,
+        0.1,
+        compute_nll_loss,
     )
     loss2, aggregated_aux_outputs2 = liger_fused_linear_dpo(
-        input2, weight2, target, bias2, ref_input, ref_weight2, ref_bias2
+        input2,
+        weight2,
+        target,
+        bias2,
+        ref_input,
+        ref_weight2,
+        ref_bias2,
+        -100,
+        0.1,
+        compute_nll_loss,
     )
     assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol)

{liger_kernel_nightly-0.5.2.dev20241218221959 → liger_kernel_nightly-0.5.2.dev20241220004933}/test/utils.py RENAMED Viewed

@@ -350,13 +350,13 @@ class HFAlignmentLoss:
         beta: float = 0.1,
         ignore_index: int = -100,
         use_ref_model: bool = False,
-        is_encoder_decoder: bool = False,
+        compute_nll_loss: bool = True,
     ):
         self.alpha = alpha
         self.beta = beta
         self.ignore_index = ignore_index
         self.use_ref_model = use_ref_model
-        self.is_encoder_decoder = is_encoder_decoder
+        self.compute_nll_loss = compute_nll_loss
     @abstractmethod
     def alignment_loss(self):
@@ -374,6 +374,7 @@ class HFAlignmentLoss:
             logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
             labels: Labels for which to compute the log probabilities. Label tokens with a value of ignore_index are ignored. Shape: (batch_size, sequence_length)
             average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens.
+            is_encoder_decoder: Whether the model is an encoder-decoder model.
         Returns:
             A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits.
         """
@@ -382,9 +383,6 @@ class HFAlignmentLoss:
                 "Logits (batch and sequence length dim) and labels must have the same shape."
             )
-        if not self.is_encoder_decoder:
-            logits = logits[..., :-1, :].contiguous()
-            labels = labels[..., 1:].contiguous()
         loss_mask = labels != self.ignore_index
         # dummy token; we'll ignore the losses on these tokens later
@@ -444,9 +442,6 @@ class HFAlignmentLoss:
         def cross_entropy_loss(logits, labels):
             # Flatten the tokens
             loss_fct = nn.CrossEntropyLoss(ignore_index=self.ignore_index)
-            if not self.is_encoder_decoder:
-                logits = logits[..., :-1, :].contiguous()
-                labels = labels[..., 1:].contiguous()
             logits = logits.view(-1, logits.shape[-1])
             labels = labels.view(-1)
             # Enable model parallelism
@@ -455,9 +450,11 @@ class HFAlignmentLoss:
             return loss
         labels = target
-        chosen_nll_loss = cross_entropy_loss(
-            all_logits[:len_chosen], labels[:len_chosen]
-        )
+        chosen_nll_loss = torch.tensor(0.0, device=all_logits.device)
+        if self.compute_nll_loss:
+            chosen_nll_loss = cross_entropy_loss(
+                all_logits[:len_chosen], labels[:len_chosen]
+            )
         all_logps = self.get_batch_logps(
             all_logits,
@@ -468,12 +465,8 @@ class HFAlignmentLoss:
         chosen_logps = all_logps[:len_chosen]
         rejected_logps = all_logps[len_chosen:]
-        if not self.is_encoder_decoder:
-            chosen_logits = all_logits[:len_chosen, :-1]
-            rejected_logits = all_logits[len_chosen:, :-1]
-        else:
-            chosen_logits = all_logits[:len_chosen]
-            rejected_logits = all_logits[len_chosen:]
+        chosen_logits = all_logits[:len_chosen]
+        rejected_logits = all_logits[len_chosen:]
         return (
             chosen_logps,