PyPI - liger-kernel-nightly - Versions diffs - 0.5.2.dev20241217060137__py3-none-any.whl → 0.5.2.dev20241218221959__py3-none-any.whl - Mend

liger-kernel-nightly 0.5.2.dev20241217060137__py3-none-any.whl → 0.5.2.dev20241218221959__py3-none-any.whl

Files changed (10) hide show

liger_kernel/chunked_loss/cpo_loss.py CHANGED Viewed

@@ -47,6 +47,7 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
         alpha=1.0,
         compute_nll_loss=True,
         compiled=True,
+        is_encoder_decoder=False,
     ):
         return LigerFusedLinearPreferenceBase.forward(
             ctx,
@@ -60,12 +61,13 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             compiled=compiled,
+            is_encoder_decoder=is_encoder_decoder,
         )
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None, None
+        return *grads, None, None, None, None, None, None
 class LigerFusedLinearCPOLoss(torch.nn.Module):
@@ -80,11 +82,16 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
         alpha: float = 1.0,
         compute_nll_loss: bool = True,
         compiled: bool = True,
+        is_encoder_decoder: bool = False,
     ):
         """
         Args:
             ignore_index (int): Index to ignore in the loss.
             beta (float): Weight for the odds ratio loss.
+            alpha (float): Weight for the NLL loss.
+            compute_nll_loss (bool): Whether to compute NLL loss.
+            compiled (bool): Whether to compile the loss function.
+            is_encoder_decoder (bool): Whether the model is an encoder-decoder model.
         """
         super().__init__()
         self.ignore_index = ignore_index
@@ -92,6 +99,7 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
         self.alpha = alpha
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
+        self.is_encoder_decoder = is_encoder_decoder
     def forward(self, lin_weight, _input, target, bias=None):
         return LigerFusedLinearCPOFunction.apply(
@@ -104,4 +112,5 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
             self.alpha,
             self.compute_nll_loss,
             self.compiled,
+            self.is_encoder_decoder,
         )

liger_kernel/chunked_loss/dpo_loss.py CHANGED Viewed

@@ -67,6 +67,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         compute_nll_loss=True,
         compiled=True,
         use_ref_model=True,
+        is_encoder_decoder=False,
     ):
         return LigerFusedLinearPreferenceBase.forward(
             ctx=ctx,
@@ -83,12 +84,13 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
             ref_input=ref_input,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
+            is_encoder_decoder=is_encoder_decoder,
         )
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None, None, None, None, None
+        return *grads, None, None, None, None, None, None, None, None, None
 class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -103,6 +105,7 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         compute_nll_loss: bool = True,
         compiled: bool = True,
         use_ref_model: bool = False,
+        is_encoder_decoder: bool = False,
     ):
         """
         Args:
@@ -111,6 +114,7 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
             compute_nll_loss (bool): Whether to compute the NLL loss.
             compiled (bool): Whether to use the torch compiled kernel.
             use_ref_model (bool): Whether to use a reference model for the DPO loss.
+            is_encoder_decoder (bool): Whether the model is an encoder-decoder model.
         """
         super().__init__()
         self.ignore_index = ignore_index
@@ -118,6 +122,7 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
         self.use_ref_model = use_ref_model
+        self.is_encoder_decoder = is_encoder_decoder
     def forward(
         self,
@@ -142,4 +147,5 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
             self.compute_nll_loss,
             self.compiled,
             self.use_ref_model,
+            self.is_encoder_decoder,
         )

liger_kernel/chunked_loss/fused_linear_preference.py CHANGED Viewed

@@ -26,6 +26,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         ignore_index=-100,
         alpha=1.0,
         beta=0.1,
+        is_encoder_decoder=False,
         compute_nll_loss=True,
         compiled=True,
         use_ref_model=False,
@@ -56,6 +57,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             ignore_index (int): Index to ignore for loss computation.
             alpha (float): Weight for the NLL loss.
             beta (float): Weight for the preference loss.
+            is_encoder_decoder (bool): Whether the model is an encoder-decoder model.
             compute_nll_loss (bool): Whether to compute NLL loss.
             compiled (bool): Whether to use torch compile for chunk accumulation.
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
@@ -94,6 +96,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             use_ref_model=use_ref_model,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
+            is_encoder_decoder=is_encoder_decoder,
             **loss_kwargs,
         )
@@ -282,33 +285,48 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         bias=None,
         ignore_index=-100,
         compute_nll_loss=True,
+        is_encoder_decoder=False,
     ):
-        len_chosen_chunk = target_chunk.shape[0] // 2
+        # Calculate logits and log probabilities
         logits_chunk = input_chunk @ weight.t()
         if bias is not None:
-            logits_chunk = logits_chunk + bias
+            logits_chunk += bias
         log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
+        # Split chunk into chosen and rejected portions
+        len_chosen_chunk = target_chunk.shape[0] // 2
+        # Handle sequence shifting for non-encoder-decoder models
+        if not is_encoder_decoder:
+            logits_chunk = logits_chunk[:, :-1]
+            log_probs_chunk = log_probs_chunk[:, :-1]
+            target_chunk = target_chunk[:, 1:]
+        # Calculate NLL loss for chosen sequences
         chosen_nll_loss = 0.0
         if compute_nll_loss:
+            chosen_probs = log_probs_chunk[:len_chosen_chunk]
+            chosen_targets = target_chunk[:len_chosen_chunk]
             chosen_nll_loss = F.nll_loss(
-                log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
-                target_chunk[:len_chosen_chunk].view(-1),
+                chosen_probs.reshape(-1, chosen_probs.shape[-1]),
+                chosen_targets.reshape(-1),
                 reduction="sum",
                 ignore_index=ignore_index,
             )
+        # Calculate per-token log probabilities
         loss_mask = target_chunk != ignore_index
         label_chunk = torch.where(loss_mask, target_chunk, 0)
         per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
             -1
         )
         average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
-        chosen_logps = average_log_prob[:len_chosen_chunk]
-        rejected_logps = average_log_prob[len_chosen_chunk:]
+        # Split results for chosen and rejected
+        chosen_logps, rejected_logps = (
+            average_log_prob[:len_chosen_chunk],
+            average_log_prob[len_chosen_chunk:],
+        )
         chosen_logits = logits_chunk[:len_chosen_chunk]
         rejected_logits = logits_chunk[len_chosen_chunk:]
@@ -331,6 +349,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         ignore_index=-100,
         alpha=1.0,
         beta=0.1,
+        is_encoder_decoder=False,
         compute_nll_loss=True,
         use_ref_model=False,
         ref_input_chunk=None,
@@ -350,6 +369,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             ignore_index (int): Index to ignore for loss computation.
             alpha (float): Weight for the NLL loss.
             beta (float): Weight for the preference loss.
+            is_encoder_decoder (bool): Whether the model is an encoder-decoder model.
             compute_nll_loss (bool): Whether to compute NLL loss.
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
@@ -369,33 +389,43 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             bias=bias,
             ignore_index=ignore_index,
             compute_nll_loss=compute_nll_loss,
+            is_encoder_decoder=is_encoder_decoder,
         )
-        chosen_nll_loss = (
-            chosen_nll_loss
-            / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
-        )
-        chosen_logits_mean = chosen_logits.sum() / (
-            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
-        )
-        rejected_logits_mean = rejected_logits.sum() / (
-            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
-        )
+        if not is_encoder_decoder:
+            chosen_nll_loss = (
+                chosen_nll_loss
+                / (full_target[: full_target.shape[0] // 2, 1:] != ignore_index).sum()
+            )
+            chosen_logits_mean = chosen_logits.sum() / (
+                full_target.shape[0] // 2 * (input_chunk.shape[1] - 1) * weight.shape[0]
+            )
+            rejected_logits_mean = rejected_logits.sum() / (
+                full_target.shape[0] // 2 * (input_chunk.shape[1] - 1) * weight.shape[0]
+            )
+        else:
+            chosen_nll_loss = (
+                chosen_nll_loss
+                / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
+            )
+            chosen_logits_mean = chosen_logits.sum() / (
+                full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+            )
+            rejected_logits_mean = rejected_logits.sum() / (
+                full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+            )
         if use_ref_model:
             with torch.no_grad():
-                (
-                    ref_chosen_logps,
-                    ref_rejected_logps,
-                    ref_chosen_logits,
-                    ref_rejected_logits,
-                    ref_chosen_nll_loss,
-                ) = LigerFusedLinearPreferenceBase.chunk_forward(
-                    ref_input_chunk,
-                    ref_weight,
-                    target_chunk,
-                    ref_bias,
-                    ignore_index=ignore_index,
-                    compute_nll_loss=False,  # We don't need NLL loss for the reference model
+                (ref_chosen_logps, ref_rejected_logps, _, _, _) = (
+                    LigerFusedLinearPreferenceBase.chunk_forward(
+                        ref_input_chunk,
+                        ref_weight,
+                        target_chunk,
+                        ref_bias,
+                        ignore_index=ignore_index,
+                        compute_nll_loss=False,  # We don't need NLL loss for the reference model
+                        is_encoder_decoder=is_encoder_decoder,  # assume the ref model is the same family
+                    )
                 )
             loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
             loss_kwargs["ref_rejected_logps"] = ref_rejected_logps

liger_kernel/chunked_loss/orpo_loss.py CHANGED Viewed

@@ -57,6 +57,7 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
         beta=0.1,
         compute_nll_loss=True,
         compiled=True,
+        is_encoder_decoder=False,
     ):
         return LigerFusedLinearPreferenceBase.forward(
             ctx=ctx,
@@ -69,12 +70,13 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             compiled=compiled,
+            is_encoder_decoder=is_encoder_decoder,
         )
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None
+        return *grads, None, None, None, None, None
 class LigerFusedLinearORPOLoss(torch.nn.Module):
@@ -88,17 +90,22 @@ class LigerFusedLinearORPOLoss(torch.nn.Module):
         beta: float = 0.1,
         compute_nll_loss: bool = True,
         compiled: bool = True,
+        is_encoder_decoder: bool = False,
     ):
         """
         Args:
             ignore_index (int): Index to ignore in the loss.
             beta (float): Weight for the odds ratio loss.
+            compute_nll_loss (bool): Whether to compute NLL loss.
+            compiled (bool): Whether to compile the loss function.
+            is_encoder_decoder (bool): Whether the model is an encoder-decoder model.
         """
         super().__init__()
         self.ignore_index = ignore_index
         self.beta = beta
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
+        self.is_encoder_decoder = is_encoder_decoder
     def forward(self, lin_weight, _input, target, bias=None):
         return LigerFusedLinearORPOFunction.apply(
@@ -110,4 +117,5 @@ class LigerFusedLinearORPOLoss(torch.nn.Module):
             self.beta,
             self.compute_nll_loss,
             self.compiled,
+            self.is_encoder_decoder,
         )

{liger_kernel_nightly-0.5.2.dev20241217060137.dist-info → liger_kernel_nightly-0.5.2.dev20241218221959.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.2.dev20241217060137
+Version: 0.5.2.dev20241218221959
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.2.dev20241217060137.dist-info → liger_kernel_nightly-0.5.2.dev20241218221959.dist-info}/RECORD RENAMED Viewed

@@ -3,12 +3,12 @@ liger_kernel/env_report.py,sha256=ok9PMXtO-8uLj_feCJI4h9hz2NtolZ2AG_OJTW5qmo4,18
 liger_kernel/utils.py,sha256=HJa-xVKOohDn6pLVIx-Fv0V9h0QAL3qZGQNRICI-OpI,249
 liger_kernel/chunked_loss/README.md,sha256=K6rucm6nqHpWCmxUOhBYcE3apwQxAy0TfRUippR7Icw,2243
 liger_kernel/chunked_loss/__init__.py,sha256=R2wCcz4Y0kTAve926DH3k182XKezpXeACMHj05g9Mm8,346
-liger_kernel/chunked_loss/cpo_loss.py,sha256=Qu1Ul2A12sp6CqIT-atPbHWFb_LLtINEA9mOpIRx_0g,3097
-liger_kernel/chunked_loss/dpo_loss.py,sha256=9S67SzKkLyoBmHGx8bkmthSNHlCT2ikBy9CCdb7wGj0,4381
+liger_kernel/chunked_loss/cpo_loss.py,sha256=jtA7jA92Gv2raLzJ2QScPqgyi-S04a6aKUMRROdR3-w,3591
+liger_kernel/chunked_loss/dpo_loss.py,sha256=tpBw6fAVq2mujo0_NS98L1NP--m1hYqi1qHGAyfg52g,4690
 liger_kernel/chunked_loss/functional.py,sha256=9Gr-YXIuEzEJkBUhDx3G2fuQayckLor7cC7svhmPML4,549
 liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=2BH6DCPjsR2zS6zcwFPcIIZRhLF8SohjGdKsAJ_301o,10222
-liger_kernel/chunked_loss/fused_linear_preference.py,sha256=AsovMdfsOjgWVxtDhZ_rXqpahMsKTg8YueXnZcHt1XQ,16376
-liger_kernel/chunked_loss/orpo_loss.py,sha256=ZuKGjbkIYzV4UzvupNdq6vyxCp7-BztQkUt8ZnFvKos,3531
+liger_kernel/chunked_loss/fused_linear_preference.py,sha256=iHegoQ18amhXzMNLNyzntxmtz_6JSOgougHTN_rbwfY,17936
+liger_kernel/chunked_loss/orpo_loss.py,sha256=XkVnsJ6Qmn3lxvprXRiySl9Hbx6-UNzWDCFXu_pY6Uc,3973
 liger_kernel/chunked_loss/simpo_loss.py,sha256=Wa4LOlDG9PbJkOOkKg8hbKvnKgg7OTBz6-qIkwPK1yw,3275
 liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 liger_kernel/ops/cross_entropy.py,sha256=oG5hfrlmnlF5lOoZRhHRglObxgH4B0KadjWMJj9EWPM,15860
@@ -58,9 +58,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=c4OQVJmhNOloj0JYSEc0j_cQuBb
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=O2k2vdHl-O1S-U61aEmyUFu3QrEuNAipQa2oUBb3HAA,7679
 liger_kernel/triton/__init__.py,sha256=yfRe0zMb47QnqjecZWG7LnanfCTzeku7SgWRAwNVmzU,101
 liger_kernel/triton/monkey_patch.py,sha256=5BcGKTtdqeYchypBIBopGIWPx1-cFALz7sOKoEsqXJ0,1584
-liger_kernel_nightly-0.5.2.dev20241217060137.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.5.2.dev20241217060137.dist-info/METADATA,sha256=s4F2CNLYmapm4S_h0kRqQVPItXe5hHkR81gBQL6P1L8,21055
-liger_kernel_nightly-0.5.2.dev20241217060137.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.5.2.dev20241217060137.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-liger_kernel_nightly-0.5.2.dev20241217060137.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.5.2.dev20241217060137.dist-info/RECORD,,
+liger_kernel_nightly-0.5.2.dev20241218221959.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.5.2.dev20241218221959.dist-info/METADATA,sha256=3Af4_e7ToJ34MQGPqIg94fXvRKApkHFb6dV7evsm494,21055
+liger_kernel_nightly-0.5.2.dev20241218221959.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.5.2.dev20241218221959.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+liger_kernel_nightly-0.5.2.dev20241218221959.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.5.2.dev20241218221959.dist-info/RECORD,,

{liger_kernel_nightly-0.5.2.dev20241217060137.dist-info → liger_kernel_nightly-0.5.2.dev20241218221959.dist-info}/LICENSE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.2.dev20241217060137.dist-info → liger_kernel_nightly-0.5.2.dev20241218221959.dist-info}/NOTICE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.2.dev20241217060137.dist-info → liger_kernel_nightly-0.5.2.dev20241218221959.dist-info}/WHEEL RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.2.dev20241217060137.dist-info → liger_kernel_nightly-0.5.2.dev20241218221959.dist-info}/top_level.txt RENAMED Viewed

File without changes

liger-kernel-nightly 0.5.2.dev20241217060137__py3-none-any.whl → 0.5.2.dev20241218221959__py3-none-any.whl

liger-kernel-nightly 0.5.2.dev20241217060137__py3-none-any.whl → 0.5.2.dev20241218221959__py3-none-any.whl