PyPI - liger-kernel-nightly - Versions diffs - 0.5.2.dev20241212030605__py3-none-any.whl → 0.5.2.dev20241212033924__py3-none-any.whl - Mend

liger-kernel-nightly 0.5.2.dev20241212030605__py3-none-any.whl → 0.5.2.dev20241212033924__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

liger_kernel/chunked_loss/dpo_loss.py CHANGED Viewed

@@ -59,6 +59,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         weight,
         target,
         bias=None,
+        ref_input=None,
         ref_weight=None,
         ref_bias=None,
         ignore_index=-100,
@@ -79,6 +80,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
             compute_nll_loss=compute_nll_loss,
             compiled=compiled,
             use_ref_model=use_ref_model,
+            ref_input=ref_input,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
         )
@@ -86,7 +88,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None, None, None, None
+        return *grads, None, None, None, None, None, None, None, None
 class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -118,13 +120,21 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         self.use_ref_model = use_ref_model
     def forward(
-        self, lin_weight, _input, target, bias=None, ref_weight=None, ref_bias=None
+        self,
+        lin_weight,
+        _input,
+        target,
+        bias=None,
+        ref_input=None,
+        ref_weight=None,
+        ref_bias=None,
     ):
         return LigerFusedLinearDPOFunction.apply(
             _input,
             lin_weight,
             target,
             bias,
+            ref_input,
             ref_weight,
             ref_bias,
             self.ignore_index,

liger_kernel/chunked_loss/fused_linear_preference.py CHANGED Viewed

@@ -29,7 +29,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         compute_nll_loss=True,
         compiled=True,
         use_ref_model=False,
-        # TODO: ref input
+        ref_input=None,
         ref_weight=None,
         ref_bias=None,
         **loss_kwargs,
@@ -97,20 +97,26 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             **loss_kwargs,
         )
-        def fused_fwd_bwd(input_chunk, target_chunk):
+        def fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk):
             """
             Fused forward and backward pass for a chunk of input and target.
             """
             if bias is not None:
                 return torch.func.grad_and_value(
                     compute_loss, argnums=(0, 1, 3), has_aux=True
-                )(input_chunk, weight, target_chunk, bias)
+                )(
+                    input_chunk,
+                    weight,
+                    target_chunk,
+                    bias,
+                    ref_input_chunk=ref_input_chunk,
+                )
             else:
                 return torch.func.grad_and_value(
                     compute_loss, argnums=(0, 1), has_aux=True
-                )(input_chunk, weight, target_chunk)
+                )(input_chunk, weight, target_chunk, ref_input_chunk=ref_input_chunk)
-        def accumulate_chunk(input_chunk, target_chunk):
+        def accumulate_chunk(input_chunk, target_chunk, ref_input_chunk=None):
             if bias is not None:
                 (chunk_grad_input, chunk_grad_weight, chunk_grad_bias), (
                     chunk_loss,
@@ -122,7 +128,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                         chunk_nll_loss,
                         *aux_outputs,
                     ),
-                ) = fused_fwd_bwd(input_chunk, target_chunk)
+                ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk)
                 grad_bias.add_(chunk_grad_bias)  # accumulate bias gradient
             else:
                 (chunk_grad_input, chunk_grad_weight), (
@@ -135,7 +141,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                         chunk_nll_loss,
                         *aux_outputs,
                     ),
-                ) = fused_fwd_bwd(input_chunk, target_chunk)
+                ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk)
             # Accumulate gradients
             grad_weight.add_(chunk_grad_weight)
@@ -182,18 +188,43 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         _rejected_input_chunks = torch.chunk(_input[len_chosen:], chunks=chunks, dim=0)
         _rejected_target_chunks = torch.chunk(target[len_chosen:], chunks=chunks, dim=0)
+        if use_ref_model:
+            _ref_chosen_input_chunks = torch.chunk(
+                ref_input[:len_chosen], chunks=chunks, dim=0
+            )
+            _ref_rejected_input_chunks = torch.chunk(
+                ref_input[len_chosen:], chunks=chunks, dim=0
+            )
         for (
             chosen_input_chunk,
             rejected_input_chunk,
             chosen_target_chunk,
             rejected_target_chunk,
+            ref_chosen_input_chunk,
+            ref_rejected_input_chunk,
         ) in zip(
             _chosen_input_chunks,
             _rejected_input_chunks,
             _chosen_target_chunks,
             _rejected_target_chunks,
+            (
+                _ref_chosen_input_chunks
+                if use_ref_model
+                else [None] * len(_chosen_input_chunks)
+            ),
+            (
+                _ref_rejected_input_chunks
+                if use_ref_model
+                else [None] * len(_rejected_input_chunks)
+            ),
         ):
             input_chunk = torch.cat([chosen_input_chunk, rejected_input_chunk], dim=0)
+            ref_input_chunk = (
+                torch.cat([ref_chosen_input_chunk, ref_rejected_input_chunk], dim=0)
+                if use_ref_model
+                else None
+            )
             target_chunk = torch.cat(
                 [chosen_target_chunk, rejected_target_chunk], dim=0
             )
@@ -202,9 +233,10 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             torch._dynamo.mark_dynamic(input_chunk, 1)
             torch._dynamo.mark_dynamic(target_chunk, 1)
             torch._dynamo.mark_dynamic(target, 1)
+            torch._dynamo.mark_dynamic(ref_input_chunk, 1) if use_ref_model else None
             # accumulate loss, gradients, and metrics
-            accumulate_chunk(input_chunk, target_chunk)
+            accumulate_chunk(input_chunk, target_chunk, ref_input_chunk)
         # combine grad_chosen_inputs and grad_rejected_inputs
         grad_inputs = grad_chosen_inputs + grad_rejected_inputs
@@ -301,6 +333,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         beta=0.1,
         compute_nll_loss=True,
         use_ref_model=False,
+        ref_input_chunk=None,
         ref_weight=None,
         ref_bias=None,
         **loss_kwargs,
@@ -357,7 +390,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                     ref_rejected_logits,
                     ref_chosen_nll_loss,
                 ) = LigerFusedLinearPreferenceBase.chunk_forward(
-                    input_chunk,
+                    ref_input_chunk,
                     ref_weight,
                     target_chunk,
                     ref_bias,

{liger_kernel_nightly-0.5.2.dev20241212030605.dist-info → liger_kernel_nightly-0.5.2.dev20241212033924.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.2.dev20241212030605
+Version: 0.5.2.dev20241212033924
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.2.dev20241212030605.dist-info → liger_kernel_nightly-0.5.2.dev20241212033924.dist-info}/RECORD RENAMED Viewed

@@ -4,10 +4,10 @@ liger_kernel/utils.py,sha256=HJa-xVKOohDn6pLVIx-Fv0V9h0QAL3qZGQNRICI-OpI,249
 liger_kernel/chunked_loss/README.md,sha256=K6rucm6nqHpWCmxUOhBYcE3apwQxAy0TfRUippR7Icw,2243
 liger_kernel/chunked_loss/__init__.py,sha256=R2wCcz4Y0kTAve926DH3k182XKezpXeACMHj05g9Mm8,346
 liger_kernel/chunked_loss/cpo_loss.py,sha256=Qu1Ul2A12sp6CqIT-atPbHWFb_LLtINEA9mOpIRx_0g,3097
-liger_kernel/chunked_loss/dpo_loss.py,sha256=H9_RRhclckHYM2sd75tgbnf8IxC_PU2JCALbgtPQvwc,4222
+liger_kernel/chunked_loss/dpo_loss.py,sha256=9S67SzKkLyoBmHGx8bkmthSNHlCT2ikBy9CCdb7wGj0,4381
 liger_kernel/chunked_loss/functional.py,sha256=9Gr-YXIuEzEJkBUhDx3G2fuQayckLor7cC7svhmPML4,549
 liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=2BH6DCPjsR2zS6zcwFPcIIZRhLF8SohjGdKsAJ_301o,10222
-liger_kernel/chunked_loss/fused_linear_preference.py,sha256=vlWfaaIECWvCQhY9PM7zRI0vKThIrydMf6P44bXn1EE,15114
+liger_kernel/chunked_loss/fused_linear_preference.py,sha256=AsovMdfsOjgWVxtDhZ_rXqpahMsKTg8YueXnZcHt1XQ,16376
 liger_kernel/chunked_loss/orpo_loss.py,sha256=ZuKGjbkIYzV4UzvupNdq6vyxCp7-BztQkUt8ZnFvKos,3531
 liger_kernel/chunked_loss/simpo_loss.py,sha256=Wa4LOlDG9PbJkOOkKg8hbKvnKgg7OTBz6-qIkwPK1yw,3275
 liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -58,9 +58,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=c4OQVJmhNOloj0JYSEc0j_cQuBb
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=jko6oq_XQdBSmXubp05E-_YXOyhtB5Bj75dg5YNwOsE,7517
 liger_kernel/triton/__init__.py,sha256=yfRe0zMb47QnqjecZWG7LnanfCTzeku7SgWRAwNVmzU,101
 liger_kernel/triton/monkey_patch.py,sha256=5BcGKTtdqeYchypBIBopGIWPx1-cFALz7sOKoEsqXJ0,1584
-liger_kernel_nightly-0.5.2.dev20241212030605.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.5.2.dev20241212030605.dist-info/METADATA,sha256=PeHGuXRXme-T4S249Fh6IWDCNH2-DMWzhyrs2i9MiyE,20260
-liger_kernel_nightly-0.5.2.dev20241212030605.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.5.2.dev20241212030605.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-liger_kernel_nightly-0.5.2.dev20241212030605.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.5.2.dev20241212030605.dist-info/RECORD,,
+liger_kernel_nightly-0.5.2.dev20241212033924.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.5.2.dev20241212033924.dist-info/METADATA,sha256=ayx2_ON0TY-xC2ba0fpG3x5Vgx5b_SQCIRx-qw455u8,20260
+liger_kernel_nightly-0.5.2.dev20241212033924.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.5.2.dev20241212033924.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+liger_kernel_nightly-0.5.2.dev20241212033924.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.5.2.dev20241212033924.dist-info/RECORD,,

{liger_kernel_nightly-0.5.2.dev20241212030605.dist-info → liger_kernel_nightly-0.5.2.dev20241212033924.dist-info}/LICENSE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.2.dev20241212030605.dist-info → liger_kernel_nightly-0.5.2.dev20241212033924.dist-info}/NOTICE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.2.dev20241212030605.dist-info → liger_kernel_nightly-0.5.2.dev20241212033924.dist-info}/WHEEL RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.2.dev20241212030605.dist-info → liger_kernel_nightly-0.5.2.dev20241212033924.dist-info}/top_level.txt RENAMED Viewed

File without changes

liger-kernel-nightly 0.5.2.dev20241212030605__py3-none-any.whl → 0.5.2.dev20241212033924__py3-none-any.whl

liger-kernel-nightly 0.5.2.dev20241212030605__py3-none-any.whl → 0.5.2.dev20241212033924__py3-none-any.whl