PyPI - liger-kernel-nightly - Versions diffs - 0.5.2.dev20241229131950__py3-none-any.whl → 0.5.2.dev20250101082227__py3-none-any.whl - Mend

liger-kernel-nightly 0.5.2.dev20241229131950__py3-none-any.whl → 0.5.2.dev20250101082227__py3-none-any.whl

Files changed (7) hide show

liger_kernel/chunked_loss/fused_linear_distillation.py CHANGED Viewed

@@ -8,12 +8,15 @@ from torch.nn import functional as F
 class LigerFusedLinearDistillationBase(torch.autograd.Function):
     @abstractmethod
-    def distillation_loss_fn(student_logits, teacher_logits, temperature):
+    def distillation_loss_fn(
+        student_logits,
+        teacher_logits,
+    ):
         """
         Compute distillation loss.
         Args:
-            student_logits (torch.Tensor): Raw logits of student tokens. Shape: (batch_size * seq_len, vocab_size).
-            teacher_logits (torch.Tensor): Raw logits of teacher tokens. Shape: (batch_size * seq_len, vocab_size).
+            student_logits (torch.Tensor): Raw (temperature-scaled) logits of student tokens. Shape: (batch_size * seq_len, vocab_size).
+            teacher_logits (torch.Tensor): Raw (temperature-scaled) logits of teacher tokens. Shape: (batch_size * seq_len, vocab_size).
         """
         raise NotImplementedError("Distillation loss function must be implemented.")
@@ -65,7 +68,6 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
         distillation_loss_fn=None,
         full_target=None,
         ignore_index=-100,
-        temperature=1.0,
         weight_hard_loss=0.5,
         weight_soft_loss=0.5,
         compute_ce_loss=True,
@@ -107,7 +109,7 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
         hard_loss /= full_target.shape[0]
-        soft_loss = distillation_loss_fn(student_logits_chunk, teacher_logits_chunk, temperature)
+        soft_loss = distillation_loss_fn(student_logits_chunk, teacher_logits_chunk)
         soft_loss /= full_target.shape[0]
         loss = weight_hard_loss * hard_loss + weight_soft_loss * soft_loss
@@ -147,10 +149,11 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             teacher_bias (torch.Tensor, optional): Teacher bias tensor. Shape: (vocab_size,).
             loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
             chunk_size (int): Size of a chunk.
-            compute_ce_loss (bool): Whether to compute CE loss.
             ignore_index (int): Index to ignore for loss computation.
             weight_hard_loss (float): Weight for hard/task loss.
             weight_soft_loss (float): Weight for soft/distillation loss.
+            compute_ce_loss (bool): Whether to compute CE loss.
+            temperature (float): Temperature to control the input probability distribution. Default: `1.0` (i.e. no scale)
             compiled (bool): Whether to use torch compile for chunk accumulation.
             loss_kwargs (dict): Other possible arguments that a loss function might need
         """
@@ -168,7 +171,6 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             weight_hard_loss=weight_hard_loss,
             weight_soft_loss=weight_soft_loss,
             compute_ce_loss=compute_ce_loss,
-            temperature=temperature,
             **loss_kwargs,
         )
@@ -223,6 +225,9 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
         if compiled:
             accumulate_chunk = torch.compile(accumulate_chunk)
+        student_input /= temperature
+        teacher_input /= temperature
         num_chunks = max(1, student_input.shape[0] // CHUNK_SIZE)
         _student_input_chunks = torch.chunk(student_input, chunks=num_chunks, dim=0)
         _teacher_input_chunks = torch.chunk(teacher_input, chunks=num_chunks, dim=0)

{liger_kernel_nightly-0.5.2.dev20241229131950.dist-info → liger_kernel_nightly-0.5.2.dev20250101082227.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.2.dev20241229131950
+Version: 0.5.2.dev20250101082227
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.2.dev20241229131950.dist-info → liger_kernel_nightly-0.5.2.dev20250101082227.dist-info}/RECORD RENAMED Viewed

@@ -6,7 +6,7 @@ liger_kernel/chunked_loss/__init__.py,sha256=R2wCcz4Y0kTAve926DH3k182XKezpXeACMH
 liger_kernel/chunked_loss/cpo_loss.py,sha256=L4Nk38Xh5Yfhah3Vsc_sN_Q75FWt1LA-xNNXzsK8iPM,3516
 liger_kernel/chunked_loss/dpo_loss.py,sha256=VYZMOafdvE8xlhvTtwjrz81tIzxR1mHF4lXdsADnIQg,4373
 liger_kernel/chunked_loss/functional.py,sha256=9Gr-YXIuEzEJkBUhDx3G2fuQayckLor7cC7svhmPML4,549
-liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=M-QWvGPnWefYDn6Hr9bPn7diMNP5qrUaeWTb_zdMO4E,10265
+liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=uQtwtu-kaUZJTjNhAnIr3O794oUlUZ98XR5shYtwP5k,10440
 liger_kernel/chunked_loss/fused_linear_preference.py,sha256=25sTgvphLKAR0jyJcrsJPKK1abFpTKrajSyAx8nJ3bc,16134
 liger_kernel/chunked_loss/orpo_loss.py,sha256=jbZxx-EjPK71A6CSyNzTOAIEQgAUjfvwSViw6R_pPXQ,3510
 liger_kernel/chunked_loss/simpo_loss.py,sha256=ZvDIjT9EQrbwzH2LNZMhv84SPsOHGi_Ywk95vgA0b_o,3736
@@ -58,9 +58,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=MId1S_MfA3pPVQA1rkiKxp-jZDNz8VmvZzXC-Kugol4,7662
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/METADATA,sha256=iOyPsdNf1GL3Z3Ng0CS3xoOq6iiTb8eFXAMwqDT1UZM,21055
-liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/RECORD,,
+liger_kernel_nightly-0.5.2.dev20250101082227.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.5.2.dev20250101082227.dist-info/METADATA,sha256=gNuR5mtVV7fQsT0qPLr3_Ok2WLKHgbC2FidkcY1q6OA,21055
+liger_kernel_nightly-0.5.2.dev20250101082227.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.5.2.dev20250101082227.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+liger_kernel_nightly-0.5.2.dev20250101082227.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.5.2.dev20250101082227.dist-info/RECORD,,

{liger_kernel_nightly-0.5.2.dev20241229131950.dist-info → liger_kernel_nightly-0.5.2.dev20250101082227.dist-info}/LICENSE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.2.dev20241229131950.dist-info → liger_kernel_nightly-0.5.2.dev20250101082227.dist-info}/NOTICE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.2.dev20241229131950.dist-info → liger_kernel_nightly-0.5.2.dev20250101082227.dist-info}/WHEEL RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.2.dev20241229131950.dist-info → liger_kernel_nightly-0.5.2.dev20250101082227.dist-info}/top_level.txt RENAMED Viewed

File without changes

liger-kernel-nightly 0.5.2.dev20241229131950__py3-none-any.whl → 0.5.2.dev20250101082227__py3-none-any.whl

liger-kernel-nightly 0.5.2.dev20241229131950__py3-none-any.whl → 0.5.2.dev20250101082227__py3-none-any.whl