PyPI - liger-kernel-nightly - Versions diffs - 0.6.1.dev20250728225847__tar.gz → 0.6.1.dev20250730201750__tar.gz - Mend

liger-kernel-nightly 0.6.1.dev20250728225847__tar.gz → 0.6.1.dev20250730201750__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (279) hide show

{liger_kernel_nightly-0.6.1.dev20250728225847 → liger_kernel_nightly-0.6.1.dev20250730201750}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.6.1.dev20250728225847
+Version: 0.6.1.dev20250730201750
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.6.1.dev20250728225847 → liger_kernel_nightly-0.6.1.dev20250730201750}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py RENAMED Viewed

@@ -34,10 +34,12 @@ class TorchLMHeadCE(torch.nn.Module):
 class LigerLMHeadCE(torch.nn.Module):
-    def __init__(self, H: int, V: int, dtype: torch.dtype, ignore_index: int = -100):
+    def __init__(self, H: int, V: int, dtype: torch.dtype, ignore_index: int = -100, accum_dtype=None):
         super().__init__()
         self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype)
-        self.ce_loss = LigerFusedLinearCrossEntropyLoss(ignore_index=ignore_index, reduction="mean")
+        self.ce_loss = LigerFusedLinearCrossEntropyLoss(
+            ignore_index=ignore_index, reduction="mean", accum_dtype=accum_dtype
+        )
     def forward(self, x, y):
         return self.ce_loss(self.lin.weight, x, y)
@@ -59,6 +61,7 @@ def bench_memory_fused_linear_cross_entropy(
     torch_lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
     liger_lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
+    liger_lm_head_ce_fp32_accum = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device)
     _input = torch.randn(BT, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (BT, 1), dtype=torch.long, device=device).squeeze(1)
@@ -66,6 +69,8 @@ def bench_memory_fused_linear_cross_entropy(
     def fwd():
         if provider == "liger":
             return liger_lm_head_ce(_input, target)
+        elif provider == "liger-fp32-accum":
+            return liger_lm_head_ce_fp32_accum(_input, target)
         elif provider == "huggingface":
             return torch_lm_head_ce(_input, target)
@@ -98,6 +103,7 @@ def bench_speed_fused_linear_cross_entropy(
     torch_lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
     liger_lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
+    liger_lm_head_ce_fp32_accum = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device)
     _input = torch.randn(BT, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (BT, 1), dtype=torch.long, device=device).squeeze(1)
@@ -105,6 +111,8 @@ def bench_speed_fused_linear_cross_entropy(
     def fwd():
         if provider == "liger":
             return liger_lm_head_ce(_input, target)
+        elif provider == "liger-fp32-accum":
+            return liger_lm_head_ce_fp32_accum(_input, target)
         elif provider == "huggingface":
             return torch_lm_head_ce(_input, target)
@@ -149,7 +157,7 @@ if __name__ == "__main__":
         "x_name": "BT",
         "x_label": "B x T",
         "x_values": [2**i for i in range(12, 16)],
-        "kernel_providers": ["liger", "huggingface"],
+        "kernel_providers": ["liger", "liger-fp32-accum", "huggingface"],
         "extra_benchmark_configs": [{"H": 4096, "V": 128256, "mode": "forward", "dtype": torch.bfloat16}],
         "overwrite": args.overwrite,
     }

{liger_kernel_nightly-0.6.1.dev20250728225847 → liger_kernel_nightly-0.6.1.dev20250730201750}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "liger_kernel_nightly"
-version = "0.6.1.dev20250728225847"
+version = "0.6.1.dev20250730201750"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

{liger_kernel_nightly-0.6.1.dev20250728225847 → liger_kernel_nightly-0.6.1.dev20250730201750}/src/liger_kernel/chunked_loss/dpo_loss.py RENAMED Viewed

@@ -13,6 +13,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         ref_chosen_logps=None,
         ref_rejected_logps=None,
         beta=0.1,
+        loss_type="sigmoid",
     ):
         """
         Paper: https://arxiv.org/pdf/2305.18290
@@ -48,8 +49,50 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         chosen_rewards = beta * chosen_logratios
         rejected_rewards = beta * rejected_logratios
-        logits_diff = beta * (chosen_logratios - rejected_logratios)
-        loss = -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2)
+        if loss_type == "sigmoid":
+            logits_diff = beta * (chosen_logratios - rejected_logratios)
+            loss = -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2)
+        elif loss_type == "apo_zero":
+            # Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266)
+            # Use this loss when you believe the chosen outputs are better than your model's default output
+            losses_chosen = 1 - F.sigmoid(beta * chosen_logratios)  # Increase chosen likelihood
+            losses_rejected = F.sigmoid(beta * rejected_logratios)
+            losses = losses_chosen + losses_rejected
+            loss = losses.sum() / (full_target.shape[0] // 2)
+        elif loss_type == "apo_down":
+            # Eqn (8) of the APO paper (https://huggingface.co/papers/2408.06266)
+            # Use this loss when you believe the chosen outputs are worse than your model's default output.
+            # Decrease chosen likelihood and decrease rejected likelihood more
+            losses_chosen = F.sigmoid(beta * chosen_logratios)
+            losses_rejected = 1 - F.sigmoid(beta * (chosen_logratios - rejected_logratios))
+            losses = losses_chosen + losses_rejected
+            loss = losses.sum() / (full_target.shape[0] // 2)
+        elif loss_type == "sppo_hard":
+            # In the paper (https://huggingface.co/papers/2405.00675), SPPO employs a soft probability approach,
+            # estimated using the PairRM score. The probability calculation is conducted outside of the trainer class.
+            # The version described here is the hard probability version, where P in Equation (4.7) of Algorithm 1 is
+            # set to 1 for the winner and 0 for the loser.
+            a = chosen_logps - ref_chosen_logps
+            b = rejected_logps - ref_rejected_logps
+            losses = (a - 0.5 / beta) ** 2 + (b + 0.5 / beta) ** 2
+            loss = losses.sum() / (full_target.shape[0] // 2)
+        elif loss_type == "nca_pair":
+            losses = (
+                -F.logsigmoid(chosen_rewards)
+                - 0.5 * F.logsigmoid(-chosen_rewards)
+                - 0.5 * F.logsigmoid(-rejected_rewards)
+            )
+            loss = losses.sum() / (full_target.shape[0] // 2)
+        else:
+            raise ValueError(
+                f"Unsupported loss_type: {loss_type}. Supported types are: sigmoid, apo_zero, apo_down, sppo_hard, nca_pair"
+            )
         return loss, chosen_rewards, rejected_rewards
     @classmethod
@@ -70,6 +113,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         use_ref_model=True,
         average_log_prob=False,
         chunk_size=1,
+        loss_type="sigmoid",
     ):
         """
         Fused linear layer with DPO loss.
@@ -108,12 +152,13 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
             ref_bias=ref_bias,
             average_log_prob=average_log_prob,
             chunk_size=chunk_size,
+            loss_type=loss_type,
         )
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None, None, None, None, None, None, None
+        return *grads, None, None, None, None, None, None, None, None, None, None, None
 class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -130,6 +175,7 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         use_ref_model: bool = True,
         average_log_prob: bool = False,
         chunk_size: int = 1,
+        loss_type: str = "sigmoid",
     ):
         """
         Args:
@@ -149,6 +195,10 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         self.use_ref_model = use_ref_model
         self.average_log_prob = average_log_prob
         self.chunk_size = chunk_size
+        self.loss_type = loss_type
+        supported_loss_types = {"sigmoid", "apo_zero", "apo_down", "sppo_hard", "nca_pair"}
+        if self.loss_type not in supported_loss_types:
+            raise ValueError(f"Unsupported loss_type: {self.loss_type}. Supported types are: {supported_loss_types}")
     def forward(
         self,
@@ -175,4 +225,5 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
             self.use_ref_model,
             self.average_log_prob,
             self.chunk_size,
+            self.loss_type,
         )

{liger_kernel_nightly-0.6.1.dev20250728225847 → liger_kernel_nightly-0.6.1.dev20250730201750}/src/liger_kernel/ops/fused_linear_cross_entropy.py RENAMED Viewed

@@ -25,6 +25,7 @@ def fused_linear_cross_entropy_forward(
     reduction="mean",
     softcap=None,
     return_z_loss=False,
+    accum_dtype=None,
 ):
     assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
     device = _input.device
@@ -44,10 +45,16 @@ def fused_linear_cross_entropy_forward(
     chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor))  # (BT + inc_factor - 1) // inc_factor
     num_chunks = triton.cdiv(BT, chunk_size)  # (BT + chunk_size - 1) // chunk_size
-    grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
     grad_input = torch.zeros_like(_input, device=device)
-    grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
-    # we use fp32 for loss accumulator
+    # we use fp32 for loss and gradients accumulator
+    if accum_dtype is None:
+        grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
+        grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
+    else:
+        grad_weight = torch.zeros_like(weight, dtype=accum_dtype, device=device) if weight.requires_grad else None
+        grad_bias = torch.zeros_like(bias, dtype=accum_dtype, device=device) if bias is not None else None
     loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
     z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None
@@ -124,16 +131,7 @@ def fused_linear_cross_entropy_forward(
         grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
         if grad_weight is not None:
-            torch.addmm(
-                input=grad_weight,
-                mat1=logits_chunk.t().to(
-                    _input_chunk.dtype
-                ),  # In an autocast scenario without bias, differing logits_chunk data types will cause an addmm operation error.
-                mat2=_input_chunk,
-                out=grad_weight,
-                alpha=1.0,
-                beta=1.0,
-            )
+            grad_weight += torch.mm(grad_logits_chunk.t(), _input_chunk).float()
         if bias is not None:
             torch.add(
@@ -151,6 +149,11 @@ def fused_linear_cross_entropy_forward(
     else:
         loss = torch.sum(loss_1d)
         z_loss = torch.sum(z_loss_1d) if return_z_loss else None
+    # Cast back to original dtype
+    grad_weight = grad_weight.to(weight.dtype) if grad_weight is not None else None
+    grad_bias = grad_bias.to(bias.dtype) if grad_bias is not None else None
     return loss, z_loss, grad_input, grad_weight, grad_bias
@@ -217,6 +220,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         reduction="mean",
         softcap=None,
         return_z_loss: bool = False,
+        accum_dtype=None,
     ):
         """
         Fusing the last linear layer with cross-entropy loss
@@ -235,6 +239,8 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         ignore_index: the index to ignore in the target
         label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
         reduction: reduction to apply
+        accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations.
+            Recommended to set `accum_dtype` to higher precision, e.g. `torch.float32`, if the training is unstable with original dtype. Default: `None`, performing accumulations in original dtype
         """
         loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
@@ -249,6 +255,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
             reduction=reduction,
             softcap=softcap,
             return_z_loss=return_z_loss,
+            accum_dtype=accum_dtype,
         )
         # downcast to dtype and store for backward
         ctx.save_for_backward(
@@ -280,4 +287,5 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
             None,
             None,
             None,
+            None,
         )

{liger_kernel_nightly-0.6.1.dev20250728225847 → liger_kernel_nightly-0.6.1.dev20250730201750}/src/liger_kernel/transformers/functional.py RENAMED Viewed

@@ -64,6 +64,7 @@ def liger_fused_linear_cross_entropy(
     reduction: str = "mean",
     softcap: Optional[float] = None,
     return_z_loss: bool = False,
+    accum_dtype=None,
 ):
     loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
         input,
@@ -77,6 +78,7 @@ def liger_fused_linear_cross_entropy(
         reduction,
         softcap,
         return_z_loss,
+        accum_dtype,
     )
     if not return_z_loss:
         return loss

{liger_kernel_nightly-0.6.1.dev20250728225847 → liger_kernel_nightly-0.6.1.dev20250730201750}/src/liger_kernel/transformers/fused_linear_cross_entropy.py RENAMED Viewed

@@ -15,6 +15,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
         reduction: str = "mean",
         softcap: Optional[float] = None,
         return_z_loss: bool = False,
+        accum_dtype: Optional[torch.dtype] = None,
     ):
         super().__init__()
         assert (label_smoothing >= 0) and (label_smoothing <= 1), (
@@ -32,6 +33,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
         self.reduction = reduction
         self.softcap = softcap
         self.return_z_loss = return_z_loss
+        self.accum_dtype = accum_dtype
     def forward(self, lin_weight, _input, target, bias=None):
         loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
@@ -46,6 +48,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
             self.reduction,
             self.softcap,
             self.return_z_loss,
+            self.accum_dtype,
         )
         if not self.return_z_loss:
             return loss

{liger_kernel_nightly-0.6.1.dev20250728225847 → liger_kernel_nightly-0.6.1.dev20250730201750}/src/liger_kernel_nightly.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.6.1.dev20250728225847
+Version: 0.6.1.dev20250730201750
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

liger-kernel-nightly 0.6.1.dev20250728225847__tar.gz → 0.6.1.dev20250730201750__tar.gz

liger-kernel-nightly 0.6.1.dev20250728225847__tar.gz → 0.6.1.dev20250730201750__tar.gz