liger-kernel-nightly 0.6.2.dev20250822000312__py3-none-any.whl → 0.6.2.dev20250822031344__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as published in their respective public registries.
- liger_kernel/ops/fused_linear_cross_entropy.py +41 -1
- liger_kernel/transformers/functional.py +2 -0
- liger_kernel/transformers/fused_linear_cross_entropy.py +3 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312.dist-info → liger_kernel_nightly-0.6.2.dev20250822031344.dist-info}/METADATA +1 -1
- {liger_kernel_nightly-0.6.2.dev20250822000312.dist-info → liger_kernel_nightly-0.6.2.dev20250822031344.dist-info}/RECORD +9 -9
- {liger_kernel_nightly-0.6.2.dev20250822000312.dist-info → liger_kernel_nightly-0.6.2.dev20250822031344.dist-info}/LICENSE +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312.dist-info → liger_kernel_nightly-0.6.2.dev20250822031344.dist-info}/NOTICE +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312.dist-info → liger_kernel_nightly-0.6.2.dev20250822031344.dist-info}/WHEEL +0 -0
- {liger_kernel_nightly-0.6.2.dev20250822000312.dist-info → liger_kernel_nightly-0.6.2.dev20250822031344.dist-info}/top_level.txt +0 -0
liger_kernel/ops/fused_linear_cross_entropy.py: adds a `use_token_scaling` flag to the fused forward pass.

```diff
@@ -26,6 +26,7 @@ def fused_linear_cross_entropy_forward(
     softcap=None,
     return_z_loss=False,
     accum_dtype=None,
+    use_token_scaling=False,
 ):
     assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
     device = _input.device
```
```diff
@@ -89,6 +90,23 @@ def fused_linear_cross_entropy_forward(
 
         n_rows = logits_chunk.shape[0]
 
+        # Compute predicted probabilities for token scaling if needed
+        if use_token_scaling:
+            # Compute softmax probabilities for scaling
+            # We need to compute this before the cross entropy kernel modifies logits_chunk
+            logits_for_softmax = logits_chunk.detach().clone()  # Detach to avoid gradient flow
+            if softcap is not None:
+                logits_for_softmax = softcap * torch.tanh(logits_for_softmax / softcap)
+
+            # Compute softmax to get predicted probabilities
+            probs = torch.softmax(logits_for_softmax, dim=-1)
+
+            # Get the predicted probability for each target token
+            pred_probs = torch.gather(probs, -1, target_chunk.unsqueeze(-1)).squeeze(-1)
+
+            # Store the scaling factors
+            scaling_factors = pred_probs.detach()  # Detach to ensure no gradient flow
+
         # unreduced loss
         loss_1d_slice = loss_1d[start_idx:end_idx]  # chunk_size,
         z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None
```
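As a reference for what the added lines compute, here is a minimal eager-PyTorch sketch (not part of the package; `logits_chunk` and `target_chunk` are stand-ins for the kernel's per-chunk tensors, shaped chunk_size × V and chunk_size respectively):

```python
import torch

def compute_scaling_factors(logits_chunk, target_chunk, softcap=None):
    # Work on a detached copy so no gradient ever flows through the scale
    logits = logits_chunk.detach().clone()
    if softcap is not None:
        logits = softcap * torch.tanh(logits / softcap)
    # Predicted probability of each token's true class
    probs = torch.softmax(logits, dim=-1)
    return torch.gather(probs, -1, target_chunk.unsqueeze(-1)).squeeze(-1)
```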
```diff
@@ -123,11 +141,23 @@ def fused_linear_cross_entropy_forward(
             num_warps=32 if not is_hip() else 16,
         )
 
+        # Apply token scaling if requested
+        if use_token_scaling:
+            loss_1d_slice = loss_1d_slice * scaling_factors
+            if return_z_loss:
+                z_loss_1d_slice = z_loss_1d_slice * scaling_factors
+
         loss_1d[start_idx:end_idx] = loss_1d_slice
         if return_z_loss:
             z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
         grad_logits_chunk = logits_chunk  # chunk_size x V
 
+        # Apply token scaling to gradients if requested
+        if use_token_scaling:
+            # Expand scaling factors to match gradient dimensions
+            scaling_factors_expanded = scaling_factors.unsqueeze(-1)  # chunk_size x 1
+            grad_logits_chunk = grad_logits_chunk * scaling_factors_expanded
+
         grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
 
         if grad_weight is not None:
```
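Because the forward pass materializes `grad_logits_chunk` by hand instead of relying on autograd, the scaling has to be applied to the gradient explicitly as well. A standalone sanity check (not from the package) confirms the two views agree: multiplying a per-token loss by a detached factor scales its logits gradient by exactly that factor.

```python
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10, requires_grad=True)
target = torch.randint(0, 10, (4,))

# Detached predicted probability of each true class, as in the hunks above
scale = torch.softmax(logits, dim=-1).gather(-1, target.unsqueeze(-1)).squeeze(-1).detach()
(F.cross_entropy(logits, target, reduction="none") * scale).sum().backward()

# Reference: the same loss without scaling
ref = logits.detach().clone().requires_grad_(True)
F.cross_entropy(ref, target, reduction="none").sum().backward()

# The scaled loss's gradient is the unscaled gradient times the per-token factor
assert torch.allclose(logits.grad, ref.grad * scale.unsqueeze(-1))
```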
```diff
@@ -136,7 +166,7 @@ def fused_linear_cross_entropy_forward(
         if bias is not None:
             torch.add(
                 input=grad_bias,
-                other=logits_chunk.sum(dim=0),
+                other=grad_logits_chunk.sum(dim=0),
                 out=grad_bias,
                 alpha=1.0,
             )
```
```diff
@@ -146,6 +176,10 @@ def fused_linear_cross_entropy_forward(
     # loss = loss_1d
     # z_loss = z_loss_1d if return_z_loss else None
 
+    if reduction == "none":
+        # Return per-token losses
+        loss = loss_1d
+        z_loss = z_loss_1d if return_z_loss else None
     else:
         loss = torch.sum(loss_1d)
         z_loss = torch.sum(z_loss_1d) if return_z_loss else None
```
```diff
@@ -221,6 +255,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         softcap=None,
         return_z_loss: bool = False,
         accum_dtype=None,
+        use_token_scaling: bool = False,
     ):
         """
         Fusing the last linear layer with cross-entropy loss
```
```diff
@@ -241,6 +276,9 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         reduction: reduction to apply
         accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations.
             Recommended to set `accum_dtype` to higher precision, e.g. `torch.float32`, if the training is unstable with original dtype. Default: `None`, performing accumulations in original dtype
+        use_token_scaling (bool): whether to scale each token's loss by its predicted probability (detached).
+            When True, each token's loss is multiplied by the model's predicted probability for that token's true class.
+            Default: False.
         """
 
         loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
```
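In symbols, as the docstring describes it, the flag changes the per-token objective to

$$\ell_i = \operatorname{sg}\big[\operatorname{softmax}(z_i)_{y_i}\big] \cdot \mathrm{CE}(z_i, y_i)$$

where $z_i$ are the (optionally soft-capped) logits for token $i$, $y_i$ is its target class, and $\operatorname{sg}[\cdot]$ is stop-gradient (the detach in the forward hunk above). Note the weighting runs opposite to focal loss: tokens the model already predicts confidently contribute more, not less.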
```diff
@@ -256,6 +294,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
             softcap=softcap,
             return_z_loss=return_z_loss,
             accum_dtype=accum_dtype,
+            use_token_scaling=use_token_scaling,
         )
         # downcast to dtype and store for backward
         ctx.save_for_backward(
```
```diff
@@ -288,4 +327,5 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
             None,
             None,
             None,
+            None,  # use_token_scaling
         )
```
liger_kernel/transformers/functional.py: threads the flag through the functional wrapper.

```diff
@@ -65,6 +65,7 @@ def liger_fused_linear_cross_entropy(
     softcap: Optional[float] = None,
     return_z_loss: bool = False,
     accum_dtype=None,
+    use_token_scaling: bool = False,
 ):
     loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
         input,
```
```diff
@@ -79,6 +80,7 @@ def liger_fused_linear_cross_entropy(
         softcap,
         return_z_loss,
         accum_dtype,
+        use_token_scaling,
     )
     if not return_z_loss:
         return loss
```
liger_kernel/transformers/fused_linear_cross_entropy.py: exposes the flag on the `LigerFusedLinearCrossEntropyLoss` module.

```diff
@@ -16,6 +16,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
         softcap: Optional[float] = None,
         return_z_loss: bool = False,
         accum_dtype: Optional[torch.dtype] = None,
+        use_token_scaling: bool = False,
     ):
         super().__init__()
         assert (label_smoothing >= 0) and (label_smoothing <= 1), (
```
```diff
@@ -34,6 +35,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
         self.softcap = softcap
         self.return_z_loss = return_z_loss
         self.accum_dtype = accum_dtype
+        self.use_token_scaling = use_token_scaling
 
     def forward(self, lin_weight, _input, target, bias=None):
         loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
```
```diff
@@ -49,6 +51,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
             self.softcap,
             self.return_z_loss,
             self.accum_dtype,
+            self.use_token_scaling,
         )
         if not self.return_z_loss:
             return loss
```
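Putting it together, a hedged usage sketch of the module API: the constructor kwarg and the `forward(lin_weight, _input, target, bias=None)` signature come from the hunks above, while the shapes and the CUDA device are illustrative assumptions (the underlying kernels are Triton-based and expect a GPU).

```python
import torch

from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss

loss_fn = LigerFusedLinearCrossEntropyLoss(use_token_scaling=True)

hidden, vocab, tokens = 64, 128, 16  # illustrative sizes, not from the package
lin_weight = torch.randn(vocab, hidden, device="cuda", requires_grad=True)
_input = torch.randn(tokens, hidden, device="cuda", requires_grad=True)
target = torch.randint(0, vocab, (tokens,), device="cuda")

loss = loss_fn(lin_weight, _input, target)  # per-token losses are scaled before reduction
loss.backward()
```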
liger_kernel_nightly-0.6.2.dev20250822000312.dist-info/RECORD → liger_kernel_nightly-0.6.2.dev20250822031344.dist-info/RECORD: updated hashes for the changed files and renamed dist-info entries.

```diff
@@ -20,7 +20,7 @@ liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 liger_kernel/ops/cross_entropy.py,sha256=e8THGnhOcy_0SbOLABx67HEM7-B8a8pG7nDKbCRpQKM,19123
 liger_kernel/ops/dyt.py,sha256=gCLz4S8aul8SY9nvIGaoK67aGb7U9MJRQdo3ONqmQYs,5417
 liger_kernel/ops/fused_add_rms_norm.py,sha256=UBqmlqFCmhSAIpkNKd8rrfXatX7Z4J9bp2dX9A0lrJQ,14017
-liger_kernel/ops/fused_linear_cross_entropy.py,sha256=
+liger_kernel/ops/fused_linear_cross_entropy.py,sha256=AIlKMOnM3J7ZeAgPP1uvA3T4OIeRkz6TTr_Lg9XgZGY,13581
 liger_kernel/ops/fused_linear_jsd.py,sha256=CSoprxb-YcJy-YUKiTcYkxN8sb9h2kdk_iHuncvSV5c,9683
 liger_kernel/ops/fused_neighborhood_attention.py,sha256=vPi5xbnh6wxyZehaqo6Tuilqo2fN5SGDiONjnNmIKqs,35556
 liger_kernel/ops/geglu.py,sha256=r0WSq9E93zzynL44Wh8femzOWK07_SseBM_pJUyxT3s,4144
```
```diff
@@ -46,9 +46,9 @@ liger_kernel/transformers/auto_model.py,sha256=0qCTRZt280Bj_LcFdzo9hlaR-BWNazawX
 liger_kernel/transformers/cross_entropy.py,sha256=z3KTWQnFxr_IZaVjtYt0ZNEWQdDdYThN35xWkHlDGH0,1683
 liger_kernel/transformers/dyt.py,sha256=i-4GPaMrl-jab9TVI5qN0-H9qycn_mCbV82ozU4nbmU,723
 liger_kernel/transformers/fsdp.py,sha256=CUiyjTmjkjY7pLXQv8ly9rnzgXw6529csd9pvtJNMYc,3096
-liger_kernel/transformers/functional.py,sha256
+liger_kernel/transformers/functional.py,sha256=-vpz95wbv5wLpInjSG06KNHETsEgKnRIiV-lMYHVs68,7841
 liger_kernel/transformers/fused_add_rms_norm.py,sha256=7_Bzg-x6lLe6W1qG2DtjDALhEpNZlC6N5GppEs9cTYY,1199
-liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=
+liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=ZMxkiJzGz1KtqgAdsqPODq3bugHBx_80kPYcd5z-xmM,1990
 liger_kernel/transformers/fused_linear_jsd.py,sha256=bZ4otCvWBuOnA5XdQL-FzZVItJlDt-ht9e_pG7PG93E,3999
 liger_kernel/transformers/fused_neighborhood_attention.py,sha256=TxYDUAt9B6WSP14aJP66C_2Mbds2sSIPGnamhUSTrC8,7957
 liger_kernel/transformers/geglu.py,sha256=mrgqzIUVd6lN7fkDKLkw5YaESDxDtFgbot430WwPVOQ,1107
```
```diff
@@ -96,9 +96,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=tX0h63aOFe3rNqTmk6JpMf75UPo981yzEa6TghnjS0Q,5370
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel_nightly-0.6.2.
-liger_kernel_nightly-0.6.2.
-liger_kernel_nightly-0.6.2.
-liger_kernel_nightly-0.6.2.
-liger_kernel_nightly-0.6.2.
-liger_kernel_nightly-0.6.2.
+liger_kernel_nightly-0.6.2.dev20250822031344.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.6.2.dev20250822031344.dist-info/METADATA,sha256=XSw3SXL9PGPj5eGacLKkUfGpT7I7_QcYmrFdC75Wuck,24504
+liger_kernel_nightly-0.6.2.dev20250822031344.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.6.2.dev20250822031344.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+liger_kernel_nightly-0.6.2.dev20250822031344.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.6.2.dev20250822031344.dist-info/RECORD,,
```
The remaining dist-info files (LICENSE, NOTICE, WHEEL, top_level.txt) are unchanged between the two versions.