PyPI - liger-kernel-nightly - Versions diffs - 0.6.2.dev20251014205028__py3-none-any.whl → 0.6.2.dev20251016055812__py3-none-any.whl - Mend

liger-kernel-nightly 0.6.2.dev20251014205028__py3-none-any.whl → 0.6.2.dev20251016055812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

liger_kernel/ops/cross_entropy.py CHANGED Viewed

@@ -414,6 +414,8 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
         Returns:
         tuple: A tuple with the compouted losses with respect to loss and z loss. The elements are tensors or None.
         """
+        input_requires_grad = _input.requires_grad
         loss, z_loss, _input = cross_entropy_forward(
             _input,
             target,
@@ -428,7 +430,8 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
         # TODO: investigation
         # If we don't detach the _input tensor, the memory will double
         # Not sure why but seems that there will be a time both grad and value exist but in different location
-        ctx.save_for_backward(_input.detach())
+        if input_requires_grad:
+            ctx.save_for_backward(_input.detach())
         ctx.return_z_loss = return_z_loss
         return loss, z_loss

liger_kernel/ops/fused_linear_cross_entropy.py CHANGED Viewed

@@ -31,6 +31,8 @@ def fused_linear_cross_entropy_forward(
     assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
     device = _input.device
+    input_requires_grad = _input.requires_grad
     # inputs have shape: BT x H
     # materialized activations will have shape: BT x V
     # the increase in memory = BT x V
@@ -49,12 +51,13 @@ def fused_linear_cross_entropy_forward(
     grad_input = torch.zeros_like(_input, device=device)
     # we use fp32 for loss and gradients accumulator
-    if accum_dtype is None:
-        grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
-        grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
-    else:
-        grad_weight = torch.zeros_like(weight, dtype=accum_dtype, device=device) if weight.requires_grad else None
-        grad_bias = torch.zeros_like(bias, dtype=accum_dtype, device=device) if bias is not None else None
+    if input_requires_grad:
+        if accum_dtype is None:
+            grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
+            grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
+        else:
+            grad_weight = torch.zeros_like(weight, dtype=accum_dtype, device=device) if weight.requires_grad else None
+            grad_bias = torch.zeros_like(bias, dtype=accum_dtype, device=device) if bias is not None else None
     loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
     z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None
@@ -150,7 +153,7 @@ def fused_linear_cross_entropy_forward(
             RETURN_Z_LOSS=return_z_loss,
             HAS_WEIGHT=True if ce_weight is not None else False,
             HAS_SOFTCAPPING=True if softcap is not None else False,
-            HAS_GRADIENTS=_input.requires_grad,
+            HAS_GRADIENTS=input_requires_grad,
             BLOCK_SIZE=BLOCK_SIZE,
             num_warps=32 if not is_hip() else 16,
         )
@@ -172,12 +175,13 @@ def fused_linear_cross_entropy_forward(
             scaling_factors_expanded = scaling_factors.unsqueeze(-1)  # chunk_size x 1
             grad_logits_chunk = grad_logits_chunk * scaling_factors_expanded
-        grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
+        if input_requires_grad:
+            grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
-        if grad_weight is not None and _input.requires_grad:
+        if grad_weight is not None and input_requires_grad:
             grad_weight += torch.mm(grad_logits_chunk.t(), _input_chunk).float()
-        if bias is not None and _input.requires_grad:
+        if bias is not None and input_requires_grad:
             torch.add(
                 input=grad_bias,
                 other=grad_logits_chunk.sum(dim=0),

{liger_kernel_nightly-0.6.2.dev20251014205028.dist-info → liger_kernel_nightly-0.6.2.dev20251016055812.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.6.2.dev20251014205028
+Version: 0.6.2.dev20251016055812
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.6.2.dev20251014205028.dist-info → liger_kernel_nightly-0.6.2.dev20251016055812.dist-info}/RECORD RENAMED Viewed

@@ -17,10 +17,10 @@ liger_kernel/chunked_loss/kto_loss.py,sha256=llVCe6DkcpCo57seGWoMikaQVFApx764jsm
 liger_kernel/chunked_loss/orpo_loss.py,sha256=nu9UYG16dcMw93lvHi4_hYs3Q0FK1KnlmMRj7OpYU8s,4872
 liger_kernel/chunked_loss/simpo_loss.py,sha256=fy2w8KbhMrBv7b1jdIeH3bBFxY52bPQPZb3KwBvmurM,5385
 liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-liger_kernel/ops/cross_entropy.py,sha256=OVkani9JEmCJ8IHN3UgJKzGW7zxJWDwy1EaWVcbShgQ,19517
+liger_kernel/ops/cross_entropy.py,sha256=CEgAeX97ezIBRhK3dPQRKsEQiwgnBDOewtDoqKXzw_Q,19605
 liger_kernel/ops/dyt.py,sha256=gCLz4S8aul8SY9nvIGaoK67aGb7U9MJRQdo3ONqmQYs,5417
 liger_kernel/ops/fused_add_rms_norm.py,sha256=UBqmlqFCmhSAIpkNKd8rrfXatX7Z4J9bp2dX9A0lrJQ,14017
-liger_kernel/ops/fused_linear_cross_entropy.py,sha256=PqIPHU8EjkHRJF6cNZViDucFVOgqo7eanJxB53Npke8,14388
+liger_kernel/ops/fused_linear_cross_entropy.py,sha256=rL6PyM4_9CLj7OL6qHa_ssFdJn0JEZlE12znF7T5cvM,14521
 liger_kernel/ops/fused_linear_jsd.py,sha256=CSoprxb-YcJy-YUKiTcYkxN8sb9h2kdk_iHuncvSV5c,9683
 liger_kernel/ops/fused_neighborhood_attention.py,sha256=vPi5xbnh6wxyZehaqo6Tuilqo2fN5SGDiONjnNmIKqs,35556
 liger_kernel/ops/geglu.py,sha256=r0WSq9E93zzynL44Wh8femzOWK07_SseBM_pJUyxT3s,4144
@@ -101,9 +101,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=tX0h63aOFe3rNqTmk6JpMf75UPo981yzEa6TghnjS0Q,5370
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel_nightly-0.6.2.dev20251014205028.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.6.2.dev20251014205028.dist-info/METADATA,sha256=6VDasn5yo1wPa73CAIS4iRzr6TJ_cWpSjF_QbD5r1sM,24777
-liger_kernel_nightly-0.6.2.dev20251014205028.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.6.2.dev20251014205028.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-liger_kernel_nightly-0.6.2.dev20251014205028.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.6.2.dev20251014205028.dist-info/RECORD,,
+liger_kernel_nightly-0.6.2.dev20251016055812.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.6.2.dev20251016055812.dist-info/METADATA,sha256=0T7yuosaQopminlzrQ4Z2ZyY7Lm_Dst67jQScbOIlHU,24777
+liger_kernel_nightly-0.6.2.dev20251016055812.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.6.2.dev20251016055812.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+liger_kernel_nightly-0.6.2.dev20251016055812.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.6.2.dev20251016055812.dist-info/RECORD,,

{liger_kernel_nightly-0.6.2.dev20251014205028.dist-info → liger_kernel_nightly-0.6.2.dev20251016055812.dist-info}/LICENSE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.6.2.dev20251014205028.dist-info → liger_kernel_nightly-0.6.2.dev20251016055812.dist-info}/NOTICE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.6.2.dev20251014205028.dist-info → liger_kernel_nightly-0.6.2.dev20251016055812.dist-info}/WHEEL RENAMED Viewed

File without changes

{liger_kernel_nightly-0.6.2.dev20251014205028.dist-info → liger_kernel_nightly-0.6.2.dev20251016055812.dist-info}/top_level.txt RENAMED Viewed

File without changes

liger-kernel-nightly 0.6.2.dev20251014205028__py3-none-any.whl → 0.6.2.dev20251016055812__py3-none-any.whl

liger-kernel-nightly 0.6.2.dev20251014205028__py3-none-any.whl → 0.6.2.dev20251016055812__py3-none-any.whl