PyPI - liger-kernel-nightly - Versions diffs - 0.4.2.dev20241122052539__py3-none-any.whl → 0.4.2.dev20241123040418__py3-none-any.whl - Mend

liger-kernel-nightly 0.4.2.dev20241122052539py3-none-any.whl → 0.4.2.dev20241123040418py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

liger_kernel/__init__.py ADDED Viewed

File without changes

liger_kernel/ops/cross_entropy.py CHANGED Viewed

@@ -92,8 +92,8 @@ def liger_cross_entropy_kernel(
     # 3. [Online softmax] first pass: find max + sum
     m = float("-inf")  # m is the max value. use the notation from the paper
     d = 0.0  # d is the sum. use the notation from the paper
-    ori_X_y = tl.load(
-        X_ptr + y
+    ori_X_y = tl.load(X_ptr + y).cast(
+        tl.float32
     )  # we need to store the original value of X_y for the loss calculation
     if HAS_SOFTCAPPING:
         ori_X_y = softcap * tanh(ori_X_y / softcap)
@@ -106,8 +106,11 @@ def liger_cross_entropy_kernel(
     for i in range(0, n_cols, BLOCK_SIZE):
         X_offsets = i + tl.arange(0, BLOCK_SIZE)
         X_block = tl.load(
-            X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf")
-        )
+            X_ptr + X_offsets,
+            mask=X_offsets < n_cols,
+            other=float("-inf"),
+            # Ensure float32 precision for softmax calculation
+        ).cast(tl.float32)
         if HAS_SOFTCAPPING:
             X_block = softcap * tanh(X_block / softcap)
         block_max = tl.max(X_block)
@@ -141,8 +144,11 @@ def liger_cross_entropy_kernel(
     for i in range(0, n_cols, BLOCK_SIZE):
         X_offsets = i + tl.arange(0, BLOCK_SIZE)
         X_block = tl.load(
-            X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf")
-        )
+            X_ptr + X_offsets,
+            mask=X_offsets < n_cols,
+            other=float("-inf"),
+            # Ensure float32 precision for softmax calculation
+        ).cast(tl.float32)
         if HAS_SOFTCAPPING:
             intermediate = tanh(X_block / softcap)
             X_block = softcap * intermediate

liger_kernel/ops/fused_linear_cross_entropy.py CHANGED Viewed

@@ -26,7 +26,6 @@ def fused_linear_cross_entropy_forward(
     reduction="mean",
     softcap=None,
 ):
-    dtype = _input.dtype
     device = _input.device
     # inputs have shape: BT x H
@@ -74,9 +73,6 @@ def fused_linear_cross_entropy_forward(
         loss_1d_slice = loss_1d[start_idx:end_idx]  # chunk_size,
         n_non_ignore = (target_chunk != ignore_index).sum().item()
-        # when doing CE, use the upcasted precision
-        logits_chunk = logits_chunk.float()
         # ensure _input and target are contiguous
         logits_chunk = logits_chunk.contiguous()
         target_chunk = target_chunk.contiguous()
@@ -103,13 +99,6 @@ def fused_linear_cross_entropy_forward(
             num_warps=32 if not is_hip() else 16,
         )
-        # gradient of logits_chunk is computed in-place by the above triton kernel.
-        # Following HuggingFace model source code, we do the forward and backward
-        # w.r.t. logits in fp32 for numerical stability especially as the num classes (vocab size) is huge.
-        # (reference: https://github.com/huggingface/transformers/blob/v4.42.4/src/transformers/models/llama/modeling_llama.py#L1194)
-        # Propagating to lm_head's backward, we'll switch back to the original dtype.
-        logits_chunk = logits_chunk.to(dtype)
         # gradient of logits_chunk is computed in-place by the above triton kernel and is of shape: chunk_size x V
         # thus grad_input[start_idx: end_idx] should be of shape: chunk_size x H
         # additionally, since we are chunking the inputs, observe that the loss and gradients are calculated only

liger_kernel/ops/layer_norm.py CHANGED Viewed

@@ -180,8 +180,13 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
     dY = dY.view(-1, dim)
     n_rows, n_cols = dY.shape
+    sm_count = 1
+    if X.device.type == "cuda":
+        sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
+    elif X.device.type == "xpu":
+        sm_count = torch.xpu.get_device_properties(X.device).gpu_subslice_count
     DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
-    sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
     _DW = torch.empty((sm_count, n_cols), dtype=W.dtype, device=W.device)
     _DB = torch.empty((sm_count, n_cols), dtype=W.dtype, device=W.device)

liger_kernel/ops/rms_norm.py CHANGED Viewed

@@ -264,6 +264,7 @@ def rms_norm_backward(
     dY = dY.view(-1, dim)
     n_rows, n_cols = dY.shape
+    sm_count = 1
     if X.device.type == "cuda":
         sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
     elif X.device.type == "xpu":

liger_kernel/ops/utils.py CHANGED Viewed

@@ -20,6 +20,8 @@ import triton
 import triton.language as tl
 from packaging.version import Version
+from liger_kernel.utils import infer_device
 def is_hip() -> bool:
     return torch.version.hip is not None
@@ -69,10 +71,11 @@ def compare_version(package: str, operator: Callable, target: str):
 def get_amp_custom_fwd_bwd() -> Callable:
+    device = infer_device()
     if compare_version("torch", operator.ge, "2.4.0"):
         return (
-            functools.partial(torch.amp.custom_fwd, device_type="cuda"),
-            functools.partial(torch.amp.custom_bwd, device_type="cuda"),
+            functools.partial(torch.amp.custom_fwd, device_type=device),
+            functools.partial(torch.amp.custom_bwd, device_type=device),
         )
     return torch.cuda.amp.custom_fwd, torch.cuda.amp.custom_bwd

liger_kernel/utils.py ADDED Viewed

@@ -0,0 +1,13 @@
+import torch
+def infer_device():
+    """
+    Get current device name based on available devices
+    """
+    if torch.cuda.is_available():
+        return "cuda"
+    elif torch.xpu.is_available():
+        return "xpu"
+    else:
+        return "cpu"

{liger_kernel_nightly-0.4.2.dev20241122052539.dist-info → liger_kernel_nightly-0.4.2.dev20241123040418.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241122052539
+Version: 0.4.2.dev20241123040418
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.4.2.dev20241122052539.dist-info → liger_kernel_nightly-0.4.2.dev20241123040418.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,6 @@
+liger_kernel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 liger_kernel/env_report.py,sha256=jye8RvUkmhqaIshdeIpoUABoAu7FPKJUib4FnAfvkpw,1132
+liger_kernel/utils.py,sha256=HJa-xVKOohDn6pLVIx-Fv0V9h0QAL3qZGQNRICI-OpI,249
 liger_kernel/chunked_loss/__init__.py,sha256=R2wCcz4Y0kTAve926DH3k182XKezpXeACMHj05g9Mm8,346
 liger_kernel/chunked_loss/cpo_loss.py,sha256=H2L6mNtU8RMJ17u4aMZ9FHEfBvg1Z_hliY5-jZxiDBM,3079
 liger_kernel/chunked_loss/dpo_loss.py,sha256=XcCGLVmTVdEX30q41XRXXK_c-MSumVJ-l4tQwobUv2w,4228
@@ -7,19 +9,19 @@ liger_kernel/chunked_loss/fused_linear_preference.py,sha256=nkEpNWTHh5GmlnHOnGx5
 liger_kernel/chunked_loss/orpo_loss.py,sha256=DZ-_hm1twllBWujEV4M4-VDBkxMDBvoGqMGe-aGP1hA,3147
 liger_kernel/chunked_loss/simpo_loss.py,sha256=Jpl_U6DfxlzyHnlKN2i05K0vwz-ouiTmxlLGb439FwY,3328
 liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-liger_kernel/ops/cross_entropy.py,sha256=sfUb7-jIZp0EKXjg1DYy2Wdzw_Mg-mHmGoR5bpdm4tw,15526
-liger_kernel/ops/fused_linear_cross_entropy.py,sha256=ib7M3AjJE164yMfuS9R39k-5qnDgYOXptIT146lqYbg,9964
+liger_kernel/ops/cross_entropy.py,sha256=VqaYB9Zirc51eZ28OmjEZRrrV9UysRjS_vhIftB9sKo,15753
+liger_kernel/ops/fused_linear_cross_entropy.py,sha256=Tnw4gyAYVVdnCOqhOuLEzbUQ3goOTnoAfk3pqSIM5ac,9301
 liger_kernel/ops/fused_linear_jsd.py,sha256=nOv4zwfxHqqepKEmMsQuz-B3H-gRjyo8uClpmqSGLYA,9693
 liger_kernel/ops/geglu.py,sha256=MQL4zyzneZqZYUGPvb1QjI_EYT9_pKfSDgR25WD9jrI,4127
 liger_kernel/ops/group_norm.py,sha256=VaRErVJGR4JqgXXvuIjNGTn3E2egjLtU1y3ymwIf4d8,10961
 liger_kernel/ops/jsd.py,sha256=Ap2b0_geCl6fqBXLI1IS6Yn6GlO-8LgPmnOW3y47dus,6151
 liger_kernel/ops/kl_div.py,sha256=03FNXfvCb6M-56hhFepAFV9p6brArPR6KOKkdGD34mw,8374
-liger_kernel/ops/layer_norm.py,sha256=unGMYMOPqtkM9aTrokhcqgPmsV2AUN7Yzv86isVB9OI,7422
+liger_kernel/ops/layer_norm.py,sha256=_CZggw3GNEIUx5weDzadFit5I-Lzosoo8prgeJzcViY,7589
 liger_kernel/ops/qwen2vl_mrope.py,sha256=xZvQnhkSTjU-k6KiiRn9e0SYO1ESs1jmuZFMICduLpc,8552
-liger_kernel/ops/rms_norm.py,sha256=GKs49wXmUngY7MJ5QDQxTp4P2HDVqzZBaCr0pGtyZyM,11733
+liger_kernel/ops/rms_norm.py,sha256=g7OXwuYI8-LXudDwvXuiupVjjOsbu8c4wwv83VaHa54,11750
 liger_kernel/ops/rope.py,sha256=jrzaA9-6Orn44y_IIam9_YNPQxOFK2FrIRNfFea4EtU,8513
 liger_kernel/ops/swiglu.py,sha256=Fwxtd76rhHKT9ShQAGca9RsnASplAVxtYKHmiT73_yA,2994
-liger_kernel/ops/utils.py,sha256=3JSF--O7KT5Wa5BuO70M4h0XetxoZ_e9IoW9GRlxlBg,3777
+liger_kernel/ops/utils.py,sha256=_VQvd1PX5JXm5xaiBrk2gANp3qr4kM7qYG3ypkBwkMs,3850
 liger_kernel/ops/experimental/embedding.py,sha256=LYR66dB-jhvhtUjeV4PnNro-n77J1mdlmpSLSxB3Y6U,4186
 liger_kernel/ops/experimental/mm_int8int2.py,sha256=JpGVZCgRC6T8XMUJ_QbZRS2XU1bh0urIZphs5DTc1mY,13358
 liger_kernel/transformers/__init__.py,sha256=gia-eBxr7TLxU0GdDf8AfCY4WgDlFLqIGSt7EoQGsBA,1336
@@ -52,9 +54,9 @@ liger_kernel/transformers/model/qwen2.py,sha256=EyhSSzQOskGjSnCsKMZpd1s5IAIlHd5P
 liger_kernel/transformers/model/qwen2_vl.py,sha256=bIQe2bWiY--G84FhCD29Gdi64_qHP6vbcGsK6vKysQE,8547
 liger_kernel/triton/__init__.py,sha256=yfRe0zMb47QnqjecZWG7LnanfCTzeku7SgWRAwNVmzU,101
 liger_kernel/triton/monkey_patch.py,sha256=5BcGKTtdqeYchypBIBopGIWPx1-cFALz7sOKoEsqXJ0,1584
-liger_kernel_nightly-0.4.2.dev20241122052539.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.4.2.dev20241122052539.dist-info/METADATA,sha256=4fPpoJL_3aqEC-teWIauvl_VbNer1TYZ-7s8znCLSgs,21891
-liger_kernel_nightly-0.4.2.dev20241122052539.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.4.2.dev20241122052539.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-liger_kernel_nightly-0.4.2.dev20241122052539.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.4.2.dev20241122052539.dist-info/RECORD,,
+liger_kernel_nightly-0.4.2.dev20241123040418.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.4.2.dev20241123040418.dist-info/METADATA,sha256=lXW5-kGkMAutfiUrZflzYeW1bZo8efp65MDohQ3G1T0,21891
+liger_kernel_nightly-0.4.2.dev20241123040418.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.4.2.dev20241123040418.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+liger_kernel_nightly-0.4.2.dev20241123040418.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.4.2.dev20241123040418.dist-info/RECORD,,

{liger_kernel_nightly-0.4.2.dev20241122052539.dist-info → liger_kernel_nightly-0.4.2.dev20241123040418.dist-info}/LICENSE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.4.2.dev20241122052539.dist-info → liger_kernel_nightly-0.4.2.dev20241123040418.dist-info}/NOTICE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.4.2.dev20241122052539.dist-info → liger_kernel_nightly-0.4.2.dev20241123040418.dist-info}/WHEEL RENAMED Viewed

File without changes

{liger_kernel_nightly-0.4.2.dev20241122052539.dist-info → liger_kernel_nightly-0.4.2.dev20241123040418.dist-info}/top_level.txt RENAMED Viewed

File without changes

liger-kernel-nightly 0.4.2.dev20241122052539__py3-none-any.whl → 0.4.2.dev20241123040418__py3-none-any.whl

liger-kernel-nightly 0.4.2.dev20241122052539py3-none-any.whl → 0.4.2.dev20241123040418py3-none-any.whl