liger-kernel-nightly 0.3.1.dev20241102065152__py3-none-any.whl → 0.3.1.dev20241104210835__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of liger-kernel-nightly might be problematic.
- liger_kernel/ops/cross_entropy.py +3 -3
- liger_kernel/ops/fused_linear_cross_entropy.py +10 -5
- liger_kernel/ops/fused_linear_jsd.py +8 -3
- liger_kernel/ops/kl_div.py +2 -2
- liger_kernel/ops/utils.py +5 -1
- liger_kernel/transformers/model/llama.py +0 -1
- {liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/METADATA +10 -2
- {liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/RECORD +17 -17
- {liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/LICENSE +0 -0
- {liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/NOTICE +0 -0
- {liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/WHEEL +0 -0
- {liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/top_level.txt +0 -0
liger_kernel/ops/cross_entropy.py
CHANGED

@@ -2,7 +2,7 @@ import torch
 import triton
 import triton.language as tl
 
-from liger_kernel.ops.utils import element_mul_kernel
+from liger_kernel.ops.utils import element_mul_kernel, is_hip
 
 
 @triton.jit
@@ -194,7 +194,7 @@ def cross_entropy_forward(_input, target, ignore_index, label_smoothing, reducti
         BLOCK_SIZE=BLOCK_SIZE,
         # TODO: 32 seems to give the best performance
         # Performance is quite sensitive to num_warps
-        num_warps=32,
+        num_warps=32 if not is_hip() else 16,
     )
 
     loss = torch.sum(loss_1d)
@@ -219,7 +219,7 @@ def cross_entropy_backward(_input, grad_output):
            grad_output,
            V,
            BLOCK_SIZE=BLOCK_SIZE,
-            num_warps=32,
+            num_warps=32 if not is_hip() else 16,
        )
 
    return _input
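
Every hunk in this release applies the same one-line change: kernel launches that hard-coded num_warps=32 now pass num_warps=32 if not is_hip() else 16. The sketch below illustrates that launch pattern with a toy Triton kernel; the kernel, tensor sizes, and block size are illustrative assumptions, not the real liger_cross_entropy_kernel, and is_hip mirrors the helper added to liger_kernel/ops/utils.py later in this diff.

# Minimal sketch of the launch pattern changed in every hunk above (toy kernel,
# not the actual liger kernel signatures).
import torch
import triton
import triton.language as tl


def is_hip() -> bool:
    # Same check as the helper added in liger_kernel/ops/utils.py.
    return torch.version.hip is not None


@triton.jit
def double_kernel(x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Toy elementwise kernel standing in for the real cross-entropy kernel.
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x * 2.0, mask=mask)


x = torch.randn(1 << 16, device="cuda")  # "cuda" is also the device string on ROCm builds of PyTorch
out = torch.empty_like(x)
BLOCK_SIZE = 4096
grid = (triton.cdiv(x.numel(), BLOCK_SIZE),)
double_kernel[grid](
    x,
    out,
    x.numel(),
    BLOCK_SIZE=BLOCK_SIZE,
    num_warps=32 if not is_hip() else 16,  # the one-line change applied throughout this release
)
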
liger_kernel/ops/fused_linear_cross_entropy.py
CHANGED

@@ -2,7 +2,12 @@ import torch
 import triton
 
 from liger_kernel.ops.cross_entropy import liger_cross_entropy_kernel
-from liger_kernel.ops.utils import amp_custom_bwd, amp_custom_fwd, element_mul_kernel
+from liger_kernel.ops.utils import (
+    amp_custom_bwd,
+    amp_custom_fwd,
+    element_mul_kernel,
+    is_hip,
+)
 
 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
 # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
@@ -88,7 +93,7 @@ def fused_linear_cross_entropy_forward(
            label_smoothing=label_smoothing,
            reduction=reduction,
            BLOCK_SIZE=BLOCK_SIZE,
-            num_warps=32,
+            num_warps=32 if not is_hip() else 16,
        )
 
        # gradient of logits_chunk is computed in-place by the above triton kernel.
@@ -153,7 +158,7 @@ def fused_linear_cross_entropy_backward(
            grad_output,
            H,
            BLOCK_SIZE=BLOCK_SIZE,
-            num_warps=32,
+            num_warps=32 if not is_hip() else 16,
        )
 
        # handle grad_weight
@@ -167,7 +172,7 @@ def fused_linear_cross_entropy_backward(
                grad_output,
                H,
                BLOCK_SIZE=BLOCK_SIZE,
-                num_warps=32,
+                num_warps=32 if not is_hip() else 16,
            )
 
        if grad_bias is not None:
@@ -180,7 +185,7 @@ def fused_linear_cross_entropy_backward(
                grad_output,
                1,
                BLOCK_SIZE=BLOCK_SIZE,
-                num_warps=32,
+                num_warps=32 if not is_hip() else 16,
            )
    return grad_input, grad_weight, grad_bias
 
liger_kernel/ops/fused_linear_jsd.py
CHANGED

@@ -4,7 +4,12 @@ import torch
 import triton
 
 from liger_kernel.ops.jsd import _jsd_kernel
-from liger_kernel.ops.utils import amp_custom_bwd, amp_custom_fwd, element_mul_kernel
+from liger_kernel.ops.utils import (
+    amp_custom_bwd,
+    amp_custom_fwd,
+    element_mul_kernel,
+    is_hip,
+)
 
 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
 # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
@@ -147,7 +152,7 @@ def fused_linear_jsd_backward(grad_output, grad_input, grad_weight):
            grad_output,
            H,
            BLOCK_SIZE=BLOCK_SIZE,
-            num_warps=32,
+            num_warps=32 if not is_hip() else 16,
        )
 
        # handle grad_weight
@@ -161,7 +166,7 @@ def fused_linear_jsd_backward(grad_output, grad_input, grad_weight):
                grad_output,
                H,
                BLOCK_SIZE=BLOCK_SIZE,
-                num_warps=32,
+                num_warps=32 if not is_hip() else 16,
            )
 
    return grad_input, grad_weight
liger_kernel/ops/kl_div.py
CHANGED

@@ -4,13 +4,13 @@ import torch
 import triton
 import triton.language as tl
 
-from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.ops.utils import ensure_contiguous, is_hip
 
 
 def get_num_warps(BLOCK_SIZE):
     num_warps = 4
     if BLOCK_SIZE >= 32768:
-        num_warps = 32
+        num_warps = 32 if not is_hip() else 16
     elif BLOCK_SIZE >= 8192:
         num_warps = 16
     elif BLOCK_SIZE >= 2048:
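
For large block sizes the patched helper now returns half as many warps on ROCm. A quick illustration, assuming the patched module above is importable; the commented values follow directly from the branches shown in the hunk.

# Exercises the patched get_num_warps from the hunk above.
from liger_kernel.ops.kl_div import get_num_warps

# CUDA build of PyTorch (torch.version.hip is None):
#   get_num_warps(65536) -> 32, get_num_warps(16384) -> 16
# ROCm build of PyTorch (torch.version.hip is set):
#   get_num_warps(65536) -> 16, get_num_warps(16384) -> 16
print(get_num_warps(65536), get_num_warps(16384))
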
liger_kernel/ops/utils.py
CHANGED

@@ -21,6 +21,10 @@ import triton.language as tl
 from packaging.version import Version
 
 
+def is_hip() -> bool:
+    return torch.version.hip is not None
+
+
 def ensure_contiguous(fn):
     @functools.wraps(fn)
     def wrapper(ctx, *args, **kwargs):

@@ -47,7 +51,7 @@ def calculate_settings(n)
 
     num_warps = 4
     if BLOCK_SIZE >= 32768:
-        num_warps = 32
+        num_warps = 32 if not is_hip() else 16
     elif BLOCK_SIZE >= 8192:
         num_warps = 16
     elif BLOCK_SIZE >= 2048:
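
is_hip() keys off torch.version.hip, which is a version string on ROCm builds of PyTorch and None otherwise. The diff does not state why 16 is chosen on ROCm; a plausible reading is that Triton counts num_warps in 64-thread AMD wavefronts, so 16 wavefronts already reach 1024 threads per block, the same occupancy that 32 warps of 32 threads reach on NVIDIA hardware. A self-contained sketch of the selection logic follows; the value in the BLOCK_SIZE >= 2048 branch is an assumption, since that branch body is outside the hunk shown above.

# Standalone sketch of the HIP-aware warp selection applied in calculate_settings()
# and get_num_warps(). Thresholds copy the hunks above; the wavefront comment is an
# inference, not taken from the diff.
import torch


def is_hip() -> bool:
    # torch.version.hip is e.g. "6.0.x" on ROCm wheels and None on CUDA wheels.
    return torch.version.hip is not None


def pick_num_warps(BLOCK_SIZE: int) -> int:
    num_warps = 4
    if BLOCK_SIZE >= 32768:
        # 16 wavefronts x 64 threads = 1024 threads, matching 32 warps x 32 threads on CUDA.
        num_warps = 32 if not is_hip() else 16
    elif BLOCK_SIZE >= 8192:
        num_warps = 16
    elif BLOCK_SIZE >= 2048:
        num_warps = 8  # assumed value; this branch body is not shown in the hunk
    return num_warps
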
{liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.3.1.dev20241102065152
+Version: 0.3.1.dev20241104210835
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -163,11 +163,18 @@ With one line of code, Liger Kernel can increase throughput by more than 20% and
 
 ## Installation
 
-### Dependencies
+### Dependencies
+
+#### CUDA
 
 - `torch >= 2.1.2`
 - `triton >= 2.3.0`
 
+#### ROCm
+
+- `torch >= 2.5.0` Install according to the instruction in Pytorch official webpage.
+- `triton >= 3.0.0` Install from pypi. (e.g. `pip install triton==3.0.0`)
+
 ### Optional Dependencies
 
 - `transformers >= 4.x`: Required if you plan to use the transformers models patching APIs. The specific model you are working will dictate the minimum version of transformers.
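
The new ROCm rows imply the kernels can now target AMD GPUs when PyTorch itself is a ROCm build. Before following either row, a quick check of which backend the installed torch wheel was built for, using only standard torch attributes:

# Report whether the installed PyTorch wheel targets CUDA or ROCm.
import torch

print("torch:", torch.__version__)
print("CUDA build:", torch.version.cuda)  # version string on CUDA wheels, None on ROCm wheels
print("ROCm build:", torch.version.hip)   # version string on ROCm wheels, None on CUDA wheels
print("GPU visible:", torch.cuda.is_available())  # True on both backends when a device is present
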
@@ -197,6 +204,7 @@ pip install -e .
 pip install -e .[transformers]
 ```
 
+
 ## Getting Started
 
 There are a couple of ways to apply Liger kernels, depending on the level of customization required.
{liger_kernel_nightly-0.3.1.dev20241102065152.dist-info → liger_kernel_nightly-0.3.1.dev20241104210835.dist-info}/RECORD
CHANGED

@@ -1,16 +1,16 @@
 liger_kernel/env_report.py,sha256=LFUJ6UMkFFGPBYXBlqHFGy4bhsemEpSI-_1edSazlHI,1130
 liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-liger_kernel/ops/cross_entropy.py,sha256=
-liger_kernel/ops/fused_linear_cross_entropy.py,sha256=
-liger_kernel/ops/fused_linear_jsd.py,sha256=
+liger_kernel/ops/cross_entropy.py,sha256=23Di7l0T20OBj8K3-0PYEA5FCJrrbiKs3xMGyLlzbtg,11248
+liger_kernel/ops/fused_linear_cross_entropy.py,sha256=M-cF4BO-vvso2BIdk7-Q2FleeFPhqSQwZR1EirPC4OE,9456
+liger_kernel/ops/fused_linear_jsd.py,sha256=5D_obamh08lGGTMyh85kBJD_aNjPhOYf4-TmCZ6m4s4,9626
 liger_kernel/ops/geglu.py,sha256=MQL4zyzneZqZYUGPvb1QjI_EYT9_pKfSDgR25WD9jrI,4127
 liger_kernel/ops/jsd.py,sha256=anWfdioucxZy4JQfTvbHBR-IQrZKeH-gBF1MHwwTuTQ,5781
-liger_kernel/ops/kl_div.py,sha256=
+liger_kernel/ops/kl_div.py,sha256=03FNXfvCb6M-56hhFepAFV9p6brArPR6KOKkdGD34mw,8374
 liger_kernel/ops/layer_norm.py,sha256=unGMYMOPqtkM9aTrokhcqgPmsV2AUN7Yzv86isVB9OI,7422
 liger_kernel/ops/rms_norm.py,sha256=9S9wyZLmzNyJlBxV4vbv4p5es7bGP-m_5wK9JC6JIdA,10911
 liger_kernel/ops/rope.py,sha256=jrzaA9-6Orn44y_IIam9_YNPQxOFK2FrIRNfFea4EtU,8513
 liger_kernel/ops/swiglu.py,sha256=Fwxtd76rhHKT9ShQAGca9RsnASplAVxtYKHmiT73_yA,2994
-liger_kernel/ops/utils.py,sha256=
+liger_kernel/ops/utils.py,sha256=3JSF--O7KT5Wa5BuO70M4h0XetxoZ_e9IoW9GRlxlBg,3777
 liger_kernel/ops/experimental/embedding.py,sha256=LYR66dB-jhvhtUjeV4PnNro-n77J1mdlmpSLSxB3Y6U,4186
 liger_kernel/ops/experimental/mm_int8int2.py,sha256=JpGVZCgRC6T8XMUJ_QbZRS2XU1bh0urIZphs5DTc1mY,13358
 liger_kernel/transformers/__init__.py,sha256=gia-eBxr7TLxU0GdDf8AfCY4WgDlFLqIGSt7EoQGsBA,1336
@@ -31,7 +31,7 @@ liger_kernel/transformers/trainer_integration.py,sha256=W3ON51O5GkyzNJsItz0y5rKx
 liger_kernel/transformers/experimental/embedding.py,sha256=HpckiAMKM8-SRxKDcGTqortVxnjhwpZsfsp9lfjqfeM,895
 liger_kernel/transformers/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 liger_kernel/transformers/model/gemma.py,sha256=EcdkGbSj_qroTDFl0Sc_HLyDyY0xcDhwrgkM_wkXnw8,4987
-liger_kernel/transformers/model/llama.py,sha256=
+liger_kernel/transformers/model/llama.py,sha256=RinsgC_eR-YNvZd2SHPQxZ4eyR3uViaTFCM3SvI5nks,10426
 liger_kernel/transformers/model/mistral.py,sha256=_MQJrDntlxBO5cJwgTjr2rk2nNd5FAXVnzcTg_PEekQ,5079
 liger_kernel/transformers/model/mixtral.py,sha256=51FghRY8aGBWat7KSgTeFDqdStDiXY3dEJepByNhEOE,5847
 liger_kernel/transformers/model/mllama.py,sha256=S00P0pJrGHOWBx170TPYZbQ0djv0__m8Dqv1FvKZUyE,5926
@@ -40,14 +40,14 @@ liger_kernel/transformers/model/qwen2.py,sha256=3inWFXGHYT7wA10OR6bq3mDUBrr10AS5
 liger_kernel/transformers/model/qwen2_vl.py,sha256=ymsm9aQpSUiSU12GY8FO608p9dSHOz4TCnNI1htX5bk,6975
 liger_kernel/triton/__init__.py,sha256=yfRe0zMb47QnqjecZWG7LnanfCTzeku7SgWRAwNVmzU,101
 liger_kernel/triton/monkey_patch.py,sha256=5BcGKTtdqeYchypBIBopGIWPx1-cFALz7sOKoEsqXJ0,1584
-liger_kernel_nightly-0.3.1.
-liger_kernel_nightly-0.3.1.
-liger_kernel_nightly-0.3.1.
-liger_kernel_nightly-0.3.1.
-liger_kernel_nightly-0.3.1.
-liger_kernel_nightly-0.3.1.
-liger_kernel_nightly-0.3.1.
-liger_kernel_nightly-0.3.1.
-liger_kernel_nightly-0.3.1.
-liger_kernel_nightly-0.3.1.
-liger_kernel_nightly-0.3.1.
+liger_kernel_nightly-0.3.1.dev20241104210835.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.3.1.dev20241104210835.dist-info/LICENSE-Apache-2.0,sha256=NRaCIsL9eblGS35gk4WKTC0usNYnR_mgRHJTKqz2_UE,11348
+liger_kernel_nightly-0.3.1.dev20241104210835.dist-info/LICENSE-MIT-AutoAWQ,sha256=pfiOyInrAPY3xQbvV1i-gOqNZK7QEyIepT1IbqOYYYo,1067
+liger_kernel_nightly-0.3.1.dev20241104210835.dist-info/LICENSE-MIT-Efficient-Cross-Entropy,sha256=PaC9HqyFYTy-ClS0H8Zfa2motJuTppjECXmjHwJcaOk,1063
+liger_kernel_nightly-0.3.1.dev20241104210835.dist-info/LICENSE-MIT-llmc,sha256=kyFLt_XUcXS88CuxQt5-PjOcLjpJP2m-T4gtqZf3GLc,1071
+liger_kernel_nightly-0.3.1.dev20241104210835.dist-info/LICENSE-MIT-triton,sha256=wL6W8IwsKiyHtzXubg8TCXhRZuo8S83EPdqXffYtqWg,1131
+liger_kernel_nightly-0.3.1.dev20241104210835.dist-info/METADATA,sha256=KLe3u0yMc9Dipf9wsCM2DXabjlK1X-cgfqnJe5z-Lmk,27901
+liger_kernel_nightly-0.3.1.dev20241104210835.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.3.1.dev20241104210835.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+liger_kernel_nightly-0.3.1.dev20241104210835.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.3.1.dev20241104210835.dist-info/RECORD,,