liger-kernel-nightly 0.4.2.dev20241121054604__py3-none-any.whl → 0.4.2.dev20241121224158__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel/ops/fused_linear_jsd.py +1 -1
- liger_kernel/ops/jsd.py +19 -10
- liger_kernel/transformers/fused_linear_jsd.py +1 -4
- liger_kernel/transformers/jsd.py +1 -4
- {liger_kernel_nightly-0.4.2.dev20241121054604.dist-info → liger_kernel_nightly-0.4.2.dev20241121224158.dist-info}/METADATA +3 -3
- {liger_kernel_nightly-0.4.2.dev20241121054604.dist-info → liger_kernel_nightly-0.4.2.dev20241121224158.dist-info}/RECORD +10 -10
- {liger_kernel_nightly-0.4.2.dev20241121054604.dist-info → liger_kernel_nightly-0.4.2.dev20241121224158.dist-info}/LICENSE +0 -0
- {liger_kernel_nightly-0.4.2.dev20241121054604.dist-info → liger_kernel_nightly-0.4.2.dev20241121224158.dist-info}/NOTICE +0 -0
- {liger_kernel_nightly-0.4.2.dev20241121054604.dist-info → liger_kernel_nightly-0.4.2.dev20241121224158.dist-info}/WHEEL +0 -0
- {liger_kernel_nightly-0.4.2.dev20241121054604.dist-info → liger_kernel_nightly-0.4.2.dev20241121224158.dist-info}/top_level.txt +0 -0
liger_kernel/ops/fused_linear_jsd.py
CHANGED
@@ -202,7 +202,7 @@ class LigerFusedLinearJSDFunction(torch.autograd.Function):
         teacher_input (torch.tensor): input of the last projection layer in teacher model, with shape (B*T, H), where B is batch size, T is sequence length, H is hidden dimension.
         teacher_weight (torch.tensor): the last projection layer in teacher model, with shape (V, H), where V is vocab size
         shift_labels (Optional[torch.LongTensor]): indicator of next predicted vocab with shape (BT) where each value is in [0, V-1].
-        jsd_beta (float): coefficient beta of generalized JSD in the
+        jsd_beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
         ignore_index (int): the index to ignore. Default: -100
         temperature (float): temperature in softmax function to control the output probability distribution. Default: `1.0`

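Note: the docstring updates throughout this release all describe the same quantity. As a reference sketch (inferred from the kernel change below, not quoted from the package docs), the generalized JSD with teacher distribution $P$ and student distribution $Q$ is

$$\mathrm{JSD}_\beta(P \| Q) = \beta\,\mathrm{KL}(P \| M) + (1-\beta)\,\mathrm{KL}(Q \| M), \qquad M = \beta P + (1-\beta)\,Q.$$

At the boundaries this interpolation degenerates to zero (e.g. at $\beta = 0$, $M = Q$, so both terms vanish), which is why the kernel below special-cases $\beta = 0$ to plain forward KL $\mathrm{KL}(P \| Q)$ and $\beta = 1$ to reverse KL $\mathrm{KL}(Q \| P)$ instead.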
liger_kernel/ops/jsd.py
CHANGED
@@ -18,7 +18,7 @@ def _jsd_kernel(
     dX_ptr,
     dX_stride,
     label_ptr,
-    beta,
+    beta: tl.constexpr,
     n_non_ignore: int,
     ignore_index: tl.constexpr,
     n_cols,
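Declaring `beta` as `tl.constexpr` makes it a compile-time constant: Triton specializes the kernel per distinct `beta` value, so the `if beta == 0.0` / `elif beta == 1.0` branching added in the next hunk is resolved at compilation rather than evaluated per element. A minimal, hypothetical demo of that specialization pattern (not from this package):

```python
import triton
import triton.language as tl

@triton.jit
def _constexpr_branch_demo(out_ptr, beta: tl.constexpr):
    # beta is fixed at compile time: each distinct value produces its own
    # kernel specialization, and only the matching branch is compiled in.
    if beta == 0.0:
        val = 0.0
    else:
        val = 1.0 - beta
    tl.store(out_ptr, val)
```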
@@ -50,17 +50,26 @@
     X = tl.load(X_ptr + offsets, mask=mask, other=float("-inf")).to(tl.float32)
     Y = tl.load(Y_ptr + offsets, mask=mask, other=float("-inf")).to(tl.float32)

-    Q = tl.exp(X)
-    P = tl.exp(Y)
-    M = beta * P + (1 - beta) * Q
-    log_M = tl.log(M)
+    if beta == 0.0:  # forward KL
+        Y_prob = tl.exp(Y)
+        loss = Y_prob * (Y - X)
+        dX = -Y_prob
+    elif beta == 1.0:
+        X_prob = tl.exp(X)
+        loss = X_prob * (X - Y)
+        dX = loss + X_prob
+    else:
+        Q = tl.exp(X)
+        P = tl.exp(Y)
+        M = beta * P + (1 - beta) * Q
+        log_M = tl.log(M)
+
+        loss = beta * P * Y + (1 - beta) * Q * X - M * log_M
+        dX = (1 - beta) * Q * (X - log_M)

-    loss = beta * P * Y + (1 - beta) * Q * X - M * log_M
-    # reduction == "batchmean"
     loss = loss / n_non_ignore
+    dX = dX / n_non_ignore
     tl.store(loss_ptr + offsets, loss, mask=mask)
-
-    dX = (1 - beta) * Q * (X - log_M) / n_non_ignore
     tl.store(dX_ptr + offsets, dX, mask=mask)

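For the `else` branch, with $Q = e^X$ (student) and $P = e^Y$ (teacher), the stored per-element loss $\beta P Y + (1-\beta) Q X - M \log M$ sums to $\beta\,\mathrm{KL}(P\|M) + (1-\beta)\,\mathrm{KL}(Q\|M)$, and $dX = (1-\beta)\,Q\,(X - \log M)$ is its exact derivative with respect to $X$, so the gradient comes for free in the forward pass. A plain-PyTorch sketch of the same forward math (a hypothetical checking helper, not part of the package API; it ignores `shift_labels`/`ignore_index` and divides by all rows rather than `n_non_ignore`):

```python
import torch

def jsd_reference(X: torch.Tensor, Y: torch.Tensor, beta: float = 0.5) -> torch.Tensor:
    """X, Y: student/teacher log-probabilities of shape (BT, V)."""
    if beta == 0.0:    # forward KL: KL(P || Q)
        P = Y.exp()
        loss = P * (Y - X)
    elif beta == 1.0:  # reverse KL: KL(Q || P)
        Q = X.exp()
        loss = Q * (X - Y)
    else:              # generalized JSD
        Q, P = X.exp(), Y.exp()
        M = beta * P + (1 - beta) * Q
        loss = beta * P * Y + (1 - beta) * Q * X - M * M.log()
    return loss.sum() / X.shape[0]  # "batchmean"-style reduction
```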
@@ -142,7 +151,7 @@ class LigerJSDFunction(torch.autograd.Function):
         _input (torch.Tensor): predict values with shape (BT, V) in logspace
         target (torch.Tensor): ground truth values with shape (BT, V) in logspace
         shift_labels (Optional[torch.LongTensor]): indicator of next predicted vocab with shape (BT) where each value is in [0, V-1].
-        beta (float): coefficient beta of generalized JSD in the
+        beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
         ignore_index (int): the index to ignore. Default: -100

     Returns:
liger_kernel/transformers/fused_linear_jsd.py
CHANGED
@@ -12,7 +12,7 @@ class LigerFusedLinearJSD(torch.nn.Module):
     the materialization of the large logits tensor.

     Args:
-        jsd_beta (float): coefficient beta of generalized JSD in the
+        jsd_beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
         ignore_index (int): The index to ignore in the target. Default: `-100`
         temperature (float): temperature in softmax function to control the output probability distribution. Default: `1.0`

@@ -70,9 +70,6 @@ class LigerFusedLinearJSD(torch.nn.Module):

     def __init__(self, jsd_beta=0.5, ignore_index=-100, temperature=1.0):
         super().__init__()
-        assert (
-            jsd_beta > 0 and jsd_beta < 1
-        ), f"beta must be greater than 0 and less than 1. Got: {jsd_beta}"
         assert temperature != 0, "temperature cannot be 0."
         self.jsd_beta = jsd_beta
         self.temperature = temperature
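With the range assertion removed, boundary betas now construct without error. A minimal sketch using the constructor signature shown above (Triton execution still requires a CUDA device):

```python
from liger_kernel.transformers.fused_linear_jsd import LigerFusedLinearJSD

# Previously rejected by `assert jsd_beta > 0 and jsd_beta < 1`; now valid.
forward_kl = LigerFusedLinearJSD(jsd_beta=0.0)  # distills with forward KL
reverse_kl = LigerFusedLinearJSD(jsd_beta=1.0)  # distills with reverse KL
symmetric = LigerFusedLinearJSD(jsd_beta=0.5)   # default: symmetric JSD
```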
liger_kernel/transformers/jsd.py
CHANGED
@@ -18,7 +18,7 @@ class LigerJSD(torch.nn.Module):
     :math:`P` denotes the teacher model and :math:`Q` denotes the student model.

     Args:
-        beta (float): coefficient beta of generalized JSD in the
+        beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
         ignore_index (int): The index to ignore in the target. Default: `-100`

     Shape:
@@ -58,9 +58,6 @@ class LigerJSD(torch.nn.Module):

     def __init__(self, beta: float = 0.5, ignore_index: int = -100):
         super().__init__()
-        assert (
-            beta > 0 and beta < 1
-        ), f"beta must be greater than 0 and less than 1. Got: {beta}"
         self.beta = beta
         self.ignore_index = ignore_index

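The same relaxation applies here. A minimal usage sketch (assuming, per the class docstring, that `forward` takes student log-probabilities first and teacher log-probabilities second; run on a CUDA device since the loss is a Triton kernel):

```python
import torch
from liger_kernel.transformers.jsd import LigerJSD

jsd = LigerJSD(beta=1.0)  # boundary value, now allowed: reverse KL

student_logp = torch.randn(4, 32, device="cuda").log_softmax(dim=-1)
teacher_logp = torch.randn(4, 32, device="cuda").log_softmax(dim=-1)
loss = jsd(student_logp, teacher_logp)
```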
{liger_kernel_nightly-0.4.2.dev20241121054604.dist-info → liger_kernel_nightly-0.4.2.dev20241121224158.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241121054604
+Version: 0.4.2.dev20241121224158
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -303,8 +303,8 @@ $$\text{GeGLU}(x)=\text{GELU}(xW+b)\otimes(xV+c)$$
 <!-- TODO: verify vocab sizes are accurate -->
 - **FusedLinearCrossEntropy**: Peak memory usage of cross entropy loss is further improved by fusing the model head with the CE loss and chunking the input for block-wise loss and gradient calculation, a technique inspired by [Efficient Cross Entropy](https://github.com/mgmalek/efficient_cross_entropy). It achieves >4X memory reduction for 128k vocab size. **This is highly effective for large batch size, large sequence length, and large vocabulary sizes.** Please refer to the [Medusa example](https://github.com/linkedin/Liger-Kernel/tree/main/examples/medusa) for individual kernel usage.
 - **KLDivergence**: [KL Divergence](https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html) is implemented by fusing the forward into a single triton kernel, with reduction done outside the kernel. It achieves ~1.5X speed and ~15% memory reduction for 128K vocab size.
-- **JSD**: [Generalized JSD](https://arxiv.org/pdf/2306.13649) (Jensen-Shannon divergence), is implemented by computing both the loss and gradient in the forward pass. It achieves ~1.5X speed and ~54% memory reduction for 128k vocab size.
-- **FusedLinearJSD**: Peak memory usage of JSD loss is further improved by fusing the model head with the JSD and chunking the input for block-wise loss and gradient calculation. It achieves ~85% memory reduction for 128k vocab size where batch size $\times$ sequence length is 8192.
+- **JSD**: [Generalized JSD](https://arxiv.org/pdf/2306.13649) (Jensen-Shannon divergence), is implemented by computing both the loss and gradient in the forward pass. It achieves ~1.5X speed and ~54% memory reduction for 128k vocab size. **NOTE**: It implements forward/reverse KL when `beta` equals 0 and 1 respectively.
+- **FusedLinearJSD**: Peak memory usage of JSD loss is further improved by fusing the model head with the JSD and chunking the input for block-wise loss and gradient calculation. It achieves ~85% memory reduction for 128k vocab size where batch size $\times$ sequence length is 8192. **NOTE**: It implements forward/reverse KL when `beta` equals 0 and 1 respectively.


 ### Experimental Kernels
{liger_kernel_nightly-0.4.2.dev20241121054604.dist-info → liger_kernel_nightly-0.4.2.dev20241121224158.dist-info}/RECORD
CHANGED
@@ -9,10 +9,10 @@ liger_kernel/chunked_loss/simpo_loss.py,sha256=Jpl_U6DfxlzyHnlKN2i05K0vwz-ouiTmx
 liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 liger_kernel/ops/cross_entropy.py,sha256=sfUb7-jIZp0EKXjg1DYy2Wdzw_Mg-mHmGoR5bpdm4tw,15526
 liger_kernel/ops/fused_linear_cross_entropy.py,sha256=ib7M3AjJE164yMfuS9R39k-5qnDgYOXptIT146lqYbg,9964
-liger_kernel/ops/fused_linear_jsd.py,sha256=
+liger_kernel/ops/fused_linear_jsd.py,sha256=nOv4zwfxHqqepKEmMsQuz-B3H-gRjyo8uClpmqSGLYA,9693
 liger_kernel/ops/geglu.py,sha256=MQL4zyzneZqZYUGPvb1QjI_EYT9_pKfSDgR25WD9jrI,4127
 liger_kernel/ops/group_norm.py,sha256=VaRErVJGR4JqgXXvuIjNGTn3E2egjLtU1y3ymwIf4d8,10961
-liger_kernel/ops/jsd.py,sha256=
+liger_kernel/ops/jsd.py,sha256=Ap2b0_geCl6fqBXLI1IS6Yn6GlO-8LgPmnOW3y47dus,6151
 liger_kernel/ops/kl_div.py,sha256=03FNXfvCb6M-56hhFepAFV9p6brArPR6KOKkdGD34mw,8374
 liger_kernel/ops/layer_norm.py,sha256=unGMYMOPqtkM9aTrokhcqgPmsV2AUN7Yzv86isVB9OI,7422
 liger_kernel/ops/qwen2vl_mrope.py,sha256=xZvQnhkSTjU-k6KiiRn9e0SYO1ESs1jmuZFMICduLpc,8552
@@ -27,10 +27,10 @@ liger_kernel/transformers/auto_model.py,sha256=RMIwQHSiXoksXFTIqFZ4PLBgoqkxJJAT3
 liger_kernel/transformers/cross_entropy.py,sha256=yEm_YQ7oa3_BzT3hdW6KrAslduhSqWcJQVNZZDcWCg4,1758
 liger_kernel/transformers/functional.py,sha256=jwTHmyjOVC1_I-6ztY1EbbRqPIfFHojcHrP2c4P6U4I,2123
 liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=_i0PXSp5iZ9pKXdEeZ4lvHCENJYjV4y74yz3ZRG5XQg,1484
-liger_kernel/transformers/fused_linear_jsd.py,sha256=
+liger_kernel/transformers/fused_linear_jsd.py,sha256=bZ4otCvWBuOnA5XdQL-FzZVItJlDt-ht9e_pG7PG93E,3999
 liger_kernel/transformers/geglu.py,sha256=QcrME_8ooIn0xa59LaC0aoOdRrBIFd11Y0bAyF0NfCw,1130
 liger_kernel/transformers/group_norm.py,sha256=FJ9R7mS9G1wO-GRIQ6QKSmIhnZ6nQ6GIkE4NnX_hnn0,2241
-liger_kernel/transformers/jsd.py,sha256=
+liger_kernel/transformers/jsd.py,sha256=sbr8DnKSYZJH9pv2rpmboNijYGpZKbhb2-WSGp5_v6g,3001
 liger_kernel/transformers/kl_div.py,sha256=qVhjBg6tjRyue5iZ3NFxo8uySY4JuIFJyv0IM_50F24,431
 liger_kernel/transformers/layer_norm.py,sha256=fd6o4kSHJWolQMWxh-l1qObfgL08ruNbUoBiANKX1ow,972
 liger_kernel/transformers/monkey_patch.py,sha256=Fk2v4GZQDJzfh3Cpc6BHNJbs_tungDyWmqS9nuG9Lc4,38406
@@ -52,9 +52,9 @@ liger_kernel/transformers/model/qwen2.py,sha256=EyhSSzQOskGjSnCsKMZpd1s5IAIlHd5P
 liger_kernel/transformers/model/qwen2_vl.py,sha256=bIQe2bWiY--G84FhCD29Gdi64_qHP6vbcGsK6vKysQE,8547
 liger_kernel/triton/__init__.py,sha256=yfRe0zMb47QnqjecZWG7LnanfCTzeku7SgWRAwNVmzU,101
 liger_kernel/triton/monkey_patch.py,sha256=5BcGKTtdqeYchypBIBopGIWPx1-cFALz7sOKoEsqXJ0,1584
-liger_kernel_nightly-0.4.2.dev20241121054604.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.4.2.dev20241121054604.dist-info/METADATA,sha256=
-liger_kernel_nightly-0.4.2.dev20241121054604.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.4.2.dev20241121054604.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-liger_kernel_nightly-0.4.2.dev20241121054604.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.4.2.dev20241121054604.dist-info/RECORD,,
+liger_kernel_nightly-0.4.2.dev20241121224158.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.4.2.dev20241121224158.dist-info/METADATA,sha256=3HyUur6qJmSMTQaxiLaiDaGUrvU3_ILHlvWdobywuso,21891
+liger_kernel_nightly-0.4.2.dev20241121224158.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.4.2.dev20241121224158.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+liger_kernel_nightly-0.4.2.dev20241121224158.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.4.2.dev20241121224158.dist-info/RECORD,,