liger-kernel-nightly 0.5.2.dev20241223032630__py3-none-any.whl → 0.5.2.dev20241223042135__py3-none-any.whl
- liger_kernel/chunked_loss/cpo_loss.py +5 -11
- liger_kernel/chunked_loss/dpo_loss.py +1 -4
- liger_kernel/chunked_loss/fused_linear_distillation.py +37 -37
- liger_kernel/chunked_loss/fused_linear_preference.py +40 -64
- liger_kernel/chunked_loss/orpo_loss.py +2 -6
- liger_kernel/chunked_loss/simpo_loss.py +4 -8
- liger_kernel/env_report.py +4 -11
- liger_kernel/ops/cross_entropy.py +7 -10
- liger_kernel/ops/experimental/embedding.py +1 -3
- liger_kernel/ops/experimental/mm_int8int2.py +3 -9
- liger_kernel/ops/fused_linear_cross_entropy.py +7 -15
- liger_kernel/ops/fused_linear_jsd.py +11 -29
- liger_kernel/ops/geglu.py +6 -17
- liger_kernel/ops/group_norm.py +11 -28
- liger_kernel/ops/jsd.py +2 -6
- liger_kernel/ops/kl_div.py +4 -7
- liger_kernel/ops/layer_norm.py +3 -5
- liger_kernel/ops/qwen2vl_mrope.py +8 -25
- liger_kernel/ops/rms_norm.py +11 -29
- liger_kernel/ops/rope.py +8 -24
- liger_kernel/ops/swiglu.py +4 -8
- liger_kernel/ops/utils.py +2 -0
- liger_kernel/transformers/__init__.py +16 -24
- liger_kernel/transformers/auto_model.py +6 -13
- liger_kernel/transformers/cross_entropy.py +1 -3
- liger_kernel/transformers/experimental/embedding.py +1 -3
- liger_kernel/transformers/functional.py +2 -6
- liger_kernel/transformers/fused_linear_cross_entropy.py +2 -6
- liger_kernel/transformers/geglu.py +1 -4
- liger_kernel/transformers/group_norm.py +3 -9
- liger_kernel/transformers/jsd.py +1 -3
- liger_kernel/transformers/kl_div.py +1 -3
- liger_kernel/transformers/layer_norm.py +3 -9
- liger_kernel/transformers/model/gemma.py +18 -40
- liger_kernel/transformers/model/gemma2.py +19 -41
- liger_kernel/transformers/model/llama.py +22 -48
- liger_kernel/transformers/model/mistral.py +14 -26
- liger_kernel/transformers/model/mixtral.py +23 -53
- liger_kernel/transformers/model/mllama.py +16 -36
- liger_kernel/transformers/model/phi3.py +18 -40
- liger_kernel/transformers/model/qwen2.py +18 -40
- liger_kernel/transformers/model/qwen2_vl.py +16 -30
- liger_kernel/transformers/monkey_patch.py +43 -117
- liger_kernel/transformers/rms_norm.py +4 -4
- liger_kernel/transformers/swiglu.py +2 -8
- liger_kernel/transformers/trainer/__init__.py +1 -3
- liger_kernel/transformers/trainer/orpo_trainer.py +13 -16
- liger_kernel/triton/__init__.py +1 -3
- liger_kernel/triton/monkey_patch.py +1 -3
- {liger_kernel_nightly-0.5.2.dev20241223032630.dist-info → liger_kernel_nightly-0.5.2.dev20241223042135.dist-info}/METADATA +1 -1
- liger_kernel_nightly-0.5.2.dev20241223042135.dist-info/RECORD +66 -0
- liger_kernel_nightly-0.5.2.dev20241223032630.dist-info/RECORD +0 -66
- {liger_kernel_nightly-0.5.2.dev20241223032630.dist-info → liger_kernel_nightly-0.5.2.dev20241223042135.dist-info}/LICENSE +0 -0
- {liger_kernel_nightly-0.5.2.dev20241223032630.dist-info → liger_kernel_nightly-0.5.2.dev20241223042135.dist-info}/NOTICE +0 -0
- {liger_kernel_nightly-0.5.2.dev20241223032630.dist-info → liger_kernel_nightly-0.5.2.dev20241223042135.dist-info}/WHEEL +0 -0
- {liger_kernel_nightly-0.5.2.dev20241223032630.dist-info → liger_kernel_nightly-0.5.2.dev20241223042135.dist-info}/top_level.txt +0 -0
liger_kernel/chunked_loss/cpo_loss.py
CHANGED
@@ -1,17 +1,12 @@
 import torch
 import torch.nn.functional as F

-from liger_kernel.chunked_loss.fused_linear_preference import (
-    LigerFusedLinearPreferenceBase,
-)
+from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase


 class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
-
     @staticmethod
-    def preference_loss_fn(
-        chosen_logps, rejected_logps, full_target, beta=0.1, label_smoothing=0.0
-    ):
+    def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1, label_smoothing=0.0):
         """
         Paper: https://arxiv.org/pdf/2401.08417

@@ -35,10 +30,9 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
             label_smoothing (float): Label smoothing factor, will reduce to Equation above when label_smoothing -> 0.
         """
         logits = beta * (chosen_logps - rejected_logps)
-        loss = (
-            -F.logsigmoid(logits) * (1 - label_smoothing)
-            - F.logsigmoid(-logits) * label_smoothing
-        ).sum() / (full_target.shape[0] // 2)
+        loss = (-F.logsigmoid(logits) * (1 - label_smoothing) - F.logsigmoid(-logits) * label_smoothing).sum() / (
+            full_target.shape[0] // 2
+        )

         return loss

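The reformatting above does not change the math: the CPO loss is a label-smoothed `-logsigmoid` over the scaled log-probability margin, averaged over the number of (chosen, rejected) pairs. A minimal standalone sketch of that formula (not Liger's fused chunked path; the toy tensors and the `beta`/`label_smoothing` defaults simply mirror the diff):

```python
import torch
import torch.nn.functional as F


def cpo_preference_loss(chosen_logps, rejected_logps, full_target, beta=0.1, label_smoothing=0.0):
    # Scaled margin between chosen and rejected sequence log-probabilities.
    logits = beta * (chosen_logps - rejected_logps)
    # Label-smoothed sigmoid loss, normalized by the number of pairs (full_target.shape[0] // 2).
    loss = (-F.logsigmoid(logits) * (1 - label_smoothing) - F.logsigmoid(-logits) * label_smoothing).sum() / (
        full_target.shape[0] // 2
    )
    return loss


# Toy usage: two pairs, so full_target holds 4 sequences (2 chosen + 2 rejected).
chosen = torch.tensor([-1.0, -1.5])
rejected = torch.tensor([-2.0, -1.8])
full_target = torch.zeros(4, 8, dtype=torch.long)  # only its first dimension is used for normalization
print(cpo_preference_loss(chosen, rejected, full_target))
```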
liger_kernel/chunked_loss/dpo_loss.py
CHANGED
@@ -1,13 +1,10 @@
 import torch
 import torch.nn.functional as F

-from liger_kernel.chunked_loss.fused_linear_preference import (
-    LigerFusedLinearPreferenceBase,
-)
+from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase


 class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
-
     @staticmethod
     def preference_loss_fn(
         chosen_logps,
liger_kernel/chunked_loss/fused_linear_distillation.py
CHANGED
@@ -2,11 +2,11 @@ from abc import abstractmethod
 from functools import partial

 import torch
+
 from torch.nn import functional as F


 class LigerFusedLinearDistillationBase(torch.autograd.Function):
-
     @abstractmethod
     def distillation_loss_fn(student_logits, teacher_logits, temperature):
         """
@@ -89,25 +89,25 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             compute_ce_loss (bool): Whether to compute CE loss.
             loss_kwargs (dict): Additional arguments for the loss function.
         """
-
-
-
-
-
-
-
-
-
-
-
-
+        (
+            student_logits_chunk,
+            teacher_logits_chunk,
+            hard_loss,
+        ) = LigerFusedLinearDistillationBase.chunk_forward(
+            student_input_chunk,
+            student_weight,
+            teacher_input_chunk,
+            teacher_weight,
+            target_chunk,
+            student_bias=student_bias,
+            teacher_bias=teacher_bias,
+            ignore_index=ignore_index,
+            compute_ce_loss=compute_ce_loss,
         )

         hard_loss /= full_target.shape[0]

-        soft_loss = distillation_loss_fn(
-            student_logits_chunk, teacher_logits_chunk, temperature
-        )
+        soft_loss = distillation_loss_fn(student_logits_chunk, teacher_logits_chunk, temperature)
         soft_loss /= full_target.shape[0]

         loss = weight_hard_loss * hard_loss + weight_soft_loss * soft_loss
@@ -174,17 +174,18 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):

         def accumulate_chunk(student_input_chunk, teacher_input_chunk, target_chunk):
             if student_bias is not None:
-                (
-                    (chunk_grad_input, chunk_grad_weight, chunk_grad_bias),
+                (
+                    (chunk_grad_input, chunk_grad_weight, chunk_grad_bias),
                     (
-
-
-
-
+                        chunk_loss,
+                        (
+                            chunk_soft_loss,
+                            chunk_hard_loss,
+                            chunk_student_logits,
+                            chunk_teacher_logits,
+                        ),
                     ),
-                ) = torch.func.grad_and_value(
-                    loss_func_to_call, argnums=(0, 1, 5), has_aux=True
-                )(
+                ) = torch.func.grad_and_value(loss_func_to_call, argnums=(0, 1, 5), has_aux=True)(
                     student_input_chunk,
                     student_weight,
                     teacher_input_chunk,
@@ -195,17 +196,18 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
                 )
                 grad_bias.add_(chunk_grad_bias)
             else:
-                (
-                    (chunk_grad_input, chunk_grad_weight),
+                (
+                    (chunk_grad_input, chunk_grad_weight),
                     (
-
-
-
-
+                        chunk_loss,
+                        (
+                            chunk_soft_loss,
+                            chunk_hard_loss,
+                            chunk_student_logits,
+                            chunk_teacher_logits,
+                        ),
                     ),
-                ) = torch.func.grad_and_value(
-                    loss_func_to_call, argnums=(0, 1), has_aux=True
-                )(
+                ) = torch.func.grad_and_value(loss_func_to_call, argnums=(0, 1), has_aux=True)(
                     student_input_chunk,
                     student_weight,
                     teacher_input_chunk,
@@ -229,9 +231,7 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
         for student_input_chunk, teacher_input_chunk, target_chunk in zip(
            _student_input_chunks, _teacher_input_chunks, _target_chunks
         ):
-            grad_input = accumulate_chunk(
-                student_input_chunk, teacher_input_chunk, target_chunk
-            )
+            grad_input = accumulate_chunk(student_input_chunk, teacher_input_chunk, target_chunk)
            grad_inputs.append(grad_input)

         ctx.save_for_backward(
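The chunked distillation loss shown above normalizes both the CE (`hard_loss`) and distillation (`soft_loss`) terms by `full_target.shape[0]` and then mixes them with `weight_hard_loss` / `weight_soft_loss`. A minimal sketch of that weighting, with a KL-style `distillation_loss_fn` stub standing in for whatever the subclass supplies (the stub, the toy shapes, and the 0.5/0.5 weights are assumptions, not Liger code):

```python
import torch
import torch.nn.functional as F


def distillation_loss_fn(student_logits, teacher_logits, temperature=1.0):
    # Stand-in soft loss: temperature-scaled KL(teacher || student), summed over the chunk.
    s = F.log_softmax(student_logits / temperature, dim=-1)
    t = F.softmax(teacher_logits / temperature, dim=-1)
    return F.kl_div(s, t, reduction="sum")


def combine_losses(hard_loss, student_logits_chunk, teacher_logits_chunk, full_target,
                   weight_hard_loss=0.5, weight_soft_loss=0.5, temperature=1.0):
    hard_loss = hard_loss / full_target.shape[0]
    soft_loss = distillation_loss_fn(student_logits_chunk, teacher_logits_chunk, temperature)
    soft_loss = soft_loss / full_target.shape[0]
    return weight_hard_loss * hard_loss + weight_soft_loss * soft_loss


student_logits = torch.randn(4, 10)
teacher_logits = torch.randn(4, 10)
full_target = torch.zeros(4, 16, dtype=torch.long)
hard_loss = torch.tensor(2.0)  # pretend summed CE over this chunk
print(combine_losses(hard_loss, student_logits, teacher_logits, full_target))
```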
liger_kernel/chunked_loss/fused_linear_preference.py
CHANGED
@@ -2,11 +2,11 @@ from abc import abstractmethod
 from functools import partial

 import torch
+
 from torch.nn import functional as F


 class LigerFusedLinearPreferenceBase(torch.autograd.Function):
-
     @abstractmethod
     def preference_loss_fn(*args, **kwargs):
         """
@@ -102,9 +102,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
            Fused forward and backward pass for a chunk of input and target.
            """
            if bias is not None:
-                return torch.func.grad_and_value(
-                    compute_loss, argnums=(0, 1, 3), has_aux=True
-                )(
+                return torch.func.grad_and_value(compute_loss, argnums=(0, 1, 3), has_aux=True)(
                    input_chunk,
                    weight,
                    target_chunk,
@@ -112,43 +110,47 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                    ref_input_chunk=ref_input_chunk,
                )
            else:
-                return torch.func.grad_and_value(
-                    compute_loss, argnums=(0, 1), has_aux=True
-                )
+                return torch.func.grad_and_value(compute_loss, argnums=(0, 1), has_aux=True)(
+                    input_chunk, weight, target_chunk, ref_input_chunk=ref_input_chunk
+                )

        def accumulate_chunk(input_chunk, target_chunk, ref_input_chunk=None):
            if bias is not None:
-                (
-                    (chunk_grad_input, chunk_grad_weight, chunk_grad_bias),
+                (
+                    (chunk_grad_input, chunk_grad_weight, chunk_grad_bias),
                    (
-
-
-
-
-
-
+                        chunk_loss,
+                        (
+                            chunk_chosen_logps,
+                            chunk_rejected_logps,
+                            chunk_chosen_logits_mean,
+                            chunk_rejected_logits_mean,
+                            chunk_nll_loss,
+                            *aux_outputs,
+                        ),
                    ),
                ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk)
                grad_bias.add_(chunk_grad_bias)  # accumulate bias gradient
            else:
-                (
-                    (chunk_grad_input, chunk_grad_weight),
+                (
+                    (chunk_grad_input, chunk_grad_weight),
                    (
-
-
-
-
-
-
+                        chunk_loss,
+                        (
+                            chunk_chosen_logps,
+                            chunk_rejected_logps,
+                            chunk_chosen_logits_mean,
+                            chunk_rejected_logits_mean,
+                            chunk_nll_loss,
+                            *aux_outputs,
+                        ),
                    ),
                ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk)

            # Accumulate gradients
            grad_weight.add_(chunk_grad_weight)
            grad_chosen_inputs.append(chunk_grad_input[: chosen_target_chunk.shape[0]])
-            grad_rejected_inputs.append(
-                chunk_grad_input[chosen_target_chunk.shape[0] :]
-            )
+            grad_rejected_inputs.append(chunk_grad_input[chosen_target_chunk.shape[0] :])

            # Accumulate loss
            loss_acc.add_(chunk_loss)
@@ -165,9 +167,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
            if len(aggregated_aux_outputs) == 0:
                for aux in aux_outputs:
                    if aux.ndim == 0:
-                        aggregated_aux_outputs.append(
-                            torch.zeros((), device=aux.device)
-                        )
+                        aggregated_aux_outputs.append(torch.zeros((), device=aux.device))
                    else:
                        aggregated_aux_outputs.append([])

@@ -189,12 +189,8 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
        _rejected_target_chunks = torch.chunk(target[len_chosen:], chunks=chunks, dim=0)

        if use_ref_model:
-            _ref_chosen_input_chunks = torch.chunk(
-                ref_input[:len_chosen], chunks=chunks, dim=0
-            )
-            _ref_rejected_input_chunks = torch.chunk(
-                ref_input[len_chosen:], chunks=chunks, dim=0
-            )
+            _ref_chosen_input_chunks = torch.chunk(ref_input[:len_chosen], chunks=chunks, dim=0)
+            _ref_rejected_input_chunks = torch.chunk(ref_input[len_chosen:], chunks=chunks, dim=0)

        for (
            chosen_input_chunk,
@@ -208,26 +204,15 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
            _rejected_input_chunks,
            _chosen_target_chunks,
            _rejected_target_chunks,
-            (
-                _ref_chosen_input_chunks
-                if use_ref_model
-                else [None] * len(_chosen_input_chunks)
-            ),
-            (
-                _ref_rejected_input_chunks
-                if use_ref_model
-                else [None] * len(_rejected_input_chunks)
-            ),
+            (_ref_chosen_input_chunks if use_ref_model else [None] * len(_chosen_input_chunks)),
+            (_ref_rejected_input_chunks if use_ref_model else [None] * len(_rejected_input_chunks)),
+            strict=False,
        ):
            input_chunk = torch.cat([chosen_input_chunk, rejected_input_chunk], dim=0)
            ref_input_chunk = (
-                torch.cat([ref_chosen_input_chunk, ref_rejected_input_chunk], dim=0)
-                if use_ref_model
-                else None
-            )
-            target_chunk = torch.cat(
-                [chosen_target_chunk, rejected_target_chunk], dim=0
+                torch.cat([ref_chosen_input_chunk, ref_rejected_input_chunk], dim=0) if use_ref_model else None
            )
+            target_chunk = torch.cat([chosen_target_chunk, rejected_target_chunk], dim=0)

            # mark input_chunk, target_chunk, and target dimension 1 as dynamic to prevent torch.compile recompilation
            torch._dynamo.mark_dynamic(input_chunk, 1)
@@ -265,9 +250,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
    @staticmethod
    def backward(ctx, *grad_output):
        grad_input, grad_weight, grad_bias = ctx.saved_tensors
-        if torch.ne(
-            grad_output[0][0], torch.tensor(1.0, device=grad_output[0][0].device)
-        ):
+        if torch.ne(grad_output[0][0], torch.tensor(1.0, device=grad_output[0][0].device)):
            grad_input = grad_input * grad_output[0][0]
            grad_weight = grad_weight * grad_output[0][0]
            grad_bias = grad_bias * grad_output[0][0] if grad_bias is not None else None
@@ -301,9 +284,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
        loss_mask = target_chunk != ignore_index
        label_chunk = torch.where(loss_mask, target_chunk, 0)

-        per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
-            -1
-        )
+        per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(-1)
        average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)

        chosen_logps = average_log_prob[:len_chosen_chunk]
@@ -370,13 +351,8 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
            ignore_index=ignore_index,
            compute_nll_loss=compute_nll_loss,
        )
-        chosen_nll_loss = (
-            chosen_nll_loss
-            / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
-        )
-        chosen_logits_mean = chosen_logits.sum() / (
-            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
-        )
+        chosen_nll_loss = chosen_nll_loss / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
+        chosen_logits_mean = chosen_logits.sum() / (full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0])
        rejected_logits_mean = rejected_logits.sum() / (
            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
        )
liger_kernel/chunked_loss/orpo_loss.py
CHANGED
@@ -1,13 +1,10 @@
 import torch
 import torch.nn.functional as F

-from liger_kernel.chunked_loss.fused_linear_preference import (
-    LigerFusedLinearPreferenceBase,
-)
+from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase


 class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
-
     @staticmethod
     def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1):
         """
@@ -32,8 +29,7 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
             beta (float): Weight for the odds ratio loss.
         """
         log_odds = (chosen_logps - rejected_logps) - (
-            torch.log1p(-torch.exp(chosen_logps))
-            - torch.log1p(-torch.exp(rejected_logps))
+            torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps))
         )
         ratio = F.logsigmoid(log_odds)
         loss = -beta * ratio.sum() / (full_target.shape[0] // 2)
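The joined expression above is the ORPO log-odds ratio: `log(p/(1-p))` for the chosen sequence minus the same quantity for the rejected one, with `p = exp(logps)`. A standalone sketch of the same formula (it expects average log-probabilities strictly below zero; the toy values are made up):

```python
import torch
import torch.nn.functional as F


def orpo_odds_ratio_loss(chosen_logps, rejected_logps, full_target, beta=0.1):
    # log-odds of chosen vs. rejected: (log p_c - log p_r) - (log(1-p_c) - log(1-p_r))
    log_odds = (chosen_logps - rejected_logps) - (
        torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps))
    )
    ratio = F.logsigmoid(log_odds)
    return -beta * ratio.sum() / (full_target.shape[0] // 2)


chosen = torch.tensor([-0.9, -1.2])
rejected = torch.tensor([-1.5, -1.4])
full_target = torch.zeros(4, 10, dtype=torch.long)
print(orpo_odds_ratio_loss(chosen, rejected, full_target))
```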
liger_kernel/chunked_loss/simpo_loss.py
CHANGED
@@ -1,13 +1,10 @@
 import torch
 import torch.nn.functional as F

-from liger_kernel.chunked_loss.fused_linear_preference import (
-    LigerFusedLinearPreferenceBase,
-)
+from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase


 class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
-
     @staticmethod
     def preference_loss_fn(
         chosen_logps,
@@ -41,10 +38,9 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
             label_smoothing (float): Label smoothing factor, will reduce to Equation above when label_smoothing -> 0.
         """
         logits = beta * (chosen_logps - rejected_logps) - gamma
-        loss = (
-            -F.logsigmoid(logits) * (1 - label_smoothing)
-            - F.logsigmoid(-logits) * label_smoothing
-        ).sum() / (full_target.shape[0] // 2)
+        loss = (-F.logsigmoid(logits) * (1 - label_smoothing) - F.logsigmoid(-logits) * label_smoothing).sum() / (
+            full_target.shape[0] // 2
+        )

         return loss

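SimPO reuses the same label-smoothed sigmoid loss as the CPO sketch earlier; the only difference visible in this hunk is the reward margin `gamma` subtracted from the scaled log-prob difference. A tiny sketch of that margin (the `gamma=0.5` default here is an example value, not taken from the diff):

```python
import torch


def simpo_logits(chosen_logps, rejected_logps, beta=0.1, gamma=0.5):
    # Same margin as CPO, shifted by the reward margin gamma before the sigmoid loss.
    return beta * (chosen_logps - rejected_logps) - gamma


print(simpo_logits(torch.tensor([-1.0]), torch.tensor([-2.0])))  # tensor([-0.4000])
```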
liger_kernel/env_report.py
CHANGED
@@ -1,5 +1,6 @@
 import platform
 import sys
+
 from importlib.metadata import version


@@ -27,15 +28,9 @@ def print_env_report():
         import torch

         print(f"PyTorch version: {torch.__version__}")
-        cuda_version = (
-            torch.version.cuda if torch.cuda.is_available() else "Not available"
-        )
+        cuda_version = torch.version.cuda if torch.cuda.is_available() else "Not available"
         print(f"CUDA version: {cuda_version}")
-        hip_version = (
-            torch.version.hip
-            if torch.cuda.is_available() and torch.version.hip
-            else "Not available"
-        )
+        hip_version = torch.version.hip if torch.cuda.is_available() and torch.version.hip else "Not available"
         print(f"HIP(ROCm) version: {hip_version}")

     except ImportError:
@@ -58,9 +53,7 @@ def print_env_report():
         print("Transformers: Not installed")

     try:
-        xpu_version = (
-            torch.version.xpu if torch.xpu.is_available() else "XPU Not Available"
-        )
+        xpu_version = torch.version.xpu if torch.xpu.is_available() else "XPU Not Available"
         print(f"XPU version: {xpu_version}")
     except ImportError:
         print("XPU version: Unable to query")
liger_kernel/ops/cross_entropy.py
CHANGED
@@ -1,11 +1,14 @@
 import operator
+
 from typing import Optional

 import torch
 import triton
 import triton.language as tl

-from liger_kernel.ops.utils import compare_version
+from liger_kernel.ops.utils import compare_version
+from liger_kernel.ops.utils import element_mul_kernel
+from liger_kernel.ops.utils import is_hip

 if compare_version("triton", operator.ge, "3.0.0"):
     try:
@@ -92,9 +95,7 @@ def liger_cross_entropy_kernel(
     # 3. [Online softmax] first pass: find max + sum
     m = float("-inf")  # m is the max value. use the notation from the paper
     d = 0.0  # d is the sum. use the notation from the paper
-    ori_X_y = tl.load(X_ptr + y).cast(
-        tl.float32
-    )  # we need to store the original value of X_y for the loss calculation
+    ori_X_y = tl.load(X_ptr + y).cast(tl.float32)  # we need to store the original value of X_y for the loss calculation
     if HAS_SOFTCAPPING:
         ori_X_y = softcap * tanh(ori_X_y / softcap)

@@ -232,14 +233,10 @@ def cross_entropy_forward(
     return_z_loss,
 ):
     if not isinstance(return_z_loss, int):
-        assert (
-            return_z_loss in _bool_to_return_z_loss
-        ), f"return_z_loss must be True or False. Got: {return_z_loss}"
+        assert return_z_loss in _bool_to_return_z_loss, f"return_z_loss must be True or False. Got: {return_z_loss}"
         return_z_loss = _bool_to_return_z_loss[return_z_loss]
     else:
-        assert (
-            return_z_loss in _bool_to_return_z_loss
-        ), f"return_z_loss must be True or False. Got: {return_z_loss}"
+        assert return_z_loss in _bool_to_return_z_loss, f"return_z_loss must be True or False. Got: {return_z_loss}"

     BT, V = _input.shape
     n_rows = BT
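The kernel comments above (`m is the max value`, `d is the sum`) refer to the online-softmax recurrence: a single pass over the logits keeps a running maximum and a rescaled running sum of exponentials. A plain-Python sketch of that textbook recurrence, not the Triton kernel itself:

```python
import math
import torch


def online_softmax_denominator(x):
    # After one pass, d equals sum(exp(x_i - max(x))) and m equals max(x).
    m = float("-inf")  # running max
    d = 0.0            # running sum of exp(x_i - m)
    for v in x.tolist():
        m_new = max(m, v)
        d = d * math.exp(m - m_new) + math.exp(v - m_new)
        m = m_new
    return m, d


x = torch.randn(16)
m, d = online_softmax_denominator(x)
print(torch.allclose(torch.tensor(d), torch.exp(x - x.max()).sum(), atol=1e-5))
```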
liger_kernel/ops/experimental/embedding.py
CHANGED
@@ -34,9 +34,7 @@ def embedding_forward_kernel(
     )

     output_offsets = offsets_m[:, None] * embedding_dim + offsets_n[None, :]
-    tl.store(
-        output_ptr + output_offsets, embeddings, mask=mask_m[:, None] & mask_n[None, :]
-    )
+    tl.store(output_ptr + output_offsets, embeddings, mask=mask_m[:, None] & mask_n[None, :])


 @triton.jit
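The `output_offsets` expression above builds a 2D grid of flat indices into a row-major output buffer by broadcasting row offsets against column offsets. The same arithmetic in plain PyTorch, with tiny made-up block sizes:

```python
import torch

embedding_dim = 8
offsets_m = torch.arange(0, 4)  # rows handled by this program instance
offsets_n = torch.arange(0, 8)  # columns within the embedding dimension

# Flat offsets into a row-major (num_rows, embedding_dim) buffer.
output_offsets = offsets_m[:, None] * embedding_dim + offsets_n[None, :]
print(output_offsets.shape)   # torch.Size([4, 8])
print(output_offsets[1, 3])   # row 1, col 3 -> 1 * 8 + 3 = 11
```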
liger_kernel/ops/experimental/mm_int8int2.py
CHANGED
@@ -37,9 +37,7 @@ def pack_weights(intweights: torch.Tensor, bits: int = 2) -> torch.Tensor:
     else:
         packed_tensor_shape = (row_dim, *original_shape[1:])

-    packed = torch.zeros(
-        packed_tensor_shape, device=intweights.device, dtype=torch.uint8
-    )
+    packed = torch.zeros(packed_tensor_shape, device=intweights.device, dtype=torch.uint8)
     unpacked = intweights.to(torch.uint8)

     def lshift(t: torch.Tensor, bits: int):
@@ -327,17 +325,13 @@ def matmul_kernel(


 def matmul(a, b):
-    assert (
-        a.shape[1] == b.shape[0] * 4
-    ), "Incompatible dimensions, the weight matrix need to be packed"
+    assert a.shape[1] == b.shape[0] * 4, "Incompatible dimensions, the weight matrix need to be packed"
     assert a.is_contiguous(), "Matrix A must be contiguous"
     M, K = a.shape
     _, N = b.shape
     # c is in int32 to avoid any overflows or underflows
     c = torch.empty((M, N), device=a.device, dtype=torch.int32)
-    grid = lambda META: (
-        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
-    )
+    grid = lambda META: (triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),)
     matmul_kernel[grid](
         a,
         b,
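`matmul` asserts `a.shape[1] == b.shape[0] * 4` because the int2 weights are packed four values per uint8 along the K dimension, and the flattened launch grid covers all (M, N) tiles in one dimension. A small sanity sketch of both calculations (the matrix and block sizes are arbitrary example values):

```python
import math

M, K, N = 64, 256, 128       # activation (M, K) against an unpacked weight of shape (K, N)
packed_K = K // 4            # four 2-bit values stored per uint8 row of the packed weight
assert K == packed_K * 4     # mirrors: a.shape[1] == b.shape[0] * 4

BLOCK_SIZE_M, BLOCK_SIZE_N = 32, 32  # example tile sizes
grid_size = math.ceil(M / BLOCK_SIZE_M) * math.ceil(N / BLOCK_SIZE_N)
print(packed_K, grid_size)   # 64 8 -> one flat launch dimension, as in the grid lambda above
```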
liger_kernel/ops/fused_linear_cross_entropy.py
CHANGED
@@ -2,12 +2,10 @@ import torch
 import triton

 from liger_kernel.ops.cross_entropy import liger_cross_entropy_kernel
-from liger_kernel.ops.utils import (
-    amp_custom_bwd,
-    amp_custom_fwd,
-    element_mul_kernel,
-    is_hip,
-)
+from liger_kernel.ops.utils import amp_custom_bwd
+from liger_kernel.ops.utils import amp_custom_fwd
+from liger_kernel.ops.utils import element_mul_kernel
+from liger_kernel.ops.utils import is_hip

 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
 # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
@@ -40,14 +38,10 @@ def fused_linear_cross_entropy_forward(
     BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))

     inc_factor = triton.cdiv(V, H)  # (V + H - 1) // H
-    chunk_size = triton.next_power_of_2(
-        triton.cdiv(BT, inc_factor)
-    )  # (BT + inc_factor - 1) // inc_factor
+    chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor))  # (BT + inc_factor - 1) // inc_factor
     num_chunks = triton.cdiv(BT, chunk_size)  # (BT + chunk_size - 1) // chunk_size

-    grad_weight = (
-        torch.zeros_like(weight, device=device) if weight.requires_grad else None
-    )
+    grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
     grad_input = torch.zeros_like(_input, device=device)
     grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
     # we use fp32 for loss accumulator
@@ -137,9 +131,7 @@ def fused_linear_cross_entropy_forward(
     return loss, grad_input, grad_weight, grad_bias


-def fused_linear_cross_entropy_backward(
-    grad_output, grad_input, grad_weight, grad_bias
-):
+def fused_linear_cross_entropy_backward(grad_output, grad_input, grad_weight, grad_bias):
     # If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time
     if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)):
         # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
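The chunking math above trades memory for kernel launches: the larger the vocabulary V relative to the hidden size H, the fewer of the BT rows are materialized as logits at once. A worked example of the same arithmetic with assumed, Llama-like shapes chosen only for illustration:

```python
import math


def next_power_of_2(n):
    return 1 << (n - 1).bit_length()


BT, H, V = 4096, 4096, 128256                              # rows (batch*seq), hidden size, vocab size
inc_factor = math.ceil(V / H)                              # 32 -> logits are ~32x larger than the input chunk
chunk_size = next_power_of_2(math.ceil(BT / inc_factor))   # 128
num_chunks = math.ceil(BT / chunk_size)                    # 32
print(inc_factor, chunk_size, num_chunks)
```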
liger_kernel/ops/fused_linear_jsd.py
CHANGED
@@ -4,12 +4,10 @@ import torch
 import triton

 from liger_kernel.ops.jsd import _jsd_kernel
-from liger_kernel.ops.utils import (
-    amp_custom_bwd,
-    amp_custom_fwd,
-    element_mul_kernel,
-    is_hip,
-)
+from liger_kernel.ops.utils import amp_custom_bwd
+from liger_kernel.ops.utils import amp_custom_fwd
+from liger_kernel.ops.utils import element_mul_kernel
+from liger_kernel.ops.utils import is_hip

 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
 # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
@@ -43,16 +41,10 @@ def fused_linear_jsd_forward(
     BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))

     inc_factor = triton.cdiv(V, H)  # (V + H - 1) // H
-    chunk_size = triton.next_power_of_2(
-        triton.cdiv(BT, inc_factor)
-    )  # (BT + inc_factor - 1) // inc_factor
+    chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor))  # (BT + inc_factor - 1) // inc_factor
     num_chunks = triton.cdiv(BT, chunk_size)  # (BT + chunk_size - 1) // chunk_size

-    grad_weight = (
-        torch.zeros_like(student_weight, device=device)
-        if student_weight.requires_grad
-        else None
-    )
+    grad_weight = torch.zeros_like(student_weight, device=device) if student_weight.requires_grad else None
     grad_input = torch.zeros_like(student_input)
     # we use fp32 for loss accumulator
     loss_1d = torch.zeros((BT, V), dtype=torch.float32, device=device)
@@ -73,12 +65,8 @@ def fused_linear_jsd_forward(
         # shape: chunk_size x V
         # For anything starting from logits to the final JSD loss, we do computation
         # in FP32 to avoid losing numerical stability.
-        student_logits_chunk = (student_input_chunk @ student_weight.t()).to(
-            torch.float32
-        )
-        teacher_logits_chunk = (teacher_input_chunk @ teacher_weight.t()).to(
-            torch.float32
-        )
+        student_logits_chunk = (student_input_chunk @ student_weight.t()).to(torch.float32)
+        teacher_logits_chunk = (teacher_input_chunk @ teacher_weight.t()).to(torch.float32)
         chunk_n_rows = student_logits_chunk.shape[0]

         # unreduced loss
@@ -104,9 +92,7 @@ def fused_linear_jsd_forward(
             dX_ptr=student_prob_chunk,
             dX_stride=student_prob_chunk.stride(-2),
             label_ptr=(
-                shift_labels[start_idx:end_idx]
-                if has_label
-                else torch.empty(1, device=device)
+                shift_labels[start_idx:end_idx] if has_label else torch.empty(1, device=device)
             ),  # dummy ptr if no label
             beta=jsd_beta,
             n_non_ignore=n_non_ignore,
@@ -121,9 +107,7 @@ def fused_linear_jsd_forward(
         student_logits_chunk = (
             student_prob_chunk
             - torch.softmax(student_logits_chunk, dim=-1)
-            * student_prob_chunk.sum(dim=-1, keepdim=True).broadcast_to(
-                student_prob_chunk.shape
-            )
+            * student_prob_chunk.sum(dim=-1, keepdim=True).broadcast_to(student_prob_chunk.shape)
         ) / temperature
         # now we traverse back to grad w.r.t. input to `lm_head` and grad
         # w.r.t. `lm_head` which should be computed in original dtype
@@ -239,7 +223,5 @@ class LigerFusedLinearJSDFunction(torch.autograd.Function):
     @amp_custom_bwd
     def backward(ctx, grad_output):
         (grad_input, grad_weight) = ctx.saved_tensors
-        grad_input, grad_weight = fused_linear_jsd_backward(
-            grad_output, grad_input, grad_weight
-        )
+        grad_input, grad_weight = fused_linear_jsd_backward(grad_output, grad_input, grad_weight)
         return (grad_input, grad_weight, None, None, None, None, None, None)
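As the comments in the JSD forward note, each chunk's logits are computed in the model dtype and immediately upcast to FP32 before any softmax/log arithmetic. A minimal sketch of that per-chunk pattern (the shapes, dtype, and temperature value are illustrative, not taken from the kernel):

```python
import torch

student_input = torch.randn(8, 16, dtype=torch.bfloat16)    # one (chunk_size, H) input chunk
student_weight = torch.randn(32, 16, dtype=torch.bfloat16)  # (V, H) lm_head weight

# Matmul in the original dtype, then upcast so the subsequent softmax/log ops stay numerically stable.
student_logits_chunk = (student_input @ student_weight.t()).to(torch.float32)
student_log_probs = torch.log_softmax(student_logits_chunk / 1.0, dim=-1)  # temperature = 1.0
print(student_logits_chunk.dtype, student_log_probs.shape)
```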