liger-kernel 0.4.2__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. liger_kernel/__init__.py +0 -0
  2. liger_kernel/chunked_loss/__init__.py +4 -0
  3. liger_kernel/chunked_loss/cpo_loss.py +107 -0
  4. liger_kernel/chunked_loss/dpo_loss.py +95 -17
  5. liger_kernel/chunked_loss/functional.py +9 -0
  6. liger_kernel/chunked_loss/fused_linear_distillation.py +252 -0
  7. liger_kernel/chunked_loss/fused_linear_preference.py +245 -65
  8. liger_kernel/chunked_loss/orpo_loss.py +63 -13
  9. liger_kernel/chunked_loss/simpo_loss.py +115 -0
  10. liger_kernel/env_report.py +22 -0
  11. liger_kernel/ops/cross_entropy.py +17 -10
  12. liger_kernel/ops/fused_linear_cross_entropy.py +0 -11
  13. liger_kernel/ops/fused_linear_jsd.py +1 -1
  14. liger_kernel/ops/jsd.py +19 -10
  15. liger_kernel/ops/layer_norm.py +6 -1
  16. liger_kernel/ops/qwen2vl_mrope.py +238 -0
  17. liger_kernel/ops/rms_norm.py +6 -1
  18. liger_kernel/ops/utils.py +5 -2
  19. liger_kernel/transformers/functional.py +128 -11
  20. liger_kernel/transformers/fused_linear_jsd.py +1 -4
  21. liger_kernel/transformers/jsd.py +1 -4
  22. liger_kernel/transformers/monkey_patch.py +6 -4
  23. liger_kernel/transformers/qwen2vl_mrope.py +20 -0
  24. liger_kernel/transformers/trainer/__init__.py +6 -0
  25. liger_kernel/transformers/trainer/orpo_trainer.py +169 -0
  26. liger_kernel/utils.py +13 -0
  27. {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/METADATA +71 -47
  28. {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/RECORD +32 -22
  29. {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/WHEEL +1 -1
  30. {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/LICENSE +0 -0
  31. {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/NOTICE +0 -0
  32. {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/top_level.txt +0 -0
liger_kernel/ops/cross_entropy.py CHANGED
@@ -92,8 +92,8 @@ def liger_cross_entropy_kernel(
     # 3. [Online softmax] first pass: find max + sum
     m = float("-inf")  # m is the max value. use the notation from the paper
     d = 0.0  # d is the sum. use the notation from the paper
-    ori_X_y = tl.load(
-        X_ptr + y
+    ori_X_y = tl.load(X_ptr + y).cast(
+        tl.float32
     )  # we need to store the original value of X_y for the loss calculation
     if HAS_SOFTCAPPING:
         ori_X_y = softcap * tanh(ori_X_y / softcap)
@@ -106,8 +106,11 @@ def liger_cross_entropy_kernel(
     for i in range(0, n_cols, BLOCK_SIZE):
         X_offsets = i + tl.arange(0, BLOCK_SIZE)
         X_block = tl.load(
-            X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf")
-        )
+            X_ptr + X_offsets,
+            mask=X_offsets < n_cols,
+            other=float("-inf"),
+            # Ensure float32 precision for softmax calculation
+        ).cast(tl.float32)
         if HAS_SOFTCAPPING:
             X_block = softcap * tanh(X_block / softcap)
         block_max = tl.max(X_block)
@@ -141,8 +144,11 @@ def liger_cross_entropy_kernel(
     for i in range(0, n_cols, BLOCK_SIZE):
         X_offsets = i + tl.arange(0, BLOCK_SIZE)
         X_block = tl.load(
-            X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf")
-        )
+            X_ptr + X_offsets,
+            mask=X_offsets < n_cols,
+            other=float("-inf"),
+            # Ensure float32 precision for softmax calculation
+        ).cast(tl.float32)
         if HAS_SOFTCAPPING:
             intermediate = tanh(X_block / softcap)
             X_block = softcap * intermediate
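Note: an editor's illustration, not part of the diff, of why the kernel upcasts each block to float32 before the online-softmax accumulation; the script and its numbers are representative only.

import torch

torch.manual_seed(0)
logits = torch.randn(32000) * 10  # one vocab-sized row of logits
p_ref = torch.softmax(logits.double(), dim=0)

err_fp32 = (torch.softmax(logits.float(), dim=0) - p_ref).abs().max().item()
err_bf16 = (torch.softmax(logits.bfloat16(), dim=0).double() - p_ref).abs().max().item()
print(err_fp32, err_bf16)  # the bf16 softmax error is orders of magnitude larger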
@@ -279,11 +285,12 @@ def cross_entropy_forward(
         num_warps=32 if not is_hip() else 16,
     )

-    loss = torch.sum(loss_1d)
-    if return_z_loss == _TRUE.value:
-        z_loss = torch.sum(z_loss_1d)
+    if reduction == "none":
+        loss = loss_1d
+        z_loss = z_loss_1d if return_z_loss == _TRUE.value else None
     else:
-        z_loss = None
+        loss = torch.sum(loss_1d)
+        z_loss = torch.sum(z_loss_1d) if return_z_loss == _TRUE.value else None

     return loss, z_loss, _input

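Note: a hedged usage sketch, not part of the diff, of the new reduction="none" path; it assumes the module wrapper LigerCrossEntropyLoss in liger_kernel.transformers forwards its reduction argument down to this kernel, and the sizes are illustrative.

import torch
from liger_kernel.transformers import LigerCrossEntropyLoss

ce = LigerCrossEntropyLoss(reduction="none")  # per-token losses instead of a summed scalar
logits = torch.randn(8, 32000, device="cuda", requires_grad=True)
targets = torch.randint(0, 32000, (8,), device="cuda")
per_token_loss = ce(logits, targets)  # shape (8,), one entry per token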
liger_kernel/ops/fused_linear_cross_entropy.py CHANGED
@@ -26,7 +26,6 @@ def fused_linear_cross_entropy_forward(
     reduction="mean",
     softcap=None,
 ):
-    dtype = _input.dtype
     device = _input.device

     # inputs have shape: BT x H
@@ -74,9 +73,6 @@ def fused_linear_cross_entropy_forward(
         loss_1d_slice = loss_1d[start_idx:end_idx]  # chunk_size,
         n_non_ignore = (target_chunk != ignore_index).sum().item()

-        # when doing CE, use the upcasted precision
-        logits_chunk = logits_chunk.float()
-
         # ensure _input and target are contiguous
         logits_chunk = logits_chunk.contiguous()
         target_chunk = target_chunk.contiguous()
@@ -103,13 +99,6 @@ def fused_linear_cross_entropy_forward(
             num_warps=32 if not is_hip() else 16,
         )

-        # gradient of logits_chunk is computed in-place by the above triton kernel.
-        # Following HuggingFace model source code, we do the forward and backward
-        # w.r.t. logits in fp32 for numerical stability especially as the num classes (vocab size) is huge.
-        # (reference: https://github.com/huggingface/transformers/blob/v4.42.4/src/transformers/models/llama/modeling_llama.py#L1194)
-        # Propagating to lm_head's backward, we'll switch back to the original dtype.
-        logits_chunk = logits_chunk.to(dtype)
-
         # gradient of logits_chunk is computed in-place by the above triton kernel and is of shape: chunk_size x V
         # thus grad_input[start_idx: end_idx] should be of shape: chunk_size x H
         # additionally, since we are chunking the inputs, observe that the loss and gradients are calculated only
liger_kernel/ops/fused_linear_jsd.py CHANGED
@@ -202,7 +202,7 @@ class LigerFusedLinearJSDFunction(torch.autograd.Function):
             teacher_input (torch.tensor): input of the last projection layer in teacher model, with shape (B*T, H), where B is batch size, T is sequence length, H is hidden dimension.
             teacher_weight (torch.tensor): the last projection layer in teacher model, with shape (V, H), where V is vocab size
             shift_labels (Optional[torch.LongTensor]): indicator of next predicted vocab with shape (BT) where each value is in [0, V-1].
-            jsd_beta (float): coefficient beta of generalized JSD in the open interval (0, 1). Default: `0.5`
+            jsd_beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
             ignore_index (int): the index to ignore. Default: -100
             temperature (float): temperature in softmax function to control the output probability distribution. Default: `1.0`

liger_kernel/ops/jsd.py CHANGED
@@ -18,7 +18,7 @@ def _jsd_kernel(
     dX_ptr,
     dX_stride,
     label_ptr,
-    beta,
+    beta: tl.constexpr,
     n_non_ignore: int,
     ignore_index: tl.constexpr,
     n_cols,
@@ -50,17 +50,26 @@ def _jsd_kernel(
     X = tl.load(X_ptr + offsets, mask=mask, other=float("-inf")).to(tl.float32)
     Y = tl.load(Y_ptr + offsets, mask=mask, other=float("-inf")).to(tl.float32)

-    Q = tl.exp(X)
-    P = tl.exp(Y)
-    M = beta * P + (1 - beta) * Q
-    log_M = tl.log(M)
+    if beta == 0.0:  # forward KL
+        Y_prob = tl.exp(Y)
+        loss = Y_prob * (Y - X)
+        dX = -Y_prob
+    elif beta == 1.0:
+        X_prob = tl.exp(X)
+        loss = X_prob * (X - Y)
+        dX = loss + X_prob
+    else:
+        Q = tl.exp(X)
+        P = tl.exp(Y)
+        M = beta * P + (1 - beta) * Q
+        log_M = tl.log(M)
+
+        loss = beta * P * Y + (1 - beta) * Q * X - M * log_M
+        dX = (1 - beta) * Q * (X - log_M)

-    loss = beta * P * Y + (1 - beta) * Q * X - M * log_M
-    # reduction == "batchmean"
     loss = loss / n_non_ignore
+    dX = dX / n_non_ignore
     tl.store(loss_ptr + offsets, loss, mask=mask)
-
-    dX = (1 - beta) * Q * (X - log_M) / n_non_ignore
     tl.store(dX_ptr + offsets, dX, mask=mask)

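Note: a PyTorch reference sketch, not part of the diff, of the generalized JSD the kernel computes per row, including the new beta == 0 and beta == 1 special cases; X and Y are log-probabilities as in the kernel, and the function name is illustrative.

import torch

def generalized_jsd_reference(X, Y, beta):
    # X: student log-probs (predict), Y: teacher log-probs (target), shape (BT, V)
    Q, P = X.exp(), Y.exp()
    if beta == 0.0:  # forward KL(P || Q)
        return (P * (Y - X)).sum(-1)
    if beta == 1.0:  # reverse KL(Q || P)
        return (Q * (X - Y)).sum(-1)
    M = beta * P + (1 - beta) * Q
    return (beta * P * Y + (1 - beta) * Q * X - M * M.log()).sum(-1)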
@@ -142,7 +151,7 @@ class LigerJSDFunction(torch.autograd.Function):
             _input (torch.Tensor): predict values with shape (BT, V) in logspace
             target (torch.Tensor): ground truth values with shape (BT, V) in logspace
             shift_labels (Optional[torch.LongTensor]): indicator of next predicted vocab with shape (BT) where each value is in [0, V-1].
-            beta (float): coefficient beta of generalized JSD in the open interval (0, 1)
+            beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
             ignore_index (int): the index to ignore. Default: -100

         Returns:
liger_kernel/ops/layer_norm.py CHANGED
@@ -180,8 +180,13 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
     dY = dY.view(-1, dim)
     n_rows, n_cols = dY.shape

+    sm_count = 1
+    if X.device.type == "cuda":
+        sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
+    elif X.device.type == "xpu":
+        sm_count = torch.xpu.get_device_properties(X.device).gpu_subslice_count
+
     DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
-    sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
     _DW = torch.empty((sm_count, n_cols), dtype=W.dtype, device=W.device)
     _DB = torch.empty((sm_count, n_cols), dtype=W.dtype, device=W.device)

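Note: the same device-aware lookup is repeated in rms_norm.py further below; factored out, it would look like this sketch (the helper name is illustrative, not something the package defines).

import torch

def get_sm_count(device: torch.device) -> int:
    # Number of parallel compute units used to size the partial dW/dB buffers.
    if device.type == "cuda":
        return torch.cuda.get_device_properties(device).multi_processor_count
    if device.type == "xpu":
        return torch.xpu.get_device_properties(device).gpu_subslice_count
    return 1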
liger_kernel/ops/qwen2vl_mrope.py ADDED
@@ -0,0 +1,238 @@
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _triton_qwen2vl_mrope(
+    q_ptr,
+    k_ptr,
+    cos,
+    sin,
+    sl,
+    n_qh: tl.constexpr,
+    n_kh: tl.constexpr,
+    hd: tl.constexpr,
+    pad_n_qh: tl.constexpr,
+    pad_n_kh: tl.constexpr,
+    pad_hd: tl.constexpr,
+    mrope_section_t: tl.constexpr,
+    mrope_section_h: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    BACKWARD_PASS: tl.constexpr = False,
+):
+    pid = tl.program_id(0)
+
+    # locate start address
+    q_ptr = q_ptr + pid * (n_qh * hd)
+    k_ptr = k_ptr + pid * (n_kh * hd)
+
+    # ####################################################################
+    # get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for token position
+    # m of this program instance
+    # ####################################################################
+
+    # 1. program instances are laid out in a 1D vector of size bsz * seq_len, which
+    # effectively represents a 2D grid of size [bsz, seq_len] with seq_len dimension
+    # being the fastest changing dimension. Thus we can simply do pid // sl to get the batch index
+    # and pid % sl to get the sequence index.
+    # 2. We only need the left half of cos and sin matrix because the right half is just
+    # a clone of the left half.
+    t_end = mrope_section_t
+    h_end = t_end + mrope_section_h
+
+    cos_row_idx = pid % sl
+    t_cos = cos + cos_row_idx * hd
+    h_cos = t_cos + sl * hd
+    w_cos = h_cos + sl * hd
+    t_sin = sin + cos_row_idx * hd
+    h_sin = t_sin + sl * hd
+    w_sin = h_sin + sl * hd
+
+    cos_offsets = tl.arange(0, pad_hd // 2)
+    t_mask = cos_offsets < t_end
+    h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end)
+    w_mask = (h_end <= cos_offsets) & (cos_offsets < hd // 2)
+    t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0)
+    h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0)
+    w_cos_row = tl.load(w_cos + cos_offsets, mask=w_mask, other=0)
+    t_sin_row = tl.load(t_sin + cos_offsets, mask=t_mask, other=0)
+    h_sin_row = tl.load(h_sin + cos_offsets, mask=h_mask, other=0)
+    w_sin_row = tl.load(w_sin + cos_offsets, mask=w_mask, other=0)
+    cos_row = t_cos_row + h_cos_row + w_cos_row
+    sin_row = t_sin_row + h_sin_row + w_sin_row
+
+    # ####################################################################
+    # Load the left and right half of q and k for the current
+    # program instance (i.e. for the current token) separately
+    # ####################################################################
+    # left half of the head
+    first_half_q_offsets = (
+        tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    )
+    first_half_k_offsets = (
+        tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    )
+    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
+        tl.arange(0, pad_hd // 2)[None, :] < hd // 2
+    )
+    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
+        tl.arange(0, pad_hd // 2)[None, :] < hd // 2
+    )
+    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
+        sin_row.dtype
+    )
+    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
+        sin_row.dtype
+    )
+
+    # right half of the head
+    second_half_q_offsets = first_half_q_offsets + (hd // 2)
+    second_half_k_offsets = first_half_k_offsets + (hd // 2)
+    second_q_mask = first_q_mask
+    second_k_mask = first_k_mask
+    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
+        sin_row.dtype
+    )
+    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
+        sin_row.dtype
+    )
+
+    if not BACKWARD_PASS:
+        # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
+        new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row
+        tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
+        new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row
+        tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)
+
+        new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row
+        tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
+        new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
+        tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
+    else:
+        # with some math, we can get:
+        # dy = [dx1, dx2] * [cos, cos] + [-dx2, dx1] * [-sin, -sin]
+        new_q_tile_1 = q_tile_1 * cos_row + q_tile_2 * sin_row
+        tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
+        new_q_tile_2 = q_tile_2 * cos_row - q_tile_1 * sin_row
+        tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)
+
+        new_k_tile_1 = k_tile_1 * cos_row + k_tile_2 * sin_row
+        tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
+        new_k_tile_2 = k_tile_2 * cos_row - k_tile_1 * sin_row
+        tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
+
+
+def qwen2vl_mrope_forward(q, k, cos, sin, mrope_section):
+
+    # transpose it back to the physical shape because Triton looks at the physical storage
+    # note: q and k are incontiguous before the transformation and will become contiguous after transpose
+    q = q.transpose(1, 2)
+    k = k.transpose(1, 2)
+
+    batch_size, seq_len, n_q_head, head_dim = q.shape
+    n_kv_head = k.shape[2]
+    pad_hd = triton.next_power_of_2(head_dim)
+    pad_n_q_head = triton.next_power_of_2(n_q_head)
+    pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+    BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head)
+
+    n_row = batch_size * seq_len
+
+    # ensure tensors passed into the kernel are contiguous. It will be no-op if they are already contiguous
+    q = q.contiguous()
+    k = k.contiguous()
+    cos = cos.contiguous()
+    sin = sin.contiguous()
+
+    _triton_qwen2vl_mrope[(n_row,)](
+        q,
+        k,
+        cos,
+        sin,
+        seq_len,
+        n_q_head,
+        n_kv_head,
+        head_dim,
+        pad_n_q_head,
+        pad_n_kv_head,
+        pad_hd,
+        mrope_section[0],
+        mrope_section[1],
+        BLOCK_SIZE=BLOCK_SIZE,
+        BACKWARD_PASS=False,
+    )
+    return q.transpose(1, 2), k.transpose(1, 2), cos, sin
+
+
+def qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section):
+    dq = dq.transpose(1, 2)
+    dk = dk.transpose(1, 2)
+
+    batch_size, seq_len, n_q_head, head_dim = dq.shape
+    n_kv_head = dk.shape[2]
+    pad_hd = triton.next_power_of_2(head_dim)
+    pad_n_q_head = triton.next_power_of_2(n_q_head)
+    pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+    BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head)
+
+    n_row = batch_size * seq_len
+
+    # ensure dq and dk are contiguous
+    dq = dq.contiguous()
+    dk = dk.contiguous()
+
+    # backward is similar to forward except swapping few ops
+    _triton_qwen2vl_mrope[(n_row,)](
+        dq,
+        dk,
+        cos,
+        sin,
+        seq_len,
+        n_q_head,
+        n_kv_head,
+        head_dim,
+        pad_n_q_head,
+        pad_n_kv_head,
+        pad_hd,
+        mrope_section[0],
+        mrope_section[1],
+        BLOCK_SIZE=BLOCK_SIZE,
+        BACKWARD_PASS=True,
+    )
+    return dq.transpose(1, 2), dk.transpose(1, 2)
+
+
+class LigerQwen2VLMRopeFunction(torch.autograd.Function):
+    """
+    Triton implementation of the Qwen2VL Multimodal Rotary Positional Embedding (M-RoPE) operation.
+
+    Please find the corresponding HuggingFace implementation here:
+    https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+    """
+
+    @staticmethod
+    def forward(ctx, q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+        """
+        q size: (bsz, n_q_head, seq_len, head_dim)
+        k size: (bsz, n_kv_head, seq_len, head_dim)
+        cos size: (3, 1, seq_len, head_dim)
+        sin size: (3, 1, seq_len, head_dim)
+        """
+        q, k, cos, sin = qwen2vl_mrope_forward(q, k, cos, sin, mrope_section)
+        ctx.save_for_backward(cos, sin)
+        ctx.mrope_section = mrope_section
+        return q, k
+
+    def backward(ctx, dq, dk):
+        """
+        dq size: (bsz, n_q_head, seq_len, head_dim)
+        dk size: (bsz, n_kv_head, seq_len, head_dim)
+        cos size: (3, 1, seq_len, head_dim)
+        sin size: (3, 1, seq_len, head_dim)
+        """
+
+        cos, sin = ctx.saved_tensors
+        mrope_section = ctx.mrope_section
+        dq, dk = qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section)
+        return dq, dk, None, None, None, None
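Note: a hedged usage sketch, not part of the diff, for the new M-RoPE autograd function, using the tensor shapes documented in the forward() docstring above; the mrope_section split is illustrative (the three sections must sum to head_dim // 2) and a CUDA device with Triton is assumed.

import torch
from liger_kernel.ops.qwen2vl_mrope import LigerQwen2VLMRopeFunction

bsz, n_q_head, n_kv_head, seq_len, head_dim = 1, 8, 2, 16, 128
q = torch.randn(bsz, n_q_head, seq_len, head_dim, device="cuda")
k = torch.randn(bsz, n_kv_head, seq_len, head_dim, device="cuda")
cos = torch.randn(3, 1, seq_len, head_dim, device="cuda")  # 3 = temporal/height/width sections
sin = torch.randn(3, 1, seq_len, head_dim, device="cuda")
mrope_section = [16, 24, 24]  # t/h/w split of head_dim // 2, as in the Qwen2-VL config

q_rot, k_rot = LigerQwen2VLMRopeFunction.apply(q, k, cos, sin, mrope_section)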
liger_kernel/ops/rms_norm.py CHANGED
@@ -264,7 +264,12 @@ def rms_norm_backward(
     dY = dY.view(-1, dim)
     n_rows, n_cols = dY.shape

-    sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
+    sm_count = 1
+    if X.device.type == "cuda":
+        sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
+    elif X.device.type == "xpu":
+        sm_count = torch.xpu.get_device_properties(X.device).gpu_subslice_count
+
     # fp32 for numerical stability especially.
     _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)

liger_kernel/ops/utils.py CHANGED
@@ -20,6 +20,8 @@ import triton
 import triton.language as tl
 from packaging.version import Version

+from liger_kernel.utils import infer_device
+

 def is_hip() -> bool:
     return torch.version.hip is not None
@@ -69,10 +71,11 @@ def compare_version(package: str, operator: Callable, target: str):


 def get_amp_custom_fwd_bwd() -> Callable:
+    device = infer_device()
     if compare_version("torch", operator.ge, "2.4.0"):
         return (
-            functools.partial(torch.amp.custom_fwd, device_type="cuda"),
-            functools.partial(torch.amp.custom_bwd, device_type="cuda"),
+            functools.partial(torch.amp.custom_fwd, device_type=device),
+            functools.partial(torch.amp.custom_bwd, device_type=device),
         )
     return torch.cuda.amp.custom_fwd, torch.cuda.amp.custom_bwd

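Note: infer_device comes from the new liger_kernel/utils.py, whose body is not shown in this diff; stated purely as an assumption, a plausible implementation would be:

import torch

def infer_device() -> str:
    # Name of the accelerator backend handed to torch.amp's custom_fwd/custom_bwd.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"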
liger_kernel/transformers/functional.py CHANGED
@@ -10,21 +10,11 @@ from liger_kernel.ops.group_norm import LigerGroupNormFunction
 from liger_kernel.ops.jsd import LigerJSDFunction
 from liger_kernel.ops.kl_div import LigerKLDivLossFunction
 from liger_kernel.ops.layer_norm import LigerLayerNormFunction
+from liger_kernel.ops.qwen2vl_mrope import LigerQwen2VLMRopeFunction
 from liger_kernel.ops.rms_norm import LigerRMSNormFunction
 from liger_kernel.ops.rope import LigerRopeFunction
 from liger_kernel.ops.swiglu import LigerSiLUMulFunction

-liger_swiglu = LigerSiLUMulFunction.apply
-liger_fused_linear_cross_entropy = LigerFusedLinearCrossEntropyFunction.apply
-liger_geglu = LigerGELUMulFunction.apply
-liger_rms_norm = LigerRMSNormFunction.apply
-liger_rope = LigerRopeFunction.apply
-liger_layer_norm = LigerLayerNormFunction.apply
-liger_kl_div = LigerKLDivLossFunction.apply
-liger_jsd = LigerJSDFunction.apply
-liger_fused_linear_jsd = LigerFusedLinearJSDFunction.apply
-liger_group_norm = LigerGroupNormFunction.apply
-

 # conform to the function signature in https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
 # `weight` and `size_average` are placeholders and not implemented yet
@@ -54,3 +44,130 @@ def liger_cross_entropy(
     if not return_z_loss:
         return loss
     return loss, z_loss
+
+
+def liger_fused_linear_cross_entropy(
+    input,
+    weight,
+    target,
+    bias=None,
+    ignore_index: int = -100,
+    lse_square_scale: float = 0.0,
+    label_smoothing: float = 0.0,
+    reduction: str = "mean",
+    softcap: Optional[float] = None,
+):
+    return LigerFusedLinearCrossEntropyFunction.apply(
+        input,
+        weight,
+        target,
+        bias,
+        ignore_index,
+        lse_square_scale,
+        label_smoothing,
+        reduction,
+        softcap,
+    )
+
+
+def liger_fused_linear_jsd(
+    student_input,
+    student_weight,
+    teacher_input,
+    teacher_weight,
+    shift_labels=None,
+    jsd_beta: float = 0.5,
+    ignore_index: int = -100,
+    temperature: float = 1.0,
+):
+    return LigerFusedLinearJSDFunction.apply(
+        student_input,
+        student_weight,
+        teacher_input,
+        teacher_weight,
+        shift_labels,
+        jsd_beta,
+        ignore_index,
+        temperature,
+    )
+
+
+def liger_geglu(a, b):
+    return LigerGELUMulFunction.apply(a, b)
+
+
+def liger_group_norm(
+    X,
+    affine_scaling_weight,
+    affine_shifting_bias,
+    num_channels,
+    num_groups,
+    eps,
+):
+    return LigerGroupNormFunction.apply(
+        X,
+        affine_scaling_weight,
+        affine_shifting_bias,
+        num_channels,
+        num_groups,
+        eps,
+    )
+
+
+def liger_jsd(
+    input,
+    target,
+    shift_labels=None,
+    beta: float = 0.5,
+    ignore_index: int = -100,
+):
+    return LigerJSDFunction.apply(
+        input,
+        target,
+        shift_labels,
+        beta,
+        ignore_index,
+    )
+
+
+# conform to the function signature in https://pytorch.org/docs/stable/generated/torch.nn.functional.kl_div.html#torch.nn.functional.kl_div
+# `size_average` and `mean` are being deprecated in torch API and are placeholders here
+def liger_kl_div(
+    input,
+    target,
+    size_average: bool = True,
+    reduce: bool = True,
+    reduction: str = "mean",
+    log_target: bool = False,
+    eps: float = 1e-10,
+):
+    # Note: the default reduction in torch is `mean`, but being `batchmean` in Liger
+    return LigerKLDivLossFunction.apply(
+        input,
+        target,
+        reduction,
+        log_target,
+        eps,
+    )
+
+
+def liger_layer_norm(X, W, B, eps):
+    return LigerLayerNormFunction.apply(X, W, B, eps)
+
+
+def liger_qwen2vl_mrope(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    return LigerQwen2VLMRopeFunction.apply(q, k, cos, sin, mrope_section, unsqueeze_dim)
+
+
+def liger_rms_norm(
+    X, W, eps, offset: float = 0.0, casting_mode: str = "llama", in_place: bool = True
+):
+    return LigerRMSNormFunction.apply(X, W, eps, offset, casting_mode, in_place)
+
+
+def liger_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    return LigerRopeFunction.apply(q, k, cos, sin, position_ids, unsqueeze_dim)
+
+
+def liger_swiglu(a, b):
+    return LigerSiLUMulFunction.apply(a, b)
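Note: a usage sketch, not part of the diff. Because the raw autograd .apply aliases removed above are now plain functions, the fused linear cross entropy can be called with keyword arguments; the sizes below are illustrative.

import torch
from liger_kernel.transformers.functional import liger_fused_linear_cross_entropy

BT, H, V = 4, 256, 1024
x = torch.randn(BT, H, device="cuda", requires_grad=True)  # last hidden states
w = torch.randn(V, H, device="cuda", requires_grad=True)   # lm_head weight
y = torch.randint(0, V, (BT,), device="cuda")              # target token ids

loss = liger_fused_linear_cross_entropy(x, w, y, reduction="mean")
loss.backward()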
liger_kernel/transformers/fused_linear_jsd.py CHANGED
@@ -12,7 +12,7 @@ class LigerFusedLinearJSD(torch.nn.Module):
     the materialization of the large logits tensor.

     Args:
-        jsd_beta (float): coefficient beta of generalized JSD in the open interval (0, 1). Default: `0.5`
+        jsd_beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
         ignore_index (int): The index to ignore in the target. Default: `-100`
         temperature (float): temperature in softmax function to control the output probability distribution. Default: `1.0`

@@ -70,9 +70,6 @@ class LigerFusedLinearJSD(torch.nn.Module):

     def __init__(self, jsd_beta=0.5, ignore_index=-100, temperature=1.0):
         super().__init__()
-        assert (
-            jsd_beta > 0 and jsd_beta < 1
-        ), f"beta must be greater than 0 and less than 1. Got: {jsd_beta}"
         assert temperature != 0, "temperature cannot be 0."
         self.jsd_beta = jsd_beta
         self.temperature = temperature
liger_kernel/transformers/jsd.py CHANGED
@@ -18,7 +18,7 @@ class LigerJSD(torch.nn.Module):
     :math:`P` denotes the teacher model and :math:`Q` denotes the student model.

     Args:
-        beta (float): coefficient beta of generalized JSD in the open interval (0, 1). Default: `0.5`
+        beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
         ignore_index (int): The index to ignore in the target. Default: `-100`

     Shape:
@@ -58,9 +58,6 @@ class LigerJSD(torch.nn.Module):

     def __init__(self, beta: float = 0.5, ignore_index: int = -100):
         super().__init__()
-        assert (
-            beta > 0 and beta < 1
-        ), f"beta must be greater than 0 and less than 1. Got: {beta}"
         self.beta = beta
         self.ignore_index = ignore_index

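Note: a hedged usage sketch, not part of the diff. With the assertion removed, beta=0.0 (pure forward KL) and beta=1.0 (pure reverse KL) are now accepted; inputs are log-probabilities of shape (BT, V) as documented above, and the sizes are illustrative.

import torch
from liger_kernel.transformers import LigerJSD

jsd = LigerJSD(beta=0.0)  # forward KL(P || Q) between teacher P and student Q
student_logits = torch.randn(16, 1024, device="cuda")
teacher_logits = torch.randn(16, 1024, device="cuda")
loss = jsd(student_logits.log_softmax(-1), teacher_logits.log_softmax(-1))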
liger_kernel/transformers/monkey_patch.py CHANGED
@@ -36,6 +36,7 @@ from liger_kernel.transformers.model.qwen2 import lce_forward as qwen2_lce_forward
 from liger_kernel.transformers.model.qwen2 import (
     lce_forward_deprecated as qwen2_lce_forward_deprecated,
 )
+from liger_kernel.transformers.qwen2vl_mrope import liger_multimodal_rotary_pos_emb
 from liger_kernel.transformers.rms_norm import LigerRMSNorm
 from liger_kernel.transformers.rope import liger_rotary_pos_emb
 from liger_kernel.transformers.swiglu import (
@@ -610,9 +611,7 @@ def apply_liger_kernel_to_qwen2(
             logger.warning(TRANSFORMER_DEPRECATION_WARNING)
             modeling_qwen2.CrossEntropyLoss = LigerCrossEntropyLoss

-    # import pdb; pdb.set_trace()
     if fused_linear_cross_entropy:
-
         if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
             modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward
         else:  # if version < 4.46.1
@@ -644,6 +643,7 @@


 def apply_liger_kernel_to_qwen2_vl(
+    rope: bool = True,
     cross_entropy: bool = False,
     fused_linear_cross_entropy: bool = True,
     rms_norm: bool = True,
@@ -678,8 +678,10 @@ def apply_liger_kernel_to_qwen2_vl(
         lce_forward as qwen2_vl_lce_forward,
     )

-    # TODO: Support Qwen2-VL's multimodal RoPE implementation
-
+    if rope:
+        modeling_qwen2_vl.apply_multimodal_rotary_pos_emb = (
+            liger_multimodal_rotary_pos_emb
+        )
     if rms_norm:
         # https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L439
         modeling_qwen2_vl.Qwen2RMSNorm = LigerRMSNorm
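Note: a hedged usage sketch, not part of the diff, showing the new rope flag; it assumes apply_liger_kernel_to_qwen2_vl is re-exported from liger_kernel.transformers like the other patch helpers.

from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl

# Patch Qwen2-VL before instantiating the model so the Triton M-RoPE,
# RMSNorm, and fused linear cross entropy paths are picked up.
apply_liger_kernel_to_qwen2_vl(rope=True, rms_norm=True, fused_linear_cross_entropy=True)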