PyPI - liger-kernel-nightly - Versions diffs - 0.5.2.dev20250108102127__tar.gz → 0.5.2.dev20250109023714__tar.gz - Mend

liger-kernel-nightly 0.5.2.dev20250108102127.tar.gz → 0.5.2.dev20250109023714.tar.gz

Files changed (197) hide show

{liger_kernel_nightly-0.5.2.dev20250108102127 → liger_kernel_nightly-0.5.2.dev20250109023714}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.2.dev20250108102127
+Version: 0.5.2.dev20250109023714
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.2.dev20250108102127 → liger_kernel_nightly-0.5.2.dev20250109023714}/benchmark/scripts/benchmark_orpo_loss.py RENAMED Viewed

@@ -45,12 +45,13 @@ def bench_memory_fused_linear_orpo_loss(
     _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (B, T), dtype=torch.long, device=device)
+    nll_target = torch.randint(V, (B, T), dtype=torch.long, device=device)
     def fwd():
         if provider == "liger":
-            return liger_lm_head_orpo(_input, target)
+            return liger_lm_head_orpo(_input, target, nll_target)
         elif provider == "huggingface":
-            return torch_lm_head_orpo(_input, target)
+            return torch_lm_head_orpo(_input, target, nll_target)
     def full():
         y = fwd()
@@ -91,12 +92,13 @@ def bench_speed_fused_linear_orpo_loss(
     _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (B, T), dtype=torch.long, device=device)
+    nll_target = torch.randint(V, (B, T), dtype=torch.long, device=device)
     def fwd():
         if provider == "liger":
-            return liger_lm_head_orpo(_input, target)
+            return liger_lm_head_orpo(_input, target, nll_target)
         elif provider == "huggingface":
-            return torch_lm_head_orpo(_input, target)
+            return torch_lm_head_orpo(_input, target, nll_target)
     if mode == "forward":
         ms_50, ms_20, ms_80 = triton.testing.do_bench(

{liger_kernel_nightly-0.5.2.dev20250108102127 → liger_kernel_nightly-0.5.2.dev20250109023714}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "liger_kernel_nightly"
-version = "0.5.2.dev20250108102127"
+version = "0.5.2.dev20250109023714"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

{liger_kernel_nightly-0.5.2.dev20250108102127 → liger_kernel_nightly-0.5.2.dev20250109023714}/src/liger_kernel/chunked_loss/fused_linear_preference.py RENAMED Viewed

@@ -27,6 +27,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         alpha=1.0,
         beta=0.1,
         compute_nll_loss=True,
+        nll_target=None,
         compiled=True,
         use_ref_model=False,
         ref_input=None,
@@ -58,6 +59,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             alpha (float): Weight for the NLL loss.
             beta (float): Weight for the preference loss.
             compute_nll_loss (bool): Whether to compute NLL loss.
+            nll_target (torch.Tensor, optional): Target tensor for NLL loss. Shape: (batch_size, seq_len). If not provided the target is used.
             compiled (bool): Whether to use torch compile for chunk accumulation.
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
@@ -96,11 +98,12 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             use_ref_model=use_ref_model,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
+            full_nll_target=nll_target,
             average_log_prob=average_log_prob,
             **loss_kwargs,
         )
-        def fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk):
+        def fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk):
             """
             Fused forward and backward pass for a chunk of input and target.
             """
@@ -111,13 +114,18 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                     target_chunk,
                     bias,
                     ref_input_chunk=ref_input_chunk,
+                    chosen_nll_target_chunk=chosen_nll_target_chunk,
                 )
             else:
                 return torch.func.grad_and_value(compute_loss, argnums=(0, 1), has_aux=True)(
-                    input_chunk, weight, target_chunk, ref_input_chunk=ref_input_chunk
+                    input_chunk,
+                    weight,
+                    target_chunk,
+                    ref_input_chunk=ref_input_chunk,
+                    chosen_nll_target_chunk=chosen_nll_target_chunk,
                 )
-        def accumulate_chunk(input_chunk, target_chunk, ref_input_chunk=None):
+        def accumulate_chunk(input_chunk, target_chunk, ref_input_chunk=None, chosen_nll_target_chunk=None):
             if bias is not None:
                 (
                     (chunk_grad_input, chunk_grad_weight, chunk_grad_bias),
@@ -132,7 +140,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                             *aux_outputs,
                         ),
                     ),
-                ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk)
+                ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk)
                 grad_bias.add_(chunk_grad_bias)  # accumulate bias gradient
             else:
                 (
@@ -148,7 +156,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                             *aux_outputs,
                         ),
                     ),
-                ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk)
+                ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk)
             # Accumulate gradients
             grad_weight.add_(chunk_grad_weight)
@@ -191,6 +199,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         _rejected_input_chunks = torch.chunk(_input[len_chosen:], chunks=chunks, dim=0)
         _rejected_target_chunks = torch.chunk(target[len_chosen:], chunks=chunks, dim=0)
+        if nll_target is not None:
+            _chosen_nll_target_chunks = torch.chunk(nll_target[:len_chosen], chunks=chunks, dim=0)
         if use_ref_model:
             _ref_chosen_input_chunks = torch.chunk(ref_input[:len_chosen], chunks=chunks, dim=0)
             _ref_rejected_input_chunks = torch.chunk(ref_input[len_chosen:], chunks=chunks, dim=0)
@@ -202,6 +213,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             rejected_target_chunk,
             ref_chosen_input_chunk,
             ref_rejected_input_chunk,
+            chosen_nll_target_chunk,
         ) in zip(
             _chosen_input_chunks,
             _rejected_input_chunks,
@@ -209,6 +221,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             _rejected_target_chunks,
             (_ref_chosen_input_chunks if use_ref_model else [None] * len(_chosen_input_chunks)),
             (_ref_rejected_input_chunks if use_ref_model else [None] * len(_rejected_input_chunks)),
+            (_chosen_nll_target_chunks if nll_target is not None else [None] * len(_chosen_input_chunks)),
             strict=False,
         ):
             input_chunk = torch.cat([chosen_input_chunk, rejected_input_chunk], dim=0)
@@ -222,9 +235,10 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             torch._dynamo.mark_dynamic(target_chunk, 1)
             torch._dynamo.mark_dynamic(target, 1)
             torch._dynamo.mark_dynamic(ref_input_chunk, 1) if use_ref_model else None
+            torch._dynamo.mark_dynamic(chosen_nll_target_chunk, 1) if nll_target is not None else None
             # accumulate loss, gradients, and metrics
-            accumulate_chunk(input_chunk, target_chunk, ref_input_chunk)
+            accumulate_chunk(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk)
         # combine grad_chosen_inputs and grad_rejected_inputs
         grad_inputs = grad_chosen_inputs + grad_rejected_inputs
@@ -258,7 +272,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             grad_weight = grad_weight * grad_output[0][0]
             grad_bias = grad_bias * grad_output[0][0] if grad_bias is not None else None
-        return grad_input, grad_weight, None, grad_bias, None, None, None
+        return grad_input, grad_weight, None, grad_bias, None, None, None, None
     @staticmethod
     def chunk_forward(
@@ -268,6 +282,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         bias=None,
         ignore_index=-100,
         compute_nll_loss=True,
+        chosen_nll_target_chunk=None,
         average_log_prob=True,
     ):
         len_chosen_chunk = target_chunk.shape[0] // 2
@@ -278,9 +293,12 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         chosen_nll_loss = 0.0
         if compute_nll_loss:
+            nll_labels = (
+                chosen_nll_target_chunk if chosen_nll_target_chunk is not None else target_chunk[:len_chosen_chunk]
+            )
             chosen_nll_loss = F.nll_loss(
                 log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
-                target_chunk[:len_chosen_chunk].view(-1),
+                nll_labels.view(-1),
                 reduction="sum",
                 ignore_index=ignore_index,
             )
@@ -324,6 +342,8 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         ref_input_chunk=None,
         ref_weight=None,
         ref_bias=None,
+        full_nll_target=None,
+        chosen_nll_target_chunk=None,
         average_log_prob=True,
         **loss_kwargs,
     ):
@@ -343,6 +363,8 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
             ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            full_nll_target (torch.Tensor, optional): Full target tensor for NLL loss. Shape: (batch_size, sequence_length).
+            chosen_nll_target_chunk (torch.Tensor, optional): Target tensor for NLL loss. Shape: (chunk_size, sequence_length) If not provided the target_chunk is used.
             average_log_prob (bool): Whether to average log probabilities or the sum.
             loss_kwargs (dict): Additional arguments for the loss function.
         """
@@ -359,9 +381,14 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             bias=bias,
             ignore_index=ignore_index,
             compute_nll_loss=compute_nll_loss,
+            chosen_nll_target_chunk=chosen_nll_target_chunk,
             average_log_prob=average_log_prob,
         )
-        chosen_nll_loss = chosen_nll_loss / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
+        if full_nll_target is not None:
+            chosen_nll_loss = chosen_nll_loss / (full_nll_target[: full_nll_target.shape[0] // 2] != ignore_index).sum()
+        else:
+            chosen_nll_loss = chosen_nll_loss / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
         chosen_logits_mean = chosen_logits.sum() / (full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0])
         rejected_logits_mean = rejected_logits.sum() / (
             full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
@@ -372,9 +399,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                 (
                     ref_chosen_logps,
                     ref_rejected_logps,
-                    ref_chosen_logits,
-                    ref_rejected_logits,
-                    ref_chosen_nll_loss,
+                    _,
+                    _,
+                    _,
                 ) = LigerFusedLinearPreferenceBase.chunk_forward(
                     ref_input_chunk,
                     ref_weight,
@@ -382,6 +409,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                     ref_bias,
                     ignore_index=ignore_index,
                     compute_nll_loss=False,  # We don't need NLL loss for the reference model
+                    chosen_nll_target_chunk=None,
                     average_log_prob=average_log_prob,
                 )
             loss_kwargs["ref_chosen_logps"] = ref_chosen_logps

{liger_kernel_nightly-0.5.2.dev20250108102127 → liger_kernel_nightly-0.5.2.dev20250109023714}/src/liger_kernel/chunked_loss/orpo_loss.py RENAMED Viewed

@@ -52,6 +52,7 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
         ignore_index=-100,
         beta=0.1,
         compute_nll_loss=True,
+        nll_target=None,
         compiled=True,
     ):
         return LigerFusedLinearPreferenceBase.forward(
@@ -64,13 +65,14 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
             ignore_index=ignore_index,
             beta=beta,
             compute_nll_loss=compute_nll_loss,
+            nll_target=nll_target,
             compiled=compiled,
         )
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None
+        return *grads, None, None, None, None, None
 class LigerFusedLinearORPOLoss(torch.nn.Module):
@@ -96,7 +98,7 @@ class LigerFusedLinearORPOLoss(torch.nn.Module):
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
-    def forward(self, lin_weight, _input, target, bias=None):
+    def forward(self, lin_weight, _input, target, bias=None, nll_target=None):
         return LigerFusedLinearORPOFunction.apply(
             _input,
             lin_weight,
@@ -105,5 +107,6 @@ class LigerFusedLinearORPOLoss(torch.nn.Module):
             self.ignore_index,
             self.beta,
             self.compute_nll_loss,
+            nll_target,
             self.compiled,
         )

{liger_kernel_nightly-0.5.2.dev20250108102127 → liger_kernel_nightly-0.5.2.dev20250109023714}/src/liger_kernel/transformers/trainer/orpo_trainer.py RENAMED Viewed

@@ -93,6 +93,13 @@ class LigerORPOTrainer(ORPOTrainer):
         if self.aux_loss_enabled:
             model_kwargs["output_router_logits"] = True
+        if self.is_encoder_decoder:
+            labels = concatenated_batch["concatenated_labels"].clone()
+        else:
+            labels = concatenated_batch["concatenated_input_ids"].clone()
+            attention_mask = concatenated_batch["concatenated_attention_mask"]
+            labels = torch.where(attention_mask == 1, labels, self.label_pad_token_id)
         if isinstance(model, FullyShardedDataParallel):
             outputs = _FSDPForwardRedirection()(
                 model,
@@ -114,15 +121,20 @@ class LigerORPOTrainer(ORPOTrainer):
         orpo_loss_fn = LigerFusedLinearORPOLoss(ignore_index=self.label_pad_token_id, beta=self.beta)
-        def orpo_partial(lm_head, last_hidden_state, concatenated_labels):
-            return orpo_loss_fn(lm_head.weight, last_hidden_state, concatenated_labels, lm_head.bias)
+        def orpo_partial(lm_head, last_hidden_state, concatenated_labels, nll_target):
+            return orpo_loss_fn(
+                lm_head.weight, last_hidden_state, concatenated_labels, lm_head.bias, nll_target=nll_target
+            )
         orpo_loss, aux_outputs = _FSDPForwardRedirection()(
             model,
             orpo_partial,
             model.lm_head,
-            outputs.last_hidden_state,
-            concatenated_batch["concatenated_labels"],
+            outputs.last_hidden_state[:, :-1] if not self.is_encoder_decoder else outputs.last_hidden_state,
+            concatenated_batch["concatenated_labels"][:, 1:]
+            if not self.is_encoder_decoder
+            else concatenated_batch["concatenated_labels"],
+            labels[:, 1:] if not self.is_encoder_decoder else labels,
         )
         # if aux_loss_enabled, add the aux_loss to the orpo_loss
         if self.aux_loss_enabled:

{liger_kernel_nightly-0.5.2.dev20250108102127 → liger_kernel_nightly-0.5.2.dev20250109023714}/src/liger_kernel_nightly.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.2.dev20250108102127
+Version: 0.5.2.dev20250109023714
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.2.dev20250108102127 → liger_kernel_nightly-0.5.2.dev20250109023714}/test/chunked_loss/test_orpo_loss.py RENAMED Viewed

@@ -86,8 +86,8 @@ class TorchLMHeadORPO(torch.nn.Module):
         self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype)
         self.orpo_loss = HFORPOLoss(ignore_index=ignore_index, beta=beta).get_batch_loss_metrics
-    def forward(self, x, y):
-        return self.orpo_loss(self.lin.weight, x, y, self.lin.bias)
+    def forward(self, x, y, nll_target=None):
+        return self.orpo_loss(self.lin.weight, x, y, self.lin.bias, nll_target=nll_target)
 class LigerLMHeadORPO(torch.nn.Module):
@@ -104,8 +104,8 @@ class LigerLMHeadORPO(torch.nn.Module):
         self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype)
         self.orpo_loss = LigerFusedLinearORPOLoss(ignore_index=ignore_index, beta=beta)
-    def forward(self, x, y):
-        return self.orpo_loss(self.lin.weight, x, y, self.lin.bias)
+    def forward(self, x, y, nll_target=None):
+        return self.orpo_loss(self.lin.weight, x, y, self.lin.bias, nll_target=nll_target)
 @pytest.mark.parametrize(
@@ -164,13 +164,15 @@ def test_correctness(B, T, H, V, scalar, dtype, atol, rtol, bias, ignore_index,
         device=device,
         dtype=torch.long,
     )
+    nll_target = torch.randint(0, V, (B, T), device=device, dtype=torch.long)
     # Assign some random number of elements as ignore_index
     num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item()
     indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign]
     target.view(-1)[indices_to_assign] = ignore_index
-    loss1, aggregated_aux_outputs1 = torch_lm_head_orpo(input1, target)
-    loss2, aggregated_aux_outputs2 = liger_lm_head_orpo(input2, target)
+    loss1, aggregated_aux_outputs1 = torch_lm_head_orpo(input1, target, nll_target)
+    loss2, aggregated_aux_outputs2 = liger_lm_head_orpo(input2, target, nll_target)
     assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol)
@@ -244,8 +246,8 @@ def test_correctness_functional(B, T, H, V, scalar, dtype, atol, rtol, bias):
     bias1 = _bias.detach().clone().requires_grad_(True) if bias else None
     bias2 = _bias.detach().clone().requires_grad_(True) if bias else None
-    loss1, aggregated_aux_outputs1 = LigerFusedLinearORPOFunction.apply(input1, weight1, target, bias1)
-    loss2, aggregated_aux_outputs2 = liger_fused_linear_orpo(input2, weight2, target, bias2)
+    loss1, _ = LigerFusedLinearORPOFunction.apply(input1, weight1, target, bias1)
+    loss2, _ = liger_fused_linear_orpo(input2, weight2, target, bias2)
     assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol)

{liger_kernel_nightly-0.5.2.dev20250108102127 → liger_kernel_nightly-0.5.2.dev20250109023714}/test/utils.py RENAMED Viewed

@@ -406,8 +406,9 @@ class HFAlignmentLoss:
         _input: torch.FloatTensor,
         weight: torch.FloatTensor,
         target: torch.LongTensor,
-        bias: torch.FloatTensor = None,
+        bias: torch.FloatTensor | None = None,
         average_log_prob: bool = True,
+        nll_target: torch.LongTensor | None = None,
     ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
         """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
@@ -430,7 +431,7 @@ class HFAlignmentLoss:
             loss = loss_fct(logits, labels)
             return loss
-        labels = target
+        labels = nll_target if nll_target is not None else target
         chosen_nll_loss = torch.tensor(0.0, device=all_logits.device)
         if self.compute_nll_loss:
             chosen_nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen])
@@ -465,10 +466,11 @@ class HFAlignmentLoss:
         ref_weight: torch.FloatTensor = None,
         ref_bias: torch.FloatTensor = None,
         average_log_prob: bool = True,
+        nll_target: torch.LongTensor = None,
     ):
         """Compute the loss metrics for the given batch of inputs for train or test."""
-        forward_output = self.concatenated_forward(_input, weight, target, bias, average_log_prob)
+        forward_output = self.concatenated_forward(_input, weight, target, bias, average_log_prob, nll_target)
         (
             policy_chosen_logps,
             policy_rejected_logps,