npm - cciwon-code-review-cli - Versions diffs - 2.0.1 → 2.0.3 - Mend

cciwon-code-review-cli 2.0.1 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (111) hide show

package/unsloth_compiled_cache/UnslothGKDTrainer.py ADDED Viewed

@@ -0,0 +1,1157 @@
+"""
+2025.12.6
+2025.12.7
+4.57.1
+0.24.0
+__UNSLOTH_VERSIONING__
+"""
+# Unsloth auto generated code
+# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+from torch import Tensor
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
+from trl.trainer.gkd_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, Callable, DataCollator, DataCollatorForChatML, Dataset, EvalPrediction, F, FeatureExtractionMixin, GKDConfig, GKDTrainer, GenerationConfig, Optional, PeftConfig, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SFTTrainer, TrainerCallback, Union, disable_dropout_in_model, empty_cache, nn, os, prepare_deepspeed, random, textwrap, torch, unwrap_model_for_generation, warnings)
+import os
+from typing import *
+from dataclasses import dataclass, field
+from packaging.version import Version
+import torch
+import numpy as np
+from contextlib import nullcontext
+from torch.nn import functional as F
+import inspect
+from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling
+from transformers.training_args import ParallelMode
+# Wrap trainer with padding to right and enable training mode
+# Also patches W&B since multiple runs must use wandb.finish()
+import functools
+from types import MethodType
+def prepare_for_training_mode(f):
+    @functools.wraps(f)
+    def wrapper(self, *args, **kwargs):
+        # Enable training mode
+        if hasattr(self, 'model') and hasattr(self.model, "for_training"):
+            self.model.for_training()
+        output = f(self, *args, **kwargs)
+        # Return inference mode
+        if hasattr(self, 'model') and hasattr(self.model, "for_inference"):
+            self.model.for_inference()
+        # Patch W&B to enable logging on future runs, otherwise it'll overwrite the first run
+        try:
+            import wandb
+            wandb.finish()
+        except:
+            pass
+        return output
+    return wrapper
+pass
+torch_compile_options = {
+    "epilogue_fusion"   : True,
+    "max_autotune"      : False,
+    "shape_padding"     : True,
+    "trace.enabled"     : False,
+    "triton.cudagraphs" : False,
+}
+@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,)
+def chunked_selective_log_softmax(logits, index):
+    # Split into 4 chunks only
+    chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0)
+    chunked_index  = torch.chunk(index.reshape(-1), chunks = 4, dim = 0)
+    all_per_token_logps = []
+    # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index)
+    for chunk_logits, chunk_index in zip(chunked_logits, chunked_index):
+        chunk_logits = chunk_logits.to(torch.float32)
+        selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1)
+        logsumexp_values = torch.logsumexp(chunk_logits, dim = -1)
+        per_token_logps = selected_logits - logsumexp_values
+        all_per_token_logps.append(per_token_logps)
+    pass
+    all_per_token_logps = torch.concat(all_per_token_logps)
+    all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1]))
+    return all_per_token_logps
+def calculate_pad_tokens_in_prompt(
+    input_ids: torch.Tensor,
+    logits_to_keep: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens
+    """
+    if logits_to_keep >= input_ids.shape[1]:
+        raise ValueError("logits_to_keep must be smaller than the sequence length.")
+    prompt_section = input_ids[:, :-logits_to_keep]
+    padding_mask = (prompt_section == pad_token_id)
+    pad_token_counts = padding_mask.sum(dim=1)
+    return pad_token_counts
+def create_completion_attention_mask(
+    completion_input_ids: torch.Tensor,
+    left_pad_tokens_per_prompt: torch.Tensor,
+    max_left_pad: int,
+    pad_token_id: int
+) -> torch.Tensor:
+    """
+    Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad]
+    Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens
+    and pad are pad tokens, this function would make a completion mask that would 0 out the pad
+    and p tokens. so in this example [0,0,0,1,1,1,0,0,0]
+    """
+    batch_size, completion_len = completion_input_ids.shape
+    device = completion_input_ids.device
+    num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+    indices = torch.arange(completion_len, device=device).unsqueeze(0)
+    shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+    non_padding_mask = (completion_input_ids != pad_token_id)
+    final_mask = shift_mask & non_padding_mask
+    return final_mask
+def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+    """
+    Moves all padding tokens in each sequence of a batch to the right.
+    """
+    mask = (tensor != pad_id)
+    # Must do stable=True since binary mark is unordered
+    sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+    packed_tensor = torch.gather(tensor, 1, sorted_indices)
+    return packed_tensor
+def align_logprobs_with_mask(
+    logprob_tensor: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pad_value: float = 0.0
+) -> torch.Tensor:
+    """
+    Aligns a log probability tensor with a given attention mask.
+    """
+    device = logprob_tensor.device
+    batch_size, logprob_seq_len = logprob_tensor.shape
+    mask_seq_len = attention_mask.shape[1]
+    padded_logprobs = torch.full(
+        attention_mask.shape,
+        fill_value=pad_value,
+        dtype=logprob_tensor.dtype,
+        device=device
+    )
+    left_pad_counts = torch.argmax(attention_mask, dim=1)
+    cols = torch.arange(logprob_seq_len, device=device)
+    dest_indices = left_pad_counts.unsqueeze(1) + cols
+    # Create destination row indices
+    # Shape: [batch_size, logprob_seq_len]
+    row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+    # --- 4. Filter out-of-bounds indices and perform assignment ---
+    # Create a mask to identify only the indices that are within the bounds
+    # of the target tensor's sequence length.
+    valid_mask = dest_indices < mask_seq_len
+    # Use this mask to select only the valid row indices, column indices,
+    # and the corresponding values from the logprob tensor.
+    # This flattens the selected elements into 1D tensors.
+    valid_rows = row_indices[valid_mask]
+    valid_cols = dest_indices[valid_mask]
+    valid_vals = logprob_tensor[valid_mask]
+    # Place the valid values into their correct positions in the padded tensor
+    # using a single, efficient advanced indexing operation.
+    padded_logprobs[valid_rows, valid_cols] = valid_vals
+    return padded_logprobs
+@dataclass
+class UnslothGKDConfig(GKDConfig):
+    """
+    Configuration class for [`GKDTrainer`].
+    This class includes only the parameters that are specific to GKD training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] and [`SFTConfig`] documentation.
+    Args:
+        temperature (`float`, *optional*, defaults to `0.9`):
+            Temperature for sampling. The higher the temperature, the more random the completions.
+        lmbda (`float`, *optional*, defaults to `0.5`):
+            Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy
+            student-generated outputs).
+        beta (`float`, *optional*, defaults to `0.5`):
+            Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence loss. When
+            beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence.
+        max_new_tokens (`int`, *optional*, defaults to `128`):
+            Maximum number of tokens to generate per completion.
+        teacher_model_name_or_path (`str`, *optional*):
+            Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being
+            trained.
+        teacher_model_init_kwargs (`dict[str, Any]]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
+            from a string.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model.
+        seq_kd (`bool`, *optional*, defaults to `False`):
+            Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised FT on
+            teacher-generated output).
+    """
+    vllm_sampling_params: Optional[Any] = field(
+        default = None,
+        metadata = {'help': 'vLLM SamplingParams'},
+    )
+    unsloth_num_chunks : Optional[int] = field(
+        default = -1,
+        metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'},
+    )
+    max_seq_length : Optional[int] = field(
+        default = None,
+        metadata = {'help': 'Maximum sequence length to truncate to.'},
+    )
+    def __init__(
+        self,
+        output_dir = None,
+        overwrite_output_dir = None,
+        do_train = False,
+        do_eval = False,
+        do_predict = False,
+        eval_strategy = 'no',
+        prediction_loss_only = False,
+        per_device_train_batch_size = 4,
+        per_device_eval_batch_size = 4,
+        per_gpu_train_batch_size = None,
+        per_gpu_eval_batch_size = None,
+        gradient_accumulation_steps = 2,
+        eval_accumulation_steps = 2,
+        eval_delay = 0,
+        torch_empty_cache_steps = 250,
+        learning_rate = 5e-05,
+        weight_decay = 0.01,
+        adam_beta1 = 0.9,
+        adam_beta2 = 0.999,
+        adam_epsilon = 1e-08,
+        max_grad_norm = 1.0,
+        num_train_epochs = 3.0,
+        max_steps = -1,
+        lr_scheduler_type = 'linear',
+        warmup_ratio = 0.1,
+        warmup_steps = 0,
+        log_level = 'passive',
+        log_level_replica = 'warning',
+        log_on_each_node = True,
+        logging_dir = None,
+        logging_strategy = 'steps',
+        logging_first_step = False,
+        logging_steps = 1,
+        logging_nan_inf_filter = False,
+        save_strategy = 'steps',
+        save_steps = 500,
+        save_total_limit = None,
+        save_safetensors = True,
+        save_on_each_node = False,
+        save_only_model = False,
+        restore_callback_states_from_checkpoint = False,
+        no_cuda = False,
+        use_cpu = False,
+        use_mps_device = False,
+        seed = 3407,
+        data_seed = 3407,
+        jit_mode_eval = False,
+        bf16 = False,
+        fp16 = False,
+        fp16_opt_level = 'O1',
+        half_precision_backend = 'auto',
+        bf16_full_eval = False,
+        fp16_full_eval = False,
+        tf32 = None,
+        local_rank = -1,
+        ddp_backend = None,
+        tpu_num_cores = None,
+        tpu_metrics_debug = False,
+        debug = '',
+        dataloader_drop_last = False,
+        eval_steps = None,
+        dataloader_num_workers = 0,
+        dataloader_prefetch_factor = None,
+        past_index = -1,
+        run_name = None,
+        disable_tqdm = None,
+        remove_unused_columns = True,
+        label_names = None,
+        load_best_model_at_end = False,
+        metric_for_best_model = None,
+        greater_is_better = None,
+        ignore_data_skip = False,
+        fsdp = None,
+        fsdp_min_num_params = 0,
+        fsdp_config = None,
+        fsdp_transformer_layer_cls_to_wrap = None,
+        accelerator_config = None,
+        parallelism_config = None,
+        deepspeed = None,
+        label_smoothing_factor = 0.0,
+        optim = 'adamw_8bit',
+        optim_args = None,
+        adafactor = False,
+        group_by_length = False,
+        length_column_name = 'length',
+        report_to = 'none',
+        project = 'huggingface',
+        trackio_space_id = 'trackio',
+        ddp_find_unused_parameters = None,
+        ddp_bucket_cap_mb = None,
+        ddp_broadcast_buffers = None,
+        dataloader_pin_memory = True,
+        dataloader_persistent_workers = False,
+        skip_memory_metrics = True,
+        use_legacy_prediction_loop = False,
+        push_to_hub = False,
+        resume_from_checkpoint = None,
+        hub_model_id = None,
+        hub_strategy = 'every_save',
+        hub_token = None,
+        hub_private_repo = None,
+        hub_always_push = False,
+        hub_revision = None,
+        gradient_checkpointing = True,
+        gradient_checkpointing_kwargs = None,
+        include_inputs_for_metrics = False,
+        eval_do_concat_batches = True,
+        fp16_backend = 'auto',
+        push_to_hub_model_id = None,
+        push_to_hub_organization = None,
+        push_to_hub_token = None,
+        mp_parameters = '',
+        auto_find_batch_size = False,
+        full_determinism = False,
+        torchdynamo = None,
+        ray_scope = 'last',
+        ddp_timeout = 1800,
+        torch_compile = False,
+        torch_compile_backend = None,
+        torch_compile_mode = None,
+        include_tokens_per_second = False,
+        include_num_input_tokens_seen = False,
+        neftune_noise_alpha = None,
+        optim_target_modules = None,
+        batch_eval_metrics = False,
+        eval_on_start = False,
+        use_liger_kernel = False,
+        liger_kernel_config = None,
+        eval_use_gather_object = False,
+        average_tokens_across_devices = True,
+        model_init_kwargs = None,
+        chat_template_path = None,
+        dataset_text_field = 'text',
+        dataset_kwargs = None,
+        dataset_num_proc = None,
+        eos_token = None,
+        pad_token = None,
+        max_length = 1024,
+        packing = False,
+        packing_strategy = 'bfd',
+        padding_free = False,
+        pad_to_multiple_of = None,
+        eval_packing = None,
+        completion_only_loss = None,
+        assistant_only_loss = False,
+        loss_type = 'nll',
+        activation_offloading = False,
+        temperature = 0.9,
+        lmbda = 0.5,
+        beta = 0.5,
+        max_new_tokens = 128,
+        teacher_model_name_or_path = None,
+        teacher_model_init_kwargs = None,
+        disable_dropout = True,
+        seq_kd = False,
+        vllm_sampling_params = None,
+        unsloth_num_chunks = -1,
+        max_seq_length = None,
+        **kwargs,
+    ):
+        if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!')
+        if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!')
+        if output_dir is None and save_strategy == 'steps' and save_steps == 500:
+            output_dir = 'unsloth_training_checkpoints'
+            save_strategy = 'no'
+        if dataset_num_proc is None:
+            from multiprocessing import cpu_count
+            dataset_num_proc = min(max(cpu_count()+4, 2), 64)
+        if os.environ.get('UNSLOTH_ENABLE_FLEX_ATTENTION', '0') == '1':
+            from unsloth_zoo.flex_attention import HAS_FLEX_ATTENTION
+            if HAS_FLEX_ATTENTION and pad_to_multiple_of is None:
+                from unsloth_zoo.flex_attention import FLEX_ATTENTION_BLOCK_SIZE
+                pad_to_multiple_of = FLEX_ATTENTION_BLOCK_SIZE
+        if temperature <= 0:
+            raise MathError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.')
+        elif temperature >= 10:
+            raise MathError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.')
+        super().__init__(
+            output_dir = output_dir,
+            overwrite_output_dir = overwrite_output_dir,
+            do_train = do_train,
+            do_eval = do_eval,
+            do_predict = do_predict,
+            eval_strategy = eval_strategy,
+            prediction_loss_only = prediction_loss_only,
+            per_device_train_batch_size = per_device_train_batch_size,
+            per_device_eval_batch_size = per_device_eval_batch_size,
+            per_gpu_train_batch_size = per_gpu_train_batch_size,
+            per_gpu_eval_batch_size = per_gpu_eval_batch_size,
+            gradient_accumulation_steps = gradient_accumulation_steps,
+            eval_accumulation_steps = eval_accumulation_steps,
+            eval_delay = eval_delay,
+            torch_empty_cache_steps = torch_empty_cache_steps,
+            learning_rate = learning_rate,
+            weight_decay = weight_decay,
+            adam_beta1 = adam_beta1,
+            adam_beta2 = adam_beta2,
+            adam_epsilon = adam_epsilon,
+            max_grad_norm = max_grad_norm,
+            num_train_epochs = num_train_epochs,
+            max_steps = max_steps,
+            lr_scheduler_type = lr_scheduler_type,
+            warmup_ratio = warmup_ratio,
+            warmup_steps = warmup_steps,
+            log_level = log_level,
+            log_level_replica = log_level_replica,
+            log_on_each_node = log_on_each_node,
+            logging_dir = logging_dir,
+            logging_strategy = logging_strategy,
+            logging_first_step = logging_first_step,
+            logging_steps = logging_steps,
+            logging_nan_inf_filter = logging_nan_inf_filter,
+            save_strategy = save_strategy,
+            save_steps = save_steps,
+            save_total_limit = save_total_limit,
+            save_safetensors = save_safetensors,
+            save_on_each_node = save_on_each_node,
+            save_only_model = save_only_model,
+            restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint,
+            no_cuda = no_cuda,
+            use_cpu = use_cpu,
+            use_mps_device = use_mps_device,
+            seed = seed,
+            data_seed = data_seed,
+            jit_mode_eval = jit_mode_eval,
+            bf16 = bf16,
+            fp16 = fp16,
+            fp16_opt_level = fp16_opt_level,
+            half_precision_backend = half_precision_backend,
+            bf16_full_eval = bf16_full_eval,
+            fp16_full_eval = fp16_full_eval,
+            tf32 = tf32,
+            local_rank = local_rank,
+            ddp_backend = ddp_backend,
+            tpu_num_cores = tpu_num_cores,
+            tpu_metrics_debug = tpu_metrics_debug,
+            debug = debug,
+            dataloader_drop_last = dataloader_drop_last,
+            eval_steps = eval_steps,
+            dataloader_num_workers = dataloader_num_workers,
+            dataloader_prefetch_factor = dataloader_prefetch_factor,
+            past_index = past_index,
+            run_name = run_name,
+            disable_tqdm = disable_tqdm,
+            remove_unused_columns = remove_unused_columns,
+            label_names = label_names,
+            load_best_model_at_end = load_best_model_at_end,
+            metric_for_best_model = metric_for_best_model,
+            greater_is_better = greater_is_better,
+            ignore_data_skip = ignore_data_skip,
+            fsdp = fsdp,
+            fsdp_min_num_params = fsdp_min_num_params,
+            fsdp_config = fsdp_config,
+            fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap,
+            accelerator_config = accelerator_config,
+            parallelism_config = parallelism_config,
+            deepspeed = deepspeed,
+            label_smoothing_factor = label_smoothing_factor,
+            optim = optim,
+            optim_args = optim_args,
+            adafactor = adafactor,
+            group_by_length = group_by_length,
+            length_column_name = length_column_name,
+            report_to = report_to,
+            project = project,
+            trackio_space_id = trackio_space_id,
+            ddp_find_unused_parameters = ddp_find_unused_parameters,
+            ddp_bucket_cap_mb = ddp_bucket_cap_mb,
+            ddp_broadcast_buffers = ddp_broadcast_buffers,
+            dataloader_pin_memory = dataloader_pin_memory,
+            dataloader_persistent_workers = dataloader_persistent_workers,
+            skip_memory_metrics = skip_memory_metrics,
+            use_legacy_prediction_loop = use_legacy_prediction_loop,
+            push_to_hub = push_to_hub,
+            resume_from_checkpoint = resume_from_checkpoint,
+            hub_model_id = hub_model_id,
+            hub_strategy = hub_strategy,
+            hub_token = hub_token,
+            hub_private_repo = hub_private_repo,
+            hub_always_push = hub_always_push,
+            hub_revision = hub_revision,
+            gradient_checkpointing = gradient_checkpointing,
+            gradient_checkpointing_kwargs = gradient_checkpointing_kwargs,
+            include_inputs_for_metrics = include_inputs_for_metrics,
+            eval_do_concat_batches = eval_do_concat_batches,
+            fp16_backend = fp16_backend,
+            push_to_hub_model_id = push_to_hub_model_id,
+            push_to_hub_organization = push_to_hub_organization,
+            push_to_hub_token = push_to_hub_token,
+            mp_parameters = mp_parameters,
+            auto_find_batch_size = auto_find_batch_size,
+            full_determinism = full_determinism,
+            torchdynamo = torchdynamo,
+            ray_scope = ray_scope,
+            ddp_timeout = ddp_timeout,
+            torch_compile = torch_compile,
+            torch_compile_backend = torch_compile_backend,
+            torch_compile_mode = torch_compile_mode,
+            include_tokens_per_second = include_tokens_per_second,
+            include_num_input_tokens_seen = include_num_input_tokens_seen,
+            neftune_noise_alpha = neftune_noise_alpha,
+            optim_target_modules = optim_target_modules,
+            batch_eval_metrics = batch_eval_metrics,
+            eval_on_start = eval_on_start,
+            use_liger_kernel = use_liger_kernel,
+            liger_kernel_config = liger_kernel_config,
+            eval_use_gather_object = eval_use_gather_object,
+            average_tokens_across_devices = average_tokens_across_devices,
+            model_init_kwargs = model_init_kwargs,
+            chat_template_path = chat_template_path,
+            dataset_text_field = dataset_text_field,
+            dataset_kwargs = dataset_kwargs,
+            dataset_num_proc = dataset_num_proc,
+            eos_token = eos_token,
+            pad_token = pad_token,
+            max_length = max_length,
+            packing = packing,
+            packing_strategy = packing_strategy,
+            padding_free = padding_free,
+            pad_to_multiple_of = pad_to_multiple_of,
+            eval_packing = eval_packing,
+            completion_only_loss = completion_only_loss,
+            assistant_only_loss = assistant_only_loss,
+            loss_type = loss_type,
+            activation_offloading = activation_offloading,
+            temperature = temperature,
+            lmbda = lmbda,
+            beta = beta,
+            max_new_tokens = max_new_tokens,
+            teacher_model_name_or_path = teacher_model_name_or_path,
+            teacher_model_init_kwargs = teacher_model_init_kwargs,
+            disable_dropout = disable_dropout,
+            seq_kd = seq_kd,**kwargs)
+        self.vllm_sampling_params = vllm_sampling_params
+        self.unsloth_num_chunks = unsloth_num_chunks
+        self.max_seq_length = max_seq_length
+pass
+class _UnslothGKDTrainer(SFTTrainer):
+    """"""
+    _tag_names = ["trl", "gkd"]
+    _name = "GKD"
+    _paper = {
+        "title": "On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes",
+        "id": "2306.13649",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @inproceedings{agarwal2024on-policy,
+                title        = {{On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes}},
+                author       = {Rishabh Agarwal and Nino Vieillard and Yongchao Zhou and Piotr Stanczyk and Sabela Ramos Garea and Matthieu Geist and Olivier Bachem},
+                year         = 2024,
+                booktitle    = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
+                publisher    = {OpenReview.net},
+                url          = {https://openreview.net/forum?id=3zKtaqxLhW},
+            }"""),
+    }
+    def __init__(
+        self,
+        model: Optional[Union[PreTrainedModel, nn.Module, str]] = None,
+        teacher_model: Union[PreTrainedModel, nn.Module, str] = None,
+        args: Optional[GKDConfig] = None,
+        data_collator: Optional[DataCollator] = None,  # type: ignore
+        train_dataset: Optional[Dataset] = None,
+        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
+        processing_class: Optional[
+            Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
+        ] = None,
+        compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+        preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
+        peft_config: Optional["PeftConfig"] = None,
+        formatting_func: Optional[Callable] = None,
+    ):
+        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
+            warnings.warn(
+                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
+                "it and want it to remain, please share your comments here: "
+                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
+                "TRL_EXPERIMENTAL_SILENCE=1."
+            )
+        # Ensure Trainer does not drop non-signature columns used by the collator [e.g., "prompts"]
+        args.remove_unused_columns = False
+        # Respect a user-provided data_collator; otherwise, provide a ChatML collator that
+        if data_collator is None:
+            data_collator = DataCollatorForChatML(tokenizer=processing_class, max_length=args.max_length)
+        # Ensure SFTTrainer does not pre-process the dataset when using a ChatML collator,
+        # so that raw conversational fields [e.g., "messages"] remain available to the collator.
+        if args.dataset_kwargs is None:
+            args.dataset_kwargs = {"skip_prepare_dataset": True}
+        else:
+            args.dataset_kwargs["skip_prepare_dataset"] = True
+        # Liger fused GKD loss [JSD]
+        self.use_liger_gkd_loss = False
+        if args.use_liger_kernel:
+            self.liger_jsd_loss = LigerFusedLinearJSDLoss(
+                beta=args.beta,
+                ignore_index=-100,
+                temperature=args.temperature,
+                compiled=False,
+            )
+            self.use_liger_gkd_loss = True
+        super().__init__(
+            model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+            peft_config=peft_config,
+            formatting_func=formatting_func,
+        )
+        if args.teacher_model_init_kwargs is None:
+            teacher_model_init_kwargs = {}
+        elif not isinstance(teacher_model, str):
+            raise ValueError(
+                "You passed teacher_model_init_kwargs to the GKDConfig, but your teacher_model is already instantiated."
+            )
+        else:
+            teacher_model_init_kwargs = args.teacher_model_init_kwargs
+            teacher_model_init_kwargs["dtype"] = (
+                teacher_model_init_kwargs["dtype"]
+                if teacher_model_init_kwargs["dtype"] in ["auto", None]
+                else getattr(torch, teacher_model_init_kwargs["dtype"])
+            )
+        if isinstance(teacher_model, str):
+            teacher_model = AutoModelForCausalLM.from_pretrained(teacher_model, **teacher_model_init_kwargs)
+        # Disable dropout in the model
+        if args.disable_dropout:
+            disable_dropout_in_model(self.model)
+        if self.is_deepspeed_enabled:
+            self.teacher_model = prepare_deepspeed(teacher_model, self.accelerator)
+        else:
+            self.teacher_model = self.accelerator.prepare_model(teacher_model, evaluation_mode=True)
+        self.lmbda = args.lmbda
+        self.beta = args.beta
+        self.temperature = args.temperature
+        self.seq_kd = args.seq_kd
+        self.generation_config = GenerationConfig(
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            do_sample=True,
+            top_k=0,
+            use_cache=False if args.gradient_checkpointing else True,
+            pad_token_id=self.processing_class.pad_token_id,
+        )
+        # Set custom EOS tokens if they are specified by the model's generation
+        # config. This is important for models with the Llama 3 chat template,
+        # which use special tokens <|eot_id|> and <|eom_id|> to mark the end of
+        # turns or messages.
+        if (
+            hasattr(self.model.generation_config, "eos_token_id")
+            and self.model.generation_config.eos_token_id is not None
+        ):
+            self.generation_config.eos_token_id = self.model.generation_config.eos_token_id
+    @staticmethod
+    def generalized_jsd_loss(
+        student_logits, teacher_logits, labels=None, beta=0.5, temperature=1.0, reduction="batchmean"
+    ):
+        """
+        Compute the generalized Jensen-Shannon Divergence loss for knowledge distillation using F.kl_div. See Eq. (1)
+        of https://huggingface.co/papers/2306.13649 for the definition.
+        Args:
+            student_logits:
+                Tensor of shape (batch_size, sequence_length, vocab_size)
+            teacher_logits:
+                Tensor of shape (batch_size, sequence_length, vocab_size)
+            labels:
+                Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing
+                loss
+            beta:
+                Interpolation coefficient between 0 and 1 (default: 0.5)
+            temperature:
+                Softmax temperature (default: 1.0)
+            reduction:
+                Specifies the reduction to apply to the output (default: 'batchmean')
+        Returns:
+            loss: Scalar tensor with the generalized JSD loss
+        """
+        # Apply temperature scaling
+        student_logits = student_logits / temperature
+        teacher_logits = teacher_logits / temperature
+        # Compute log probabilities for student and probabilities for teacher
+        student_log_probs = F.log_softmax(student_logits, dim=-1)
+        teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)
+        if beta == 0:
+            jsd = F.kl_div(student_log_probs, teacher_log_probs, reduction="none", log_target=True)
+        elif beta == 1:
+            jsd = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True)
+        else:
+            # Compute the log of the mixture distribution
+            # log(a + b) = log(exp(log(a)) + exp(log(b))) -> for mixture
+            beta = torch.tensor(beta, dtype=student_log_probs.dtype)
+            mixture_log_probs = torch.logsumexp(
+                torch.stack([student_log_probs + torch.log(1 - beta), teacher_log_probs + torch.log(beta)]),
+                dim=0,
+            )
+            # Compute KL divergences using F.kl_div
+            # PyTorch differs from the standard mathematical definition, so the order of the probability distributions is swapped compared to that defined in the paper.
+            kl_teacher = F.kl_div(mixture_log_probs, teacher_log_probs, reduction="none", log_target=True)
+            kl_student = F.kl_div(mixture_log_probs, student_log_probs, reduction="none", log_target=True)
+            # Compute the Generalized Jensen-Shannon Divergence
+            jsd = beta * kl_teacher + (1 - beta) * kl_student
+        # Masking
+        if labels is not None:
+            mask = labels != -100
+            jsd = jsd[mask]
+        # Apply reduction
+        if reduction == "batchmean":
+            return jsd.sum() / mask.sum() if labels is not None else jsd.sum() / jsd.size(0)
+        elif reduction == "sum":
+            return jsd.sum()
+        elif reduction == "mean":
+            return jsd.mean()
+        else:
+            return jsd
+    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
+        if self.use_liger_gkd_loss:
+            # Forward only through the base models (avoid lm_head to save memory)
+            unwrapped_student = self.accelerator.unwrap_model(model)
+            if hasattr(unwrapped_student, "get_decoder") and unwrapped_student.get_decoder() is not None:
+                base_student = unwrapped_student.get_decoder()
+            else:
+                base_student = getattr(
+                    unwrapped_student, getattr(unwrapped_student, "base_model_prefix", "model"), unwrapped_student
+                )
+            student_outputs = base_student(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                output_hidden_states=True,
+                use_cache=False,
+            )
+            self.teacher_model.eval()
+            unwrapped_teacher = self.accelerator.unwrap_model(self.teacher_model)
+            if hasattr(unwrapped_teacher, "get_decoder") and unwrapped_teacher.get_decoder() is not None:
+                base_teacher = unwrapped_teacher.get_decoder()
+            else:
+                base_teacher = getattr(
+                    unwrapped_teacher, getattr(unwrapped_teacher, "base_model_prefix", "model"), unwrapped_teacher
+                )
+            with torch.no_grad():
+                teacher_outputs = base_teacher(
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
+                    output_hidden_states=True,
+                    use_cache=False,
+                )
+            # hidden states (shifted)
+            student_hidden = student_outputs.last_hidden_state[:, :-1].contiguous()
+            teacher_hidden = teacher_outputs.last_hidden_state[:, :-1].contiguous()
+            # labels mask and labels (shifted)
+            labels_mask = inputs["labels"] != -100
+            masked_input_ids = torch.where(
+                labels_mask, inputs["input_ids"], torch.full_like(inputs["input_ids"], -100)
+            )
+            true_labels = masked_input_ids[:, 1:].contiguous()
+            # heads
+            student_head = unwrapped_student.get_output_embeddings()
+            teacher_head = unwrapped_teacher.get_output_embeddings()
+            # liger fused jsd loss
+            loss = self.liger_jsd_loss(
+                student_input=student_hidden,
+                student_weight=student_head.weight,
+                teacher_input=teacher_hidden,
+                teacher_weight=teacher_head.weight,
+                true_labels=true_labels,
+                student_bias=getattr(student_head, "bias", None),
+                teacher_bias=getattr(teacher_head, "bias", None),
+            )
+        else:
+            # compute student output
+            student_outputs = model(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+            )
+            # compute teacher output in eval mode
+            self.teacher_model.eval()
+            with torch.no_grad():
+                teacher_outputs = self.teacher_model(
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
+                )
+            # slice the logits for the generated tokens using the inputs["prompts"] lengths
+            prompt_lengths = inputs["prompts"].shape[1]
+            shifted_student_logits = student_outputs.logits[:, prompt_lengths - 1 : -1, :]
+            shifted_teacher_logits = teacher_outputs.logits[:, prompt_lengths - 1 : -1, :]
+            shifted_labels = inputs["labels"][:, prompt_lengths:]
+            # compute loss
+            loss = self.generalized_jsd_loss(
+                student_logits=shifted_student_logits,
+                teacher_logits=shifted_teacher_logits,
+                labels=shifted_labels,
+                beta=self.beta,
+            )
+        # empty cache
+        empty_cache()
+        # Return loss
+        return (loss, student_outputs) if return_outputs else loss
+    @staticmethod
+    def generate_on_policy_outputs(model, inputs, generation_config, pad_token_id=None):
+        # Generate output with respect to the prompt-only
+        generated_outputs = model.generate(
+            input_ids=inputs["prompts"],
+            attention_mask=inputs.get("prompt_attention_mask", None),
+            generation_config=generation_config,
+            return_dict_in_generate=True,
+        )
+        # Get the generated token IDs
+        generated_tokens = generated_outputs.sequences
+        # Calculate new attention mask
+        new_attention_mask = torch.ones_like(generated_tokens)
+        new_labels = generated_tokens.clone()
+        # If there's pad_token_id, set attention mask to 0 for padding tokens
+        if pad_token_id is not None:
+            new_labels[new_labels == pad_token_id] = -100
+            new_attention_mask[generated_tokens == pad_token_id] = 0
+        return generated_tokens, new_attention_mask, new_labels
+    def training_step(
+        self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
+    ) -> torch.Tensor:
+        """
+        Perform a training step for the Generalized Knowledge Distillation (GKD) model.
+        This method implements the on-policy learning approach described in the GKD paper. With probability
+        `self.lmbda`, it generates new responses using the student model, which are then used for training instead of
+        the original inputs.
+        """
+        if self.seq_kd:
+            with unwrap_model_for_generation(self.teacher_model, self.accelerator) as unwrapped_model:
+                new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs(
+                    unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id
+                )
+            inputs["input_ids"] = new_input_ids
+            inputs["attention_mask"] = new_attention_mask
+            inputs["labels"] = new_labels
+        if random.random() <= self.lmbda:
+            with unwrap_model_for_generation(model, self.accelerator) as unwrapped_model:
+                new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs(
+                    unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id
+                )
+            inputs["input_ids"] = new_input_ids
+            inputs["attention_mask"] = new_attention_mask
+            inputs["labels"] = new_labels
+        loss = super().training_step(model, inputs, num_items_in_batch)
+        return loss
+class UnslothGKDTrainer(_UnslothGKDTrainer):
+    """
+    Trainer for Generalized Knowledge Distillation (GKD) of language models.
+    For details on GKD, see the paper: [On-Policy Distillation of Language Models: Learning from Self-Generated
+    Mistakes](https://huggingface.co/papers/2306.13649).
+    Args:
+        model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
+            Model to be trained, or the string identifier of the model to be instantiated from a pretrained model.
+        teacher_model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
+            Teacher model for knowledge distillation, or the string identifier of the model to be instantiated from a
+            pretrained model.
+        args ([`GKDConfig`], *optional*):
+            Training arguments.
+        data_collator ([`~transformers.DataCollator`], *optional*):
+            Data collator to batch samples from the dataset. It defaults to a [`DataCollatorForChatML`] using the
+            `processing_class`.
+        train_dataset ([`~datasets.Dataset`], *optional*):
+            Dataset for training.
+        eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
+            Dataset for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+           Class to process the data.
+        compute_metrics (`Callable`, *optional*):
+            Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a
+            dictionary string to float.
+        callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
+            Callbacks to use during training.
+        optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
+            Tuple containing the optimizer and the learning rate scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable`, *optional*):
+            Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and
+            return the logits to be used for metrics computation.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the `model` will be
+            wrapped with the specified PEFT adapter.
+        formatting_func (`Callable`, *optional*):
+            Function to format the dataset. Must take in an example and return an example.
+    """
+    def __init__(
+        self,
+        model = None,
+        teacher_model = None,
+        args = None,
+        data_collator = None,
+        train_dataset = None,
+        eval_dataset = None,
+        processing_class = None,
+        compute_metrics = None,
+        callbacks = None,
+        preprocess_logits_for_metrics = None,
+        peft_config = None,
+        formatting_func = None,
+        **kwargs
+    ):
+        if args is None: args = UnslothGKDConfig()
+        use_bf16 = getattr(args, 'bf16', False)
+        if type(use_bf16) is not bool: use_bf16 = False
+        use_fp16 = getattr(args, 'fp16', False)
+        if type(use_fp16) is not bool: use_fp16 = False
+        force_float32 = False
+        full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1'
+        if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'):
+            print('Unsloth: Switching to float32 training since model cannot work with float16')
+            force_float32 = True
+        mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32')
+        dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None)
+        if dtype is None: dtype = model.get_input_embeddings().weight.dtype
+        from unsloth_zoo.utils import _get_dtype
+        dtype = _get_dtype(dtype)
+        float16 = dtype == torch.float16
+        if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')
+        if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`')
+        if force_float32:
+            # Forced float32 training
+            args.fp16 = False
+            args.bf16 = False
+            os.environ['ACCELERATE_MIXED_PRECISION'] = 'no'
+            if hasattr(args, 'mixed_precision'): args.mixed_precision = 'no'
+            # args.mixed_precision is a new argument which needs to be set now
+        elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32':
+            # Mixed precision training
+            args.fp16 = float16
+            args.bf16 = not float16
+            os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16'
+            if hasattr(args, 'mixed_precision'): args.mixed_precision = 'fp16' if float16 else 'bf16'
+            # args.mixed_precision is a new argument which needs to be set now
+        elif mixed_precision_dtype == 'bfloat16':
+            # Both False since bfloat16 full finetuning doesn't do any autocasting.
+            args.fp16 = False
+            args.bf16 = False
+            os.environ['ACCELERATE_MIXED_PRECISION'] = 'no'
+            if hasattr(args, 'mixed_precision'): args.mixed_precision = 'no'
+            # args.mixed_precision is a new argument which needs to be set now
+        if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no':
+            args.eval_strategy = 'steps'
+            if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1
+        ga_steps = getattr(args, 'gradient_accumulation_steps', None)
+        if ga_steps is not None and ga_steps > 1:
+            from transformers import __version__ as transformers_version
+            if Version(transformers_version) <= Version('4.45.2'):
+                print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n'
+                      '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`')
+        if getattr(args, 'eval_strategy', 'no') != 'no':
+            eval_bsz = getattr(args, 'per_device_eval_batch_size', 8)
+            if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size
+            if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps
+        fp16_full_eval = getattr(args, 'fp16_full_eval', False)
+        if type(fp16_full_eval) is not bool: fp16_full_eval = False
+        bf16_full_eval = getattr(args, 'bf16_full_eval', False)
+        if type(bf16_full_eval) is not bool: bf16_full_eval = False
+        if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True
+        if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False
+        if force_float32:
+            args.bf16_full_eval = False
+            args.fp16_full_eval = False
+        elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16':
+            args.bf16_full_eval = True
+            args.fp16_full_eval = False
+        elif not bf16_full_eval and not fp16_full_eval:
+            args.bf16_full_eval = args.bf16
+            args.fp16_full_eval = args.fp16
+        _output_logits = False
+        if locals().get('compute_metrics', None) is not None: _output_logits = True
+        if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True
+        if _output_logits:
+            os.environ['UNSLOTH_RETURN_LOGITS'] = '1'
+        if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'):
+            pass
+        else:
+            model_max_seq_length = getattr(model, 'max_seq_length', None)
+            args_max_seq_length  = getattr(args,  'max_seq_length', None)
+            if args_max_seq_length is None and model_max_seq_length is not None:
+                max_seq_length = model.max_seq_length
+                if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length
+        if model is not None and hasattr(model, 'for_training'):
+            model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True))
+        if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right'
+        if 'processing_class' in locals():
+            if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right'
+            if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right'
+        __tokenizer = processing_class if 'processing_class' in locals() else tokenizer
+        from unsloth_zoo.vision_utils import UnslothVisionDataCollator
+        if not isinstance(data_collator, UnslothVisionDataCollator):
+            if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names:
+                data_collator = TransformersDataCollatorForLanguageModeling(
+                    __tokenizer,
+                    mlm = False,
+                    mlm_probability = 0.0,
+                    pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                )
+            elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names:
+                data_collator = DataCollatorForSeq2Seq(
+                    __tokenizer,
+                    pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                )
+        else:
+            if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False
+            if hasattr(args, 'dataset_text_field'): args.dataset_text_field = ''
+            if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True}
+        if not isinstance(data_collator, UnslothVisionDataCollator):
+            if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'):
+                if isinstance(data_collator, DataCollatorForSeq2Seq):
+                    data_collator = DataCollatorForSeq2Seq(
+                        __tokenizer.tokenizer,
+                        pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                    )
+                else:
+                    data_collator = TransformersDataCollatorForLanguageModeling(
+                        __tokenizer.tokenizer,
+                        mlm = False,
+                        mlm_probability = 0.0,
+                        pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                    )
+        other_metrics = []
+        from unsloth_zoo.logging_utils import PatchRLStatistics
+        PatchRLStatistics('gkd_trainer', other_metrics)
+        # [TODO] Fix up DataParallel multiplying batch sizes
+        # [TODO] DDP works, but DP seems to not work? [TODO]
+        if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1:
+            if getattr(args, "_n_gpu", 1) != 1:
+                args._n_gpu = 1
+        if "model" in locals() and hasattr(model, "for_training"):
+            model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True))
+        super().__init__(
+            model = model,
+            teacher_model = teacher_model,
+            args = args,
+            data_collator = data_collator,
+            train_dataset = train_dataset,
+            eval_dataset = eval_dataset,
+            processing_class = processing_class,
+            compute_metrics = compute_metrics,
+            callbacks = callbacks,
+            preprocess_logits_for_metrics = preprocess_logits_for_metrics,
+            peft_config = peft_config,
+            formatting_func = formatting_func,**kwargs)
+        if "model" in locals() and hasattr(model, "for_inference"):
+            model.for_inference()
+        if hasattr(self, 'neftune_hook_handle'):
+            self.neftune_hook_handle.remove()
+            if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle
+        if getattr(args, 'neftune_noise_alpha', None) is not None:
+            model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha
+        pass
+        if hasattr(self, 'accelerator'):
+            scaler = self.accelerator.scaler
+            current_model = model
+            while hasattr(current_model, 'model'):
+                current_model.accelerator_scaler = scaler
+                current_model = current_model.model
+            current_model.accelerator_scaler = scaler
+        pass
+        if hasattr(self, 'train'):
+            self.train = MethodType(prepare_for_training_mode(self.__class__.train), self)
+        pass
+pass