PyPI - project-llm-trainer - Versions diffs - 0.12.3__py3-none-any.whl - Mend

project-llm-trainer 0.12.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

llm_trainer/__init__.py +13 -0
llm_trainer/base_trainer.py +683 -0
llm_trainer/checkpoint.py +126 -0
llm_trainer/dataset.py +335 -0
llm_trainer/dpo_trainer.py +297 -0
llm_trainer/ds_checkpoint.py +63 -0
llm_trainer/eval.py +33 -0
llm_trainer/generate_utils.py +450 -0
llm_trainer/grpo_trainer.py +385 -0
llm_trainer/log.py +65 -0
llm_trainer/loss.py +268 -0
llm_trainer/parallel.py +220 -0
llm_trainer/partition_utils.py +219 -0
llm_trainer/ppo_trainer.py +521 -0
llm_trainer/scheduler.py +179 -0
llm_trainer/sft_trainer.py +97 -0
llm_trainer/tokenizer.py +162 -0
llm_trainer/tools.py +116 -0
llm_trainer/train_configs.py +324 -0
llm_trainer/trainer.py +34 -0
llm_trainer/utils.py +547 -0
project_llm_trainer-0.12.3.data/scripts/calc_intermediate_size +15 -0
project_llm_trainer-0.12.3.data/scripts/ddp_train +21 -0
project_llm_trainer-0.12.3.data/scripts/ds_train +17 -0
project_llm_trainer-0.12.3.data/scripts/plot_log +69 -0
project_llm_trainer-0.12.3.data/scripts/plot_lr +45 -0
project_llm_trainer-0.12.3.data/scripts/py_train +12 -0
project_llm_trainer-0.12.3.data/scripts/smart_train +37 -0
project_llm_trainer-0.12.3.dist-info/METADATA +9 -0
project_llm_trainer-0.12.3.dist-info/RECORD +32 -0
project_llm_trainer-0.12.3.dist-info/WHEEL +5 -0
project_llm_trainer-0.12.3.dist-info/top_level.txt +1 -0

llm_trainer/checkpoint.py ADDED Viewed

@@ -0,0 +1,126 @@
+import os
+from typing import Optional, Union
+import shutil
+import torch
+from torch import nn
+from torch.optim import Optimizer
+from torch.nn.parallel import DistributedDataParallel as DDP
+from .parallel import DsParallel
+from .scheduler import LRScheduler
+from .tools import TrainerTools
+# Fallback checkpoint file name, used when the CHECKPOINT_NAME environment variable is not set.
+DEFAULT_CHECKPOINT_NAME = "checkpoint.pth"
+def save_checkpoint(
+        model: nn.Module,
+        optimizer: Optional[Optimizer] = None
+):
+    """
+    Persist the model weights and, optionally, the optimizer state.
+    When the active parallel strategy is DsParallel the work is delegated to
+    save_ds_checkpoint(model). Otherwise only the main process writes one file,
+    whose path comes from the CHECKPOINT_NAME environment variable
+    (default: DEFAULT_CHECKPOINT_NAME).
+    :param model: model to save; a DDP wrapper is unwrapped to its inner module first
+    :param optimizer: when given, its state_dict is stored under 'optim_state_dict'
+    """
+    if isinstance(TrainerTools().parallel, DsParallel):
+        from .ds_checkpoint import save_ds_checkpoint
+        save_ds_checkpoint(model)
+        return
+    # Non-main processes have nothing to write
+    if not TrainerTools().parallel.is_main_process:
+        return
+    target_path = os.environ.get('CHECKPOINT_NAME', DEFAULT_CHECKPOINT_NAME)
+    unwrapped = model.module if isinstance(model, DDP) else model
+    payload = {'model_state_dict': unwrapped.state_dict()}
+    if optimizer:
+        payload['optim_state_dict'] = optimizer.state_dict()
+    torch.save(payload, target_path)
+def save_best_checkpoint(
+        current_loss: float,
+        last_best_checkpoint_loss: Optional[float] = None
+) -> bool:
+    """
+    Copy the current checkpoint to its "_best" sibling when the loss did not get worse.
+    The copy is made by the main process only; every process then meets at the
+    'save best checkpoint' barrier. With DsParallel the directory named by
+    DIST_CHECKPOINT_DIR (default 'checkpoint') is copied to '<dir>_best', otherwise
+    the file named by CHECKPOINT_NAME is copied to '<name>_best'.
+    Copy failures are reported as a warning and do not abort training.
+    :param current_loss: loss of the checkpoint that was just saved
+    :param last_best_checkpoint_loss: loss of the previous best checkpoint, None if there is none yet
+    :return: True when current_loss qualifies as the new best, False otherwise
+             (always False when SAVE_BEST_CHECKPOINT is set to anything other than '1')
+    """
+    # Saving the best checkpoint has been explicitly disabled
+    if os.environ.get('SAVE_BEST_CHECKPOINT', '1') != '1':
+        return False
+    # Compare with None explicitly: a previous best loss of 0.0 is a real value, not "missing"
+    need_replace = last_best_checkpoint_loss is None or current_loss <= last_best_checkpoint_loss
+    if need_replace and TrainerTools().parallel.is_main_process:
+        try:
+            if isinstance(TrainerTools().parallel, DsParallel):
+                checkpoint_dir = os.environ.get('DIST_CHECKPOINT_DIR', 'checkpoint')
+                if checkpoint_dir.endswith('/'):
+                    best_checkpoint_dir = f'{checkpoint_dir[:-1]}_best'
+                else:
+                    best_checkpoint_dir = f'{checkpoint_dir}_best'
+                os.makedirs(best_checkpoint_dir, exist_ok=True)
+                if os.path.exists(checkpoint_dir):
+                    # copytree needs a non-existing destination, so drop the old copy first
+                    shutil.rmtree(best_checkpoint_dir)
+                    shutil.copytree(checkpoint_dir, best_checkpoint_dir)
+            else:
+                checkpoint_name = os.environ.get('CHECKPOINT_NAME', DEFAULT_CHECKPOINT_NAME)
+                best_checkpoint_name = f'{checkpoint_name}_best'
+                if os.path.exists(checkpoint_name):
+                    # Copy to a temporary file and swap it in afterwards, so an interrupted
+                    # copy never destroys the previous best checkpoint
+                    tmp_checkpoint_name = f'{best_checkpoint_name}.tmp'
+                    shutil.copy2(checkpoint_name, tmp_checkpoint_name)
+                    os.replace(tmp_checkpoint_name, best_checkpoint_name)
+        except Exception as err:
+            # Best effort: keep training and still reach the barrier below, but report the
+            # failure. KeyboardInterrupt/SystemExit are deliberately not caught here.
+            import warnings
+            warnings.warn(f'failed to save best checkpoint: {err!r}')
+    TrainerTools().parallel.wait('save best checkpoint')
+    return need_replace
+def load_checkpoint(
+        model: nn.Module,
+        optimizer: Optional[Optimizer] = None,
+        device: Optional[Union[torch.device, str]] = None,
+        load_module_only: bool = False
+):
+    """
+    Restore model weights (and optionally optimizer state) from the last checkpoint.
+    With DsParallel the work is delegated to load_ds_checkpoint. Otherwise the file named
+    by the CHECKPOINT_NAME environment variable (default: DEFAULT_CHECKPOINT_NAME) is read;
+    nothing happens when that file does not exist.
+    :param model: model to load into; a DDP wrapper is unwrapped to its inner module first
+    :param optimizer: optimizer to restore, or None to load the weights only
+    :param device: forwarded to torch.load as map_location
+    :param load_module_only: forwarded to load_ds_checkpoint; not used on the non-DsParallel path
+    """
+    if isinstance(TrainerTools().parallel, DsParallel):
+        from .ds_checkpoint import load_ds_checkpoint
+        load_ds_checkpoint(model, load_module_only=load_module_only)
+        return
+    checkpoint_name = os.environ.get('CHECKPOINT_NAME', DEFAULT_CHECKPOINT_NAME)
+    if not os.path.exists(checkpoint_name):
+        return
+    state_dict = torch.load(checkpoint_name, weights_only=True, map_location=device)
+    raw_model = model.module if isinstance(model, DDP) else model
+    raw_model.load_state_dict(state_dict['model_state_dict'])
+    if optimizer is None:
+        return
+    # save_checkpoint only writes 'optim_state_dict' when it was given an optimizer,
+    # so the key may legitimately be missing from the file
+    optim_state = state_dict.get('optim_state_dict')
+    if optim_state is None:
+        import warnings
+        warnings.warn(f'{checkpoint_name} has no optimizer state, optimizer is left unchanged')
+        return
+    optimizer.load_state_dict(optim_state)
+def load_checkpoint_for_eval(
+        model: nn.Module,
+        device: Optional[Union[torch.device, str]] = None
+):
+    """
+    Load weights for evaluation; no optimizer state is restored.
+    :param model: model to load the weights into
+    :param device: passed on to load_checkpoint on the non-DsParallel path
+    """
+    if not isinstance(TrainerTools().parallel, DsParallel):
+        load_checkpoint(model, optimizer=None, device=device)
+        return
+    from .ds_checkpoint import load_ds_checkpoint_for_eval
+    load_ds_checkpoint_for_eval(model)
+def save_steps(
+    global_steps: int,
+    lr_scheduler: Optional[LRScheduler] = None,
+):
+    """
+    Store the training step counter (plus scheduler state) in '<LOG_DIR>steps.pt'.
+    LOG_DIR (default './') is prepended as-is, so it has to end with a path separator.
+    :param global_steps: step counter, saved under 'global_steps'
+    :param lr_scheduler: when given, the entries of its get_ckpt_dict() are saved as well
+    """
+    # For now only the main process saves its steps
+    if not TrainerTools().parallel.is_main_process:
+        return
+    log_dir = os.environ.get('LOG_DIR', './')
+    state = {'global_steps': global_steps}
+    if lr_scheduler:
+        state.update(lr_scheduler.get_ckpt_dict())
+    torch.save(state, f"{log_dir}steps.pt")
+def load_steps() -> Optional[dict]:
+    """
+    Read back the dict written by save_steps from '<LOG_DIR>steps.pt'.
+    LOG_DIR (default './') is prepended as-is, so it has to end with a path separator.
+    :return: the stored dict, or None when the file does not exist
+    """
+    log_dir = os.environ.get('LOG_DIR', './')
+    steps_path = f"{log_dir}steps.pt"
+    if not os.path.exists(steps_path):
+        return None
+    return torch.load(steps_path, weights_only=True)

llm_trainer/dataset.py ADDED Viewed

@@ -0,0 +1,335 @@
+import torch
+from torch.utils.data import Dataset
+import pickle
+import csv
+import json
+import numpy as np
+from .tools import TrainerTools
+from .utils import repeat_image_tok
+"""
+Supports npy, jsonl and pkl dataset files.
+"""
+def _get_file_type(file_path: str):
+    """
+    Map a file path to its dataset file type based on the extension.
+    :param file_path: path to inspect
+    :return: 'npy', 'jsonl' or 'pkl'; None for any other extension
+    """
+    for kind in ('npy', 'jsonl', 'pkl'):
+        if file_path.endswith(f'.{kind}'):
+            return kind
+    return None
+class PretrainDataset(Dataset):
+    """
+    Dataset for the pretrain stage. Supported file formats: npy, jsonl and pkl.
+    1. npy: [recommended] numpy array, opened with mmap so memory usage stays very low
+    2. jsonl: {'text': 'text1'}\n{'text': 'text2'}, every line is encoded to tokens during init
+    3. pkl: [0, 1, 2, 3 ...]
+    A sample is a window of block_size tokens; consecutive windows start stride tokens apart.
+    """
+    def __init__(
+            self,
+            file_path,
+            block_size,
+            stride
+    ):
+        """
+        :param file_path: path of a .npy, .jsonl or .pkl file
+        :param block_size: number of tokens in every sample
+        :param stride: distance in tokens between the starts of consecutive samples
+        :raises ValueError: if the file extension is not supported
+        """
+        super().__init__()
+        self.block_size = block_size
+        self.stride = stride
+        self.use_mmap = False
+        file_type = _get_file_type(file_path)
+        if file_type == 'npy':
+            self.input_ids = np.load(file_path, mmap_mode='r')
+            self.use_mmap = True
+        elif file_type == 'jsonl':
+            # Look the encoder up once instead of once per line
+            encode = TrainerTools().tokenizer.encode
+            tokens = []
+            # JSON text is UTF-8; do not depend on the platform default encoding
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        # Tolerate blank lines, e.g. a trailing newline at the end of the file
+                        continue
+                    tokens.extend(encode(json.loads(line)['text']))
+            self.input_ids = torch.tensor(tokens, dtype=torch.int32)
+            del tokens
+        elif file_type == 'pkl':
+            # NOTE(review): pickle.load can execute arbitrary code, only load trusted files
+            with open(file_path, 'rb') as f:
+                tokens = pickle.load(f)
+            self.input_ids = torch.tensor(tokens, dtype=torch.int32)
+            del tokens
+        else:
+            raise ValueError(f'unsupported file type for {file_path}')
+        # Number of complete windows; data shorter than one block yields no sample
+        if len(self.input_ids) < block_size:
+            self.length = 0
+        else:
+            self.length = (len(self.input_ids) - block_size) // stride + 1
+    def __len__(self):
+        """Return the number of complete block_size windows."""
+        return self.length
+    def __getitem__(self, item):
+        """
+        :param item: sample index in [0, len(self)); negative indices are rejected
+        :return: int64 tensor holding the tokens of window number item
+        :raises IndexError: if item is out of range
+        """
+        if item < 0 or item >= self.length:
+            raise IndexError(f"Index {item} out of range")
+        start_idx = item * self.stride
+        end_idx = start_idx + self.block_size
+        data = self.input_ids[start_idx:end_idx]
+        if self.use_mmap:
+            # astype copies the slice out of the memory map before it is wrapped in a tensor
+            return torch.from_numpy(data.astype(np.int64))
+        else:
+            return data.long()
+class SFTDataset(Dataset):
+    """
+    Dataset for the sft stage. Supported file formats: npy, jsonl and pkl.
+    jsonl conversations are kept as parsed JSON and encoded to tokens in __getitem__.
+    npy: [
+            [0, 1, 2, 3],
+            [4, 5, 6, 7]
+         ]
+    jsonl: [
+            {'role': 'system', 'content': 'system_content'},
+            {'role': 'user', 'content': 'user_content'},
+            {'role': 'assistant', 'think': 'think_content', 'content': 'assistant_content'}
+           ]\n
+           [
+            {'role': 'system', 'content': 'system_content'},
+            {'role': 'user', 'content': 'user_content'},
+            {'role': 'assistant', 'think': 'think_content', 'content': 'assistant_content'}
+           ]
+    pkl: [
+            [0, 1, 2, 3],
+            [4, 5, 6, 7]
+         ]
+    """
+    def __init__(
+            self,
+            file_path,
+            max_len,
+            image_tags_file_path=None,
+            tokens_per_image=-1
+    ):
+        """
+        :param file_path: path of a .npy, .jsonl or .pkl file
+        :param max_len: samples are truncated to this many tokens
+        :param image_tags_file_path: optional csv file; the first column of row i is the image tag of sample i
+        :param tokens_per_image: passed to repeat_image_tok; -1 turns image handling off
+        :raises ValueError: if the file extension is not supported
+        """
+        super().__init__()
+        self.max_len = max_len
+        self.tokens_per_image = tokens_per_image
+        self.input_ids = []
+        self.image_tags = []
+        self.plain_text = False
+        file_type = _get_file_type(file_path)
+        if file_type == 'npy':
+            try:
+                self.input_ids = np.load(file_path, mmap_mode='r')
+            except ValueError:
+                # Memory mapping was rejected, fall back to a regular load that allows pickled objects
+                # NOTE(review): allow_pickle=True can execute arbitrary code, only load trusted files
+                self.input_ids = np.load(file_path, allow_pickle=True)
+        elif file_type == 'jsonl':
+            self.plain_text = True
+            # JSON text is UTF-8; do not depend on the platform default encoding
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        # Tolerate blank lines, e.g. a trailing newline at the end of the file
+                        continue
+                    self.input_ids.append(json.loads(line))
+        elif file_type == 'pkl':
+            # NOTE(review): pickle.load can execute arbitrary code, only load trusted files
+            with open(file_path, 'rb') as f:
+                self.input_ids = pickle.load(f)
+        else:
+            raise ValueError(f'unsupported file type for {file_path}')
+        if image_tags_file_path:
+            # The csv module expects files opened with newline=''
+            with open(image_tags_file_path, 'r', encoding='utf-8', newline='') as f:
+                csv_reader = csv.reader(f)
+                for line in csv_reader:
+                    self.image_tags.append(line[0])
+    def __len__(self):
+        """Return the number of samples."""
+        return len(self.input_ids)
+    def __getitem__(self, item):
+        """
+        :param item: sample index
+        :return: {'inputs': int64 token tensor truncated to max_len,
+                  'image_tag': tag of the sample, None when image handling is off or no tags were loaded}
+        """
+        if self.plain_text:
+            inputs = TrainerTools().tokenizer.apply_chat_template(self.input_ids[item])
+        else:
+            inputs = self.input_ids[item]
+        if isinstance(inputs, np.ndarray):
+            inputs = torch.from_numpy(inputs.astype(np.int64))
+        else:
+            inputs = torch.tensor(inputs).long()
+        image_tag = self.image_tags[item] if self.image_tags else None
+        if self.tokens_per_image != -1:
+            inputs = repeat_image_tok(inputs, self.tokens_per_image)
+        else:
+            image_tag = None
+        # Truncate last, after repeat_image_tok may have changed the length
+        inputs = inputs[:self.max_len]
+        return {
+            'inputs': inputs,
+            'image_tag': image_tag
+        }
+class DPODataset(Dataset):
+    """
+    Dataset for the dpo stage. Supported file formats: npy, jsonl and pkl.
+    jsonl conversations are kept as parsed JSON and encoded to tokens in __getitem__.
+    npy: [
+            {'chosen': xxx, 'rejected': xxx},
+            {'chosen': xxx, 'rejected': xxx},
+         ]
+    jsonl: {'chosen':
+                [{'role': 'system', 'content': 'system_content'},
+                {'role': 'user', 'content': 'user_content'},
+                {'role': 'assistant', 'think': 'think_content', 'content': 'assistant_content'}],
+            'rejected':
+                [{'role': 'system', 'content': 'system_content'},
+                {'role': 'user', 'content': 'user_content'},
+                {'role': 'assistant', 'think': 'think_content', 'content': 'assistant_content'}],
+            }\n
+           {'chosen':
+                [{'role': 'system', 'content': 'system_content'},
+                {'role': 'user', 'content': 'user_content'},
+                {'role': 'assistant', 'think': 'think_content', 'content': 'assistant_content'}],
+            'rejected':
+                [{'role': 'system', 'content': 'system_content'},
+                {'role': 'user', 'content': 'user_content'},
+                {'role': 'assistant', 'think': 'think_content', 'content': 'assistant_content'}],
+            }
+    pkl: [
+            {'chosen': xxx, 'rejected': xxx},
+            {'chosen': xxx, 'rejected': xxx},
+         ]
+    """
+    def __init__(self, file_path, max_len):
+        """
+        :param file_path: path of a .npy, .jsonl or .pkl file
+        :param max_len: chosen and rejected sequences are truncated to this many tokens
+        :raises ValueError: if the file extension is not supported
+        """
+        super().__init__()
+        self.max_len = max_len
+        self.data = []
+        self.plain_text = False
+        file_type = _get_file_type(file_path)
+        if file_type == 'npy':
+            try:
+                self.data = np.load(file_path, mmap_mode='r')
+            except ValueError:
+                # Memory mapping was rejected, fall back to a regular load that allows pickled objects
+                # NOTE(review): allow_pickle=True can execute arbitrary code, only load trusted files
+                self.data = np.load(file_path, allow_pickle=True)
+        elif file_type == 'jsonl':
+            self.plain_text = True
+            # JSON text is UTF-8; do not depend on the platform default encoding
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        # Tolerate blank lines, e.g. a trailing newline at the end of the file
+                        continue
+                    self.data.append(json.loads(line))
+        elif file_type == 'pkl':
+            # NOTE(review): pickle.load can execute arbitrary code, only load trusted files
+            with open(file_path, 'rb') as f:
+                self.data = pickle.load(f)
+        else:
+            raise ValueError(f'unsupported file type for {file_path}')
+    def __len__(self):
+        """Return the number of chosen/rejected pairs."""
+        return len(self.data)
+    def __getitem__(self, item):
+        """
+        :param item: sample index
+        :return: {'chosen': ..., 'rejected': ...}, both truncated to max_len;
+                 numpy arrays are converted to lists first
+        """
+        record = self.data[item]
+        chosen_raw = record['chosen']
+        rejected_raw = record['rejected']
+        if self.plain_text:
+            chosen_id = TrainerTools().tokenizer.apply_chat_template(chosen_raw)
+            rejected_id = TrainerTools().tokenizer.apply_chat_template(rejected_raw)
+        else:
+            chosen_id = chosen_raw
+            rejected_id = rejected_raw
+        if isinstance(chosen_id, np.ndarray):
+            chosen_id = chosen_id.tolist()
+        if isinstance(rejected_id, np.ndarray):
+            rejected_id = rejected_id.tolist()
+        return {
+            'chosen': chosen_id[:self.max_len],
+            'rejected': rejected_id[:self.max_len]
+        }
+class RLDataset(Dataset):
+    """
+    Dataset for the RL stage (e.g. PPO, GRPO, GSPO). Supported file formats: npy, jsonl and pkl.
+    jsonl records are kept as parsed JSON and encoded to tokens in __getitem__.
+    npy: [
+            {'prompt': xxx, 'answer': xxx},
+            {'prompt': xxx, 'answer': xxx},
+         ]
+    jsonl: {'prompt':
+                [{'role': 'system', 'content': 'system_content'},
+                {'role': 'user', 'content': 'user_content'}],
+            'answer': '10'
+           }\n
+           {'prompt':
+                [{'role': 'system', 'content': 'system_content'},
+                {'role': 'user', 'content': 'user_content'}],
+            'answer': '10'
+           }
+    pkl: [
+            {'prompt': xxx, 'answer': xxx},
+            {'prompt': xxx, 'answer': xxx},
+         ]
+    """
+    def __init__(self, file_path):
+        """
+        :param file_path: path of a .npy, .jsonl or .pkl file
+        :raises ValueError: if the file extension is not supported
+        """
+        super().__init__()
+        self.data = []
+        self.plain_text = False
+        file_type = _get_file_type(file_path)
+        if file_type == 'npy':
+            try:
+                self.data = np.load(file_path, mmap_mode='r')
+            except ValueError:
+                # Memory mapping was rejected, fall back to a regular load that allows pickled objects
+                # NOTE(review): allow_pickle=True can execute arbitrary code, only load trusted files
+                self.data = np.load(file_path, allow_pickle=True)
+        elif file_type == 'jsonl':
+            self.plain_text = True
+            # JSON text is UTF-8; do not depend on the platform default encoding
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        # Tolerate blank lines, e.g. a trailing newline at the end of the file
+                        continue
+                    self.data.append(json.loads(line))
+        elif file_type == 'pkl':
+            # NOTE(review): pickle.load can execute arbitrary code, only load trusted files
+            with open(file_path, 'rb') as f:
+                self.data = pickle.load(f)
+        else:
+            raise ValueError(f'unsupported file type for {file_path}')
+    def __len__(self):
+        """Return the number of prompt/answer records."""
+        return len(self.data)
+    def __getitem__(self, item):
+        """
+        :param item: sample index
+        :return: {'prompt': int64 tensor, 'answer': int64 tensor or None when the record has no answer}
+        """
+        record = self.data[item]
+        prompt_raw = record['prompt']
+        answer_raw = record.get('answer', None)
+        if self.plain_text:
+            question = TrainerTools().tokenizer.apply_chat_template(prompt_raw)
+            # A missing or empty answer string is treated as "no answer"
+            answer = TrainerTools().tokenizer.encode(answer_raw) if answer_raw else None
+        else:
+            question = prompt_raw
+            answer = answer_raw
+        # Convert to tensors
+        if isinstance(question, np.ndarray):
+            prompt_tensor = torch.from_numpy(question.astype(np.int64))
+        else:
+            prompt_tensor = torch.tensor(question).long()
+        if answer is not None:
+            if isinstance(answer, np.ndarray):
+                answer_tensor = torch.from_numpy(answer.astype(np.int64))
+            else:
+                answer_tensor = torch.tensor(answer).long()
+        else:
+            answer_tensor = None
+        return {
+            'prompt': prompt_tensor,
+            'answer': answer_tensor
+        }