project-llm-trainer 0.7.0__py3-none-any.whl → 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: 0.7.2 adds a stray 'from sympy import false' import to llm_trainer/checkpoint.py (first hunk below). The imported name appears unused anywhere in the visible diff, and the import makes sympy a hard import-time dependency of the package.

llm_trainer/checkpoint.py CHANGED
@@ -2,6 +2,7 @@ import os
 from typing import Optional, Union, Tuple
 import shutil
 import torch
+from sympy import false
 from torch import nn
 from torch.optim import Optimizer
 from torch.nn.parallel import DistributedDataParallel as DDP
@@ -36,6 +37,10 @@ def save_best_checkpoint(
         current_loss: float,
         last_best_checkpoint_loss: Optional[float] = None
 ) -> bool:
+    # Allows disabling the best-checkpoint save
+    if os.environ.get('SAVE_BEST_CHECKPOINT', '1') != '1':
+        return False
+
     need_replace = not last_best_checkpoint_loss or current_loss <= last_best_checkpoint_loss
     if need_replace and TrainerTools().parallel.is_main_process:
         try:
@@ -62,8 +67,7 @@ def save_best_checkpoint(
                 os.remove(best_checkpoint_name)
 
             shutil.copy2(checkpoint_name, best_checkpoint_name)
-        except:
-            pass
+        except: pass
 
     TrainerTools().parallel.wait('save best checkpoint')
     return need_replace
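
Note: the new SAVE_BEST_CHECKPOINT gate returns before the collective wait() at the end of save_best_checkpoint, so it should be set identically on every rank to keep ranks in step. A minimal usage sketch; only the variable name and its default come from the hunk above, everything else is illustrative:

    import os

    # Any value other than the default '1' makes save_best_checkpoint()
    # return False before any loss comparison or file copy happens.
    os.environ['SAVE_BEST_CHECKPOINT'] = '0'
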
llm_trainer/dpo_trainer.py CHANGED
@@ -11,7 +11,7 @@ from .dataset import DPODataset
 from .loss import DPOLoss
 from .tools import TrainerTools
 from .utils import (
-    autocastcontext,
+    autocast,
     get_dpo_collate_fn
 )
 from .partition_utils import sync_model_params
@@ -203,7 +203,7 @@ class DPOTrainer(Trainer):
         if TrainerTools().parallel.parallel_train:
             self.train_model.require_backward_grad_sync = need_update_grad
 
-        with autocastcontext(TrainerTools().parallel.device_type):
+        with autocast(TrainerTools().parallel.device_type):
             policy_outputs = self.train_model(concat_inputs, attention_mask=concat_mask)
             policy_probs = self._logprobs(policy_outputs['logits'], concat_labels, concat_mask)
             aux_loss = policy_outputs.get('aux_loss')
llm_trainer/ds_checkpoint.py CHANGED
@@ -28,9 +28,11 @@ def save_ds_checkpoint(model: nn.Module):
 
     # Only runs on the main rank
     if TrainerTools().parallel.is_main_process:
+        # Maximum number of checkpoints to keep, defaults to 2
+        max_to_keep = int(os.environ.get('CKPT_MAX_TO_KEEP', '2'))
         # Delete historical checkpoints
         ckpt_paths = glob(os.path.join(ckpt_dir, "global_*"))
-        if len(ckpt_paths) > 2:
+        if len(ckpt_paths) > max_to_keep:
             # Sort by modification time to find the oldest directory
             oldest_ckpt = sorted(ckpt_paths, key=os.path.getmtime)[0]
             try:
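
CKPT_MAX_TO_KEEP makes the previously hard-coded retention limit of 2 configurable. A usage sketch; the variable name and default are from the hunk, the value here is illustrative:

    import os

    # Keep up to 5 global_* checkpoint directories before the oldest
    # (by mtime) is deleted on the next save.
    os.environ['CKPT_MAX_TO_KEEP'] = '5'

Note that the cleanup removes only the single oldest directory per save (sorted(...)[0]), so lowering the limit on an existing run trims the backlog one checkpoint at a time.
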
llm_trainer/generate_utils.py CHANGED
@@ -4,7 +4,7 @@ import torch
 from llm_model import VlmModel, KVCache
 from .tools import TrainerTools
 from .utils import (
-    autocastcontext,
+    autocast,
     batch_repeat_image_tok
 )
 
@@ -127,7 +127,6 @@ def _generate(
     If temperature is high but the output is still monotonous, increase k and p
     """
     use_kv_cache = True
-    ctx = autocastcontext(device)
 
     if isinstance(model, VlmModel):
        tokens = batch_repeat_image_tok(tokens, tokens_per_image)
@@ -141,7 +140,7 @@ def _generate(
     with torch.inference_mode():
         for _ in range(max_new_tokens):
             t = tokens  # tokens[:, -max_position_embeddings:]
-            with ctx:
+            with autocast(device):
                 result = model(
                     t,
                     past_key_values=kv_cache,
@@ -327,7 +326,6 @@ def batch_generate(
         device: Union[str, torch.device, int]
 ):
     use_kv_cache = True
-    ctx = autocastcontext(device)
 
     if isinstance(model, VlmModel):
         tokens = batch_repeat_image_tok(tokens, tokens_per_image)
@@ -350,7 +348,7 @@ def batch_generate(
             break
 
         t = tokens  # tokens[:, -max_position_embeddings:]
-        with ctx:
+        with autocast(device):
             result = model(
                 t,
                 attention_mask=attention_mask,
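
Besides the rename, both generation paths stop hoisting the context object and instead build one per decoding step. The two patterns are behaviorally equivalent; a self-contained sketch (autocast here is a stand-in no-op, not the package's helper):

    from contextlib import nullcontext

    def autocast(device_type):
        return nullcontext()  # stand-in for llm_trainer's helper

    device = 'cpu'

    # 0.7.0 pattern: one context object created outside the decode loop
    ctx = autocast(device)
    for _ in range(3):
        with ctx:
            pass  # model forward would run here

    # 0.7.2 pattern: a fresh context per step; same behavior, at the
    # cost of one extra (cheap) object construction per generated token
    for _ in range(3):
        with autocast(device):
            pass  # model forward would run here
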
llm_trainer/grpo_trainer.py CHANGED
@@ -13,7 +13,7 @@ from .loss import GRPOLoss
 from .tools import TrainerTools
 from .generate_utils import batch_generate
 from .log import log
-from .utils import autocastcontext
+from .utils import autocast
 
 from .partition_utils import (
     sync_model_params,
@@ -342,7 +342,7 @@ class GRPOTrainer(Trainer):
         log(f'start train for batch {batch}/{batch_count_per_file}')
 
         for grpo_step in range(self.train_config.grpo_config.grpo_steps):
-            with autocastcontext(TrainerTools().parallel.device_type):
+            with autocast(TrainerTools().parallel.device_type):
                 loss, aux_loss = self._maximize_grpo_objective(rollout_data)
                 if aux_loss_coef and aux_loss:
                     loss += aux_loss_coef * aux_loss
llm_trainer/trainer.py CHANGED
@@ -36,7 +36,7 @@ from .checkpoint import (
 
 from .utils import (
     set_seed,
-    autocastcontext,
+    autocast,
     create_doc_boundary_mask,
     generate_position_ids,
     pretrain_collate_fn,
@@ -556,7 +556,7 @@ class Trainer:
         if TrainerTools().parallel.parallel_train:
             self.train_model.require_backward_grad_sync = need_update_grad
 
-        with autocastcontext(TrainerTools().parallel.device_type):
+        with autocast(TrainerTools().parallel.device_type):
             result = self.train_model(
                 inputs,
                 attention_mask=attention_mask,
llm_trainer/utils.py CHANGED
@@ -15,7 +15,7 @@ def set_seed(seed=42):
     torch.cuda.manual_seed_all(seed)
 
 
-def autocastcontext(device_type):
+def autocast(device_type):
     if TrainerTools().use_amp:
         dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
         return torch.autocast(
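
The diff truncates the renamed helper after torch.autocast(. A plausible reconstruction as it would appear inside llm_trainer/utils.py; the non-AMP fallback to a no-op context is an assumption, not the package's verbatim code:

    from contextlib import nullcontext
    import torch

    from .tools import TrainerTools

    def autocast(device_type):
        # With AMP enabled, prefer bf16 where the GPU supports it, else fp16
        if TrainerTools().use_amp:
            dtype = (torch.bfloat16
                     if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
                     else torch.float16)
            return torch.autocast(device_type=device_type, dtype=dtype)

        # Assumed fallback: a no-op context so call sites can always `with` it
        return nullcontext()
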
project_llm_trainer-{0.7.0 → 0.7.2}.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: project_llm_trainer
-Version: 0.7.0
+Version: 0.7.2
 Summary: LLM and VLM trainer
 Author: qibin
 Author-email: qibin0506@gmail.com
project_llm_trainer-{0.7.0 → 0.7.2}.dist-info/RECORD CHANGED
@@ -1,11 +1,11 @@
 llm_trainer/__init__.py,sha256=HWgtTEVeQSnZmEyYQm2K6eFEG4X2QAoigMlB5Z2tcXE,260
-llm_trainer/checkpoint.py,sha256=gz31pZbbQvRTYrBhxV-MFaBAIFeqpe7rM6nFsjwT9lY,4328
+llm_trainer/checkpoint.py,sha256=-sHPwhZwJfiSpbHTDto7n_oagnSVmLe8pkcU9x217gs,4459
 llm_trainer/dataset.py,sha256=4QlOo0SFB5816BUYegQjgobUqTUMQvdmZMM_OEAMSjE,4347
-llm_trainer/dpo_trainer.py,sha256=_8ZwOKQH69c6Fa5Cey5hNep7XUoI4jPIXQaQcV3soGw,12367
-llm_trainer/ds_checkpoint.py,sha256=Wzy7PvVVWR794-BW4uragWFTAkkgDvjvkF-qMdyB4fc,2141
+llm_trainer/dpo_trainer.py,sha256=RMfbTsl3eav4yTJ2PK59mi6a0ECVOg8WwYVsHvMbNUE,12353
+llm_trainer/ds_checkpoint.py,sha256=X2IWgpgi0yOtogph7n6DEwvK_0Ceb7juu1WMutv3HSk,2270
 llm_trainer/eval.py,sha256=ZyUfSo2Q8P-lrCdPEnGkoo5pGubd0AabREK5eMISRII,1109
-llm_trainer/generate_utils.py,sha256=zX5218RX4ltahCQCZVVCWQghCWhKslPk2NUnl_CakIE,15050
-llm_trainer/grpo_trainer.py,sha256=0iWvpuMI5CDNIjH08Dd1ihZFqDYenVnHACiMY2GLJtg,16449
+llm_trainer/generate_utils.py,sha256=8K3YFbp7IF_lCkmkzjHhqTW26EBFb2AilQmarVcfMvs,15001
+llm_trainer/grpo_trainer.py,sha256=zxbLIzk34cHFw5yfRH8EBr0wrFTS7qFa5DepcC0WXwk,16435
 llm_trainer/log.py,sha256=XwychwKF6gvFPhthCIZCAEUZ0G3DY3fiQrOHqPWsxz0,463
 llm_trainer/loss.py,sha256=eYvOlCoguKnLvdGuqvQpGUoLVSADQ5coaU3DWYbJEdM,6811
 llm_trainer/parallel.py,sha256=yjStV21DJ26yM8-0O6GTMxdFAcyShY5GsQWSZmbI7HU,4543
@@ -18,16 +18,16 @@ llm_trainer/sft_trainer.py,sha256=LudTRIaqLQYy6ym6jjMX7v9xtFBJelrR3nnPCwb48nM,18
 llm_trainer/tokenizer.py,sha256=SSpgXtb0e1NtQqRW0gCq09TTZi47umggy-Fh5EMHKJg,6708
 llm_trainer/tools.py,sha256=5op5qrjjkK-Lr9oes5VxIVnOVYOYGoAdlIJq9mPUf64,2637
 llm_trainer/train_configs.py,sha256=U4hwXWKI6svDqiDOu6RPTitCzpxEYyjZUN6gwh_co8c,7510
-llm_trainer/trainer.py,sha256=2TC2GJeoGd0fDE6CFodk1chsSkk0v0yO0wrFYim5t4g,27938
-llm_trainer/utils.py,sha256=ox2fWtSOS7F2Nh7_FoHxuQgaps1jGW3q59VXz04wRuA,11491
-project_llm_trainer-0.7.0.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
-project_llm_trainer-0.7.0.data/scripts/ddp_train,sha256=x81AasaN2-9TwARFFF1l7iV1LmfMQ0bLw0i_CGbOwSw,299
-project_llm_trainer-0.7.0.data/scripts/ds_train,sha256=qL3qc3TcedBCw98UZUjW07ONcErRawLE1HymW2AmscA,265
-project_llm_trainer-0.7.0.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
-project_llm_trainer-0.7.0.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
-project_llm_trainer-0.7.0.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
-project_llm_trainer-0.7.0.data/scripts/smart_train,sha256=Pmt4Q0to4Hoz82iB9uFPZuz7uahNUbfE7FR1940EBy8,716
-project_llm_trainer-0.7.0.dist-info/METADATA,sha256=Q_UU9xBZIIBFOmfQJg1708lFfYn4bu5FA0fuxJCCcxQ,195
-project_llm_trainer-0.7.0.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
-project_llm_trainer-0.7.0.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
-project_llm_trainer-0.7.0.dist-info/RECORD,,
+llm_trainer/trainer.py,sha256=jS31zEXIIj9BoPTPlmaGYq61x72HGCjKfS2u3_gOkDk,27924
+llm_trainer/utils.py,sha256=xcdzpvPvXRKqsOK2yB7PZ9GmOvZMDFcglDPUZY2hJTY,11484
+project_llm_trainer-0.7.2.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
+project_llm_trainer-0.7.2.data/scripts/ddp_train,sha256=x81AasaN2-9TwARFFF1l7iV1LmfMQ0bLw0i_CGbOwSw,299
+project_llm_trainer-0.7.2.data/scripts/ds_train,sha256=qL3qc3TcedBCw98UZUjW07ONcErRawLE1HymW2AmscA,265
+project_llm_trainer-0.7.2.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
+project_llm_trainer-0.7.2.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
+project_llm_trainer-0.7.2.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
+project_llm_trainer-0.7.2.data/scripts/smart_train,sha256=Pmt4Q0to4Hoz82iB9uFPZuz7uahNUbfE7FR1940EBy8,716
+project_llm_trainer-0.7.2.dist-info/METADATA,sha256=WYohRO3Qb9o9QD3UZWqWmtoEOzoYJNWmj1_Olds6P4c,195
+project_llm_trainer-0.7.2.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+project_llm_trainer-0.7.2.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
+project_llm_trainer-0.7.2.dist-info/RECORD,,
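
For reference, the sha256= values in RECORD use the wheel hash format (PEP 427): a urlsafe-base64 sha256 digest with the trailing '=' padding stripped. A small checker for files extracted from the wheel; the path in the example is illustrative:

    import base64, hashlib

    def record_hash(path):
        # RECORD hash format per PEP 427: urlsafe base64, '=' padding removed
        with open(path, 'rb') as f:
            digest = hashlib.sha256(f.read()).digest()
        return 'sha256=' + base64.urlsafe_b64encode(digest).rstrip(b'=').decode()

    # e.g. record_hash('llm_trainer/checkpoint.py') should return
    # 'sha256=-sHPwhZwJfiSpbHTDto7n_oagnSVmLe8pkcU9x217gs' for the 0.7.2 wheel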