PyPI - project-llm-trainer - Versions diffs - 0.7.1__py3-none-any.whl → 0.7.2__py3-none-any.whl - Mend

project-llm-trainer 0.7.1 (py3-none-any.whl) → 0.7.2 (py3-none-any.whl)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of project-llm-trainer might be problematic. Click here for more details.

Files changed (13) hide show

llm_trainer/checkpoint.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 from typing import Optional, Union, Tuple
 import shutil
 import torch
+from sympy import false
 from torch import nn
 from torch.optim import Optimizer
 from torch.nn.parallel import DistributedDataParallel as DDP
@@ -36,6 +37,10 @@ def save_best_checkpoint(
         current_loss: float,
         last_best_checkpoint_loss: Optional[float] = None
 ) -> bool:
+    # 指定不保存最佳checkpoint
+    if os.environ.get('SAVE_BEST_CHECKPOINT', '1') != '1':
+        return False
     need_replace = not last_best_checkpoint_loss or current_loss <= last_best_checkpoint_loss
     if need_replace and TrainerTools().parallel.is_main_process:
         try:
@@ -62,8 +67,7 @@ def save_best_checkpoint(
                         os.remove(best_checkpoint_name)
                     shutil.copy2(checkpoint_name, best_checkpoint_name)
-        except:
-            pass
+        except: pass
     TrainerTools().parallel.wait('save best checkpoint')
     return need_replace

llm_trainer/ds_checkpoint.py CHANGED Viewed

@@ -28,9 +28,11 @@ def save_ds_checkpoint(model: nn.Module):
     # 只在main rank上执行
     if TrainerTools().parallel.is_main_process:
+        # 最多保存多少checkpoint，默认为2
+        max_to_keep = int(os.environ.get('CKPT_MAX_TO_KEEP', '2'))
         # 删除历史checkpoint
         ckpt_paths = glob(os.path.join(ckpt_dir, "global_*"))
-        if len(ckpt_paths) > 2:
+        if len(ckpt_paths) > max_to_keep:
             # 按修改时间排序，找到最旧的目录
             oldest_ckpt = sorted(ckpt_paths, key=os.path.getmtime)[0]
             try:

{project_llm_trainer-0.7.1.dist-info → project_llm_trainer-0.7.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: project_llm_trainer
-Version: 0.7.1
+Version: 0.7.2
 Summary: LLM and VLM trainer
 Author: qibin
 Author-email: qibin0506@gmail.com

{project_llm_trainer-0.7.1.dist-info → project_llm_trainer-0.7.2.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
 llm_trainer/__init__.py,sha256=HWgtTEVeQSnZmEyYQm2K6eFEG4X2QAoigMlB5Z2tcXE,260
-llm_trainer/checkpoint.py,sha256=gz31pZbbQvRTYrBhxV-MFaBAIFeqpe7rM6nFsjwT9lY,4328
+llm_trainer/checkpoint.py,sha256=-sHPwhZwJfiSpbHTDto7n_oagnSVmLe8pkcU9x217gs,4459
 llm_trainer/dataset.py,sha256=4QlOo0SFB5816BUYegQjgobUqTUMQvdmZMM_OEAMSjE,4347
 llm_trainer/dpo_trainer.py,sha256=RMfbTsl3eav4yTJ2PK59mi6a0ECVOg8WwYVsHvMbNUE,12353
-llm_trainer/ds_checkpoint.py,sha256=Wzy7PvVVWR794-BW4uragWFTAkkgDvjvkF-qMdyB4fc,2141
+llm_trainer/ds_checkpoint.py,sha256=X2IWgpgi0yOtogph7n6DEwvK_0Ceb7juu1WMutv3HSk,2270
 llm_trainer/eval.py,sha256=ZyUfSo2Q8P-lrCdPEnGkoo5pGubd0AabREK5eMISRII,1109
 llm_trainer/generate_utils.py,sha256=8K3YFbp7IF_lCkmkzjHhqTW26EBFb2AilQmarVcfMvs,15001
 llm_trainer/grpo_trainer.py,sha256=zxbLIzk34cHFw5yfRH8EBr0wrFTS7qFa5DepcC0WXwk,16435
@@ -20,14 +20,14 @@ llm_trainer/tools.py,sha256=5op5qrjjkK-Lr9oes5VxIVnOVYOYGoAdlIJq9mPUf64,2637
 llm_trainer/train_configs.py,sha256=U4hwXWKI6svDqiDOu6RPTitCzpxEYyjZUN6gwh_co8c,7510
 llm_trainer/trainer.py,sha256=jS31zEXIIj9BoPTPlmaGYq61x72HGCjKfS2u3_gOkDk,27924
 llm_trainer/utils.py,sha256=xcdzpvPvXRKqsOK2yB7PZ9GmOvZMDFcglDPUZY2hJTY,11484
-project_llm_trainer-0.7.1.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
-project_llm_trainer-0.7.1.data/scripts/ddp_train,sha256=x81AasaN2-9TwARFFF1l7iV1LmfMQ0bLw0i_CGbOwSw,299
-project_llm_trainer-0.7.1.data/scripts/ds_train,sha256=qL3qc3TcedBCw98UZUjW07ONcErRawLE1HymW2AmscA,265
-project_llm_trainer-0.7.1.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
-project_llm_trainer-0.7.1.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
-project_llm_trainer-0.7.1.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
-project_llm_trainer-0.7.1.data/scripts/smart_train,sha256=Pmt4Q0to4Hoz82iB9uFPZuz7uahNUbfE7FR1940EBy8,716
-project_llm_trainer-0.7.1.dist-info/METADATA,sha256=5O5GDggubLuaVquiTdCwB3K2v8dD2EwqVVFvsgeSyZM,195
-project_llm_trainer-0.7.1.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
-project_llm_trainer-0.7.1.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
-project_llm_trainer-0.7.1.dist-info/RECORD,,
+project_llm_trainer-0.7.2.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
+project_llm_trainer-0.7.2.data/scripts/ddp_train,sha256=x81AasaN2-9TwARFFF1l7iV1LmfMQ0bLw0i_CGbOwSw,299
+project_llm_trainer-0.7.2.data/scripts/ds_train,sha256=qL3qc3TcedBCw98UZUjW07ONcErRawLE1HymW2AmscA,265
+project_llm_trainer-0.7.2.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
+project_llm_trainer-0.7.2.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
+project_llm_trainer-0.7.2.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
+project_llm_trainer-0.7.2.data/scripts/smart_train,sha256=Pmt4Q0to4Hoz82iB9uFPZuz7uahNUbfE7FR1940EBy8,716
+project_llm_trainer-0.7.2.dist-info/METADATA,sha256=WYohRO3Qb9o9QD3UZWqWmtoEOzoYJNWmj1_Olds6P4c,195
+project_llm_trainer-0.7.2.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+project_llm_trainer-0.7.2.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
+project_llm_trainer-0.7.2.dist-info/RECORD,,