project-llm-trainer 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.



llm_trainer/checkpoint.py CHANGED
@@ -38,29 +38,32 @@ def save_best_checkpoint(
 ) -> bool:
     need_replace = not last_best_checkpoint_loss or current_loss <= last_best_checkpoint_loss
     if need_replace and TrainerTools().parallel.is_main_process:
-        if isinstance(TrainerTools().parallel, DsParallel):
-            checkpoint_dir = os.environ.get('DIST_CHECKPOINT_DIR', 'checkpoint')
+        try:
+            if isinstance(TrainerTools().parallel, DsParallel):
+                checkpoint_dir = os.environ.get('DIST_CHECKPOINT_DIR', 'checkpoint')
 
-            if checkpoint_dir.endswith('/'):
-                best_checkpoint_dir = f'{checkpoint_dir[:-1]}_best'
-            else:
-                best_checkpoint_dir = f'{checkpoint_dir}_best'
+                if checkpoint_dir.endswith('/'):
+                    best_checkpoint_dir = f'{checkpoint_dir[:-1]}_best'
+                else:
+                    best_checkpoint_dir = f'{checkpoint_dir}_best'
 
-            if not os.path.exists(best_checkpoint_dir):
-                os.makedirs(best_checkpoint_dir)
+                if not os.path.exists(best_checkpoint_dir):
+                    os.makedirs(best_checkpoint_dir)
 
-            if os.path.exists(checkpoint_dir):
-                shutil.rmtree(best_checkpoint_dir)
-                shutil.copytree(checkpoint_dir, best_checkpoint_dir)
-        else:
-            checkpoint_name = os.environ.get('CHECKPOINT_NAME', DEFAULT_CHECKPOINT_NAME)
-            best_checkpoint_name = f'{checkpoint_name}_best'
+                if os.path.exists(checkpoint_dir):
+                    shutil.rmtree(best_checkpoint_dir)
+                    shutil.copytree(checkpoint_dir, best_checkpoint_dir)
+            else:
+                checkpoint_name = os.environ.get('CHECKPOINT_NAME', DEFAULT_CHECKPOINT_NAME)
+                best_checkpoint_name = f'{checkpoint_name}_best'
 
-            if os.path.exists(checkpoint_name):
-                if os.path.exists(best_checkpoint_name):
-                    os.remove(best_checkpoint_name)
+                if os.path.exists(checkpoint_name):
+                    if os.path.exists(best_checkpoint_name):
+                        os.remove(best_checkpoint_name)
 
-                shutil.copy2(checkpoint_name, best_checkpoint_name)
+                    shutil.copy2(checkpoint_name, best_checkpoint_name)
+        except:
+            pass
 
     TrainerTools().parallel.wait()
     return need_replace
@@ -101,17 +104,14 @@ def save_steps(global_steps: int, lr_scheduler: Optional[LRScheduler] = None):
     # For now, only the main process's steps are saved.
     if TrainerTools().parallel.is_main_process:
         steps_checkpoint_name = f"{os.environ.get('LOG_DIR', './')}steps.pt"
-        ckpt = {'global_steps': global_steps, 'lr_steps': lr_scheduler.cur_steps}
+        ckpt = {'global_steps': global_steps}
+        ckpt.update(lr_scheduler.get_ckpt_dict())
         torch.save(ckpt, steps_checkpoint_name)
 
 
-def load_steps(
-        default_global_steps: int = 0,
-        default_lr_steps: int = 0
-) -> Tuple[Optional[int], Optional[int]]:
+def load_steps() -> Optional[dict]:
     steps_checkpoint_name = f"{os.environ.get('LOG_DIR', './')}steps.pt"
     if os.path.exists(steps_checkpoint_name):
-        ckpt = torch.load(steps_checkpoint_name, weights_only=True)
-        return ckpt['global_steps'], ckpt['lr_steps']
+        return torch.load(steps_checkpoint_name, weights_only=True)
 
-    return default_global_steps, default_lr_steps
+    return None
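The net effect on the steps checkpoint: save_steps now flattens the scheduler's state into the same dict as global_steps, and load_steps hands the whole dict (or None) back to the caller instead of a tuple. A minimal round-trip sketch, using a hypothetical SchedulerStub in place of the real LRScheduler subclasses:

import os
import torch

class SchedulerStub:
    # Hypothetical stand-in for an LRScheduler implementing the new hooks.
    def __init__(self):
        self.cur_steps = 0
        self._current_lr = 1e-4

    def get_ckpt_dict(self) -> dict:
        return {'cur_lr': self._current_lr, 'lr_steps': self.cur_steps}

    def restore_ckpt_dict(self, ckpt: dict):
        if ckpt['cur_lr']:
            self._current_lr = ckpt['cur_lr']
        if ckpt['lr_steps']:
            self.cur_steps = ckpt['lr_steps']

steps_checkpoint_name = f"{os.environ.get('LOG_DIR', './')}steps.pt"
scheduler = SchedulerStub()

# Saving: global_steps plus whatever the scheduler reports, in one flat dict.
ckpt = {'global_steps': 120}
ckpt.update(scheduler.get_ckpt_dict())
torch.save(ckpt, steps_checkpoint_name)

# Loading: the caller gets the raw dict back (or None when no file exists)
# and feeds it to the scheduler, rather than unpacking (steps, lr_steps).
if os.path.exists(steps_checkpoint_name):
    restored = torch.load(steps_checkpoint_name, weights_only=True)
    scheduler.restore_ckpt_dict(restored)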
llm_trainer/scheduler.py CHANGED
@@ -15,15 +15,18 @@ class LRScheduler(ABC):
     @abstractmethod
     def cur_lr(self): ...
 
-    @abstractmethod
-    def update_steps(self, steps): ...
-
     @abstractmethod
     def step(self): ...
 
     @abstractmethod
     def can_clip_grad(self): ...
 
+    @abstractmethod
+    def get_ckpt_dict(self) -> dict: ...
+
+    @abstractmethod
+    def restore_ckpt_dict(self, ckpt: dict): ...
+
 
 class WarmupCosineAnnealingLRScheduler(LRScheduler):
     def __init__(
@@ -72,11 +75,6 @@ class WarmupCosineAnnealingLRScheduler(LRScheduler):
     def cur_lr(self):
         return self._current_lr
 
-    def update_steps(self, steps):
-        log(f'update step to {steps}')
-        self._steps = steps
-        self._update_lr()
-
     def step(self):
         self._steps += 1
         self._update_lr()
@@ -122,6 +120,33 @@ class WarmupCosineAnnealingLRScheduler(LRScheduler):
         if self.need_log:
             log(f"step={self.cur_steps},lr={lr}\n", f'{get_log_dir()}lr.txt')
 
+    def get_ckpt_dict(self) -> dict:
+        return {
+            'cur_lr': self._current_lr,
+            'lr_steps': self.cur_steps,
+            'cosine_annealing_base_lr': self._cosine_annealing_base_lr,
+            't_cur': self.T_cur,
+            'cycle': self.cycle,
+        }
+
+    def restore_ckpt_dict(self, ckpt: dict):
+        if ckpt['cur_lr']:
+            self._current_lr = ckpt['cur_lr']
+
+        if ckpt['lr_steps']:
+            self._steps = ckpt['lr_steps']
+
+        if ckpt['cosine_annealing_base_lr']:
+            self._cosine_annealing_base_lr = ckpt['cosine_annealing_base_lr']
+
+        if ckpt['t_cur']:
+            self.T_cur = ckpt['t_cur']
+
+        if ckpt['cycle']:
+            self.cycle = ckpt['cycle']
+
+        self._update_lr()
+
 
 class NoneLRScheduler(LRScheduler):
     def __init__(self, initial_lr):
@@ -135,9 +160,14 @@ class NoneLRScheduler(LRScheduler):
     def cur_lr(self):
         return self._current_lr
 
-    def update_steps(self, steps): ...
-
     def step(self): ...
 
     def can_clip_grad(self):
-        return True
+        return True
+
+    def get_ckpt_dict(self) -> dict:
+        return {'cur_lr': self._current_lr}
+
+    def restore_ckpt_dict(self, ckpt: dict):
+        if ckpt['cur_lr']:
+            self._current_lr = ckpt['cur_lr']
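One consequence of the `if ckpt[...]:` truthiness guards in both restore_ckpt_dict implementations: falsy saved values (0, 0.0) are silently skipped, so a scheduler checkpointed at step 0, or with a warmup learning rate still at 0.0, keeps its in-memory defaults on restore. A standalone illustration of the guard (not library code; the values are made up):

# A checkpoint captured before the first step stores falsy values.
ckpt = {'cur_lr': 0.0, 'lr_steps': 0}

# Mimic the restore guard: `if ckpt['lr_steps']:` is False for 0,
# so the stored value is never applied.
steps = 7  # pretend in-memory state
if ckpt['lr_steps']:
    steps = ckpt['lr_steps']
assert steps == 7  # the saved 0 was ignored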
llm_trainer/trainer.py CHANGED
@@ -92,12 +92,15 @@ class Trainer:
             device=TrainerTools().parallel.device
         )
 
-        last_global_steps, last_lr_steps = load_steps(0, -1)
-        self.last_global_steps = last_global_steps
-        log(f'last_global_steps={last_global_steps}, last_lr_steps={last_lr_steps}')
+        steps_dict = load_steps()
+        if steps_dict:
+            self.last_global_steps = steps_dict['global_steps']
+            if not self.last_global_steps:
+                self.last_global_steps = 0
 
-        if last_lr_steps != -1:
-            self.lr_scheduler.update_steps(last_lr_steps)
+            self.lr_scheduler.restore_ckpt_dict(steps_dict)
+
+            log(f'restore steps_dict = {steps_dict}')
 
         if isinstance(train_config.model_config, VLMConfig):
             self.pixel_values_provider = train_config.pixel_values_provider
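For downstream code that called load_steps directly, the resume pattern changes shape as shown above. A hedged before/after sketch of a call site, with the library pieces stubbed out (the real load_steps and scheduler live in llm_trainer; the stub return value is made up):

from typing import Optional

def load_steps() -> Optional[dict]:
    # Stub standing in for llm_trainer.checkpoint.load_steps (0.5.7):
    # returns the saved steps dict, or None when no checkpoint exists.
    return {'global_steps': 120, 'cur_lr': 5e-5, 'lr_steps': 120}

class SchedulerStub:
    # Stand-in for an LRScheduler with the new restore hook.
    def restore_ckpt_dict(self, ckpt: dict):
        print(f'restoring from {ckpt}')

lr_scheduler = SchedulerStub()

# 0.5.5 style (removed):
#   last_global_steps, last_lr_steps = load_steps(0, -1)
#   if last_lr_steps != -1:
#       lr_scheduler.update_steps(last_lr_steps)

# 0.5.7 style: one dict drives both the trainer and the scheduler.
last_global_steps = 0
steps_dict = load_steps()
if steps_dict:
    last_global_steps = steps_dict['global_steps'] or 0
    lr_scheduler.restore_ckpt_dict(steps_dict)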
project_llm_trainer-0.5.7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: project_llm_trainer
-Version: 0.5.5
+Version: 0.5.7
 Summary: LLM and VLM trainer
 Author: qibin
 Author-email: qibin0506@gmail.com
project_llm_trainer-0.5.7.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
 llm_trainer/__init__.py,sha256=HWgtTEVeQSnZmEyYQm2K6eFEG4X2QAoigMlB5Z2tcXE,260
-llm_trainer/checkpoint.py,sha256=UVjOaDsiSIzRJ5VJZib6iXrdKv2A7K_gtJw3a9wNyoM,4293
+llm_trainer/checkpoint.py,sha256=RoRlIB-Qtvl3MyY3g0FbEBHLpFRLnEMZLpEncXOLToQ,4242
 llm_trainer/dataset.py,sha256=4QlOo0SFB5816BUYegQjgobUqTUMQvdmZMM_OEAMSjE,4347
 llm_trainer/dpo_trainer.py,sha256=1A_4QP2_xqM_YeqdXy-0RaMvEL80gim-pgnPQyHww9U,12052
 llm_trainer/ds_checkpoint.py,sha256=D092fkS1Up4QmpV9YCpqbSzfX_caCAeX-UiOrhOE1I8,1947
@@ -13,21 +13,21 @@ llm_trainer/parallel_ddp.py,sha256=Pob9vUlBZnkL4oP1Re11kFob7nufMSE96pn7m7fuOEM,1
 llm_trainer/parallel_ds.py,sha256=oy8RRxHud3rACWubFlJqqd0pjPEQhKeAPGPQUSdJX2c,1145
 llm_trainer/parallel_none.py,sha256=TG6Pm829Dg-yQu-97O-EHV3FCARBlNcP47KkGFAs16E,676
 llm_trainer/partition_utils.py,sha256=xzv8kwlbKp3dai2pBwX89gN5ymeHk1bGbTkGru5H-UM,5167
-llm_trainer/scheduler.py,sha256=lyC9TFuF_y8EXYq9d-WAqN4CSaq_w9kSKeh_BOo3EpI,4039
+llm_trainer/scheduler.py,sha256=LAI_0VxClsIQkix0bRoduRD4vPfVuIZDhZgTAT_KK8k,4901
 llm_trainer/sft_trainer.py,sha256=gxQA7T1o1QGUsHp2CX1Qb_fO5LppBJuNbc0H4ixCYUA,1783
 llm_trainer/tokenizer.py,sha256=SSpgXtb0e1NtQqRW0gCq09TTZi47umggy-Fh5EMHKJg,6708
 llm_trainer/tools.py,sha256=yF17lp6oOfLe2XJeKDQ1juZcbv-6vFamJSLwEeArduA,2975
 llm_trainer/train_configs.py,sha256=c6bgivkkWRYcPD3NzI5uRItAUhZiIBgKVMuMgVFRnFo,7336
-llm_trainer/trainer.py,sha256=YW59dJWTyQy77cLDGzBHhfinGyfkvmWCkl1SR9hM6a8,26071
+llm_trainer/trainer.py,sha256=sqN5cXsFAH9xe8-px6tAgcUe5nw6iZU5PEjT9mgEusE,26106
 llm_trainer/utils.py,sha256=LWNhyQ0NDEZ9mZtk2Ryvh6EulvHIaUGIflugSpqmeFI,6791
-project_llm_trainer-0.5.5.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
-project_llm_trainer-0.5.5.data/scripts/ddp_train,sha256=x81AasaN2-9TwARFFF1l7iV1LmfMQ0bLw0i_CGbOwSw,299
-project_llm_trainer-0.5.5.data/scripts/ds_train,sha256=qL3qc3TcedBCw98UZUjW07ONcErRawLE1HymW2AmscA,265
-project_llm_trainer-0.5.5.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
-project_llm_trainer-0.5.5.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
-project_llm_trainer-0.5.5.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
-project_llm_trainer-0.5.5.data/scripts/smart_train,sha256=Pmt4Q0to4Hoz82iB9uFPZuz7uahNUbfE7FR1940EBy8,716
-project_llm_trainer-0.5.5.dist-info/METADATA,sha256=ajxfapuo4Q2xfdJ3kjZoCzs7Q5ynGp6BssXRFOIbF7Y,195
-project_llm_trainer-0.5.5.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
-project_llm_trainer-0.5.5.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
-project_llm_trainer-0.5.5.dist-info/RECORD,,
+project_llm_trainer-0.5.7.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
+project_llm_trainer-0.5.7.data/scripts/ddp_train,sha256=x81AasaN2-9TwARFFF1l7iV1LmfMQ0bLw0i_CGbOwSw,299
+project_llm_trainer-0.5.7.data/scripts/ds_train,sha256=qL3qc3TcedBCw98UZUjW07ONcErRawLE1HymW2AmscA,265
+project_llm_trainer-0.5.7.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
+project_llm_trainer-0.5.7.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
+project_llm_trainer-0.5.7.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
+project_llm_trainer-0.5.7.data/scripts/smart_train,sha256=Pmt4Q0to4Hoz82iB9uFPZuz7uahNUbfE7FR1940EBy8,716
+project_llm_trainer-0.5.7.dist-info/METADATA,sha256=3yxEJlE4psbIzjpHGKnrpz04BT1n03vUR0xlnqu0-V0,195
+project_llm_trainer-0.5.7.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+project_llm_trainer-0.5.7.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
+project_llm_trainer-0.5.7.dist-info/RECORD,,