project-llm-trainer 0.4.9__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of project-llm-trainer might be problematic. Click here for more details.
- llm_trainer/trainer.py +7 -3
- {project_llm_trainer-0.4.9.dist-info → project_llm_trainer-0.4.10.dist-info}/METADATA +1 -1
- {project_llm_trainer-0.4.9.dist-info → project_llm_trainer-0.4.10.dist-info}/RECORD +12 -12
- {project_llm_trainer-0.4.9.data → project_llm_trainer-0.4.10.data}/scripts/calc_intermediate_size +0 -0
- {project_llm_trainer-0.4.9.data → project_llm_trainer-0.4.10.data}/scripts/ddp_train +0 -0
- {project_llm_trainer-0.4.9.data → project_llm_trainer-0.4.10.data}/scripts/ds_train +0 -0
- {project_llm_trainer-0.4.9.data → project_llm_trainer-0.4.10.data}/scripts/plot_loss +0 -0
- {project_llm_trainer-0.4.9.data → project_llm_trainer-0.4.10.data}/scripts/plot_lr +0 -0
- {project_llm_trainer-0.4.9.data → project_llm_trainer-0.4.10.data}/scripts/py_train +0 -0
- {project_llm_trainer-0.4.9.data → project_llm_trainer-0.4.10.data}/scripts/smart_train +0 -0
- {project_llm_trainer-0.4.9.dist-info → project_llm_trainer-0.4.10.dist-info}/WHEEL +0 -0
- {project_llm_trainer-0.4.9.dist-info → project_llm_trainer-0.4.10.dist-info}/top_level.txt +0 -0
llm_trainer/trainer.py
CHANGED
|
@@ -540,13 +540,17 @@ class Trainer:
|
|
|
540
540
|
if gradient_accumulation_steps > 1:
|
|
541
541
|
loss = loss / gradient_accumulation_steps
|
|
542
542
|
|
|
543
|
-
loss_accumulation += loss.detach()
|
|
543
|
+
loss_accumulation += loss.detach().item()
|
|
544
544
|
self._backward_loss(loss)
|
|
545
545
|
|
|
546
546
|
if need_update_grad:
|
|
547
|
+
loss_tensor = torch.tensor(loss_accumulation, device=TrainerTools().parallel.device)
|
|
548
|
+
|
|
547
549
|
# todo check all_reduce??
|
|
548
550
|
if TrainerTools().parallel.parallel_train:
|
|
549
|
-
dist.all_reduce(
|
|
551
|
+
dist.all_reduce(loss_tensor, dist.ReduceOp.AVG)
|
|
552
|
+
|
|
553
|
+
final_log_loss = loss_tensor.item()
|
|
550
554
|
|
|
551
555
|
# ds模式已经集成gradient_clipping
|
|
552
556
|
if not isinstance(TrainerTools().parallel, DsParallel) and self.lr_scheduler.can_clip_grad():
|
|
@@ -560,7 +564,7 @@ class Trainer:
|
|
|
560
564
|
epoch_tag=f'epoch: {epoch}',
|
|
561
565
|
file_tag=f'file: {file_idx + 1}/{file_count}',
|
|
562
566
|
batch_tag=f'batch: {batch}/{batch_count_per_file}',
|
|
563
|
-
loss=
|
|
567
|
+
loss=final_log_loss
|
|
564
568
|
)
|
|
565
569
|
# reset to default
|
|
566
570
|
loss_accumulation = 0.0
|
|
@@ -20,16 +20,16 @@ llm_trainer/sft_trainer.py,sha256=gxQA7T1o1QGUsHp2CX1Qb_fO5LppBJuNbc0H4ixCYUA,17
|
|
|
20
20
|
llm_trainer/tokenizer.py,sha256=A7TYYUbtPf75kjCvWP7yBui4xZBObMk2aPem62YpwpY,6776
|
|
21
21
|
llm_trainer/tools.py,sha256=O45-20wRmh-nyTfU-U-XtjbKAoe7boEIsUvWT_NaKx4,3041
|
|
22
22
|
llm_trainer/train_configs.py,sha256=gzTXMLUuQexRvqyKIZQ1U6ESa0DELD7hPpYZdrDcyxg,15974
|
|
23
|
-
llm_trainer/trainer.py,sha256=
|
|
23
|
+
llm_trainer/trainer.py,sha256=pUtJVRosn54j1hn76CFAptJcAsrDo59H6p8NMkg2zt4,25521
|
|
24
24
|
llm_trainer/utils.py,sha256=-ivhMF0d999va13S1wt2uBvtVw8Nvr3uBzhaUFKL04Q,6826
|
|
25
|
-
project_llm_trainer-0.4.
|
|
26
|
-
project_llm_trainer-0.4.
|
|
27
|
-
project_llm_trainer-0.4.
|
|
28
|
-
project_llm_trainer-0.4.
|
|
29
|
-
project_llm_trainer-0.4.
|
|
30
|
-
project_llm_trainer-0.4.
|
|
31
|
-
project_llm_trainer-0.4.
|
|
32
|
-
project_llm_trainer-0.4.
|
|
33
|
-
project_llm_trainer-0.4.
|
|
34
|
-
project_llm_trainer-0.4.
|
|
35
|
-
project_llm_trainer-0.4.
|
|
25
|
+
project_llm_trainer-0.4.10.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
|
|
26
|
+
project_llm_trainer-0.4.10.data/scripts/ddp_train,sha256=x81AasaN2-9TwARFFF1l7iV1LmfMQ0bLw0i_CGbOwSw,299
|
|
27
|
+
project_llm_trainer-0.4.10.data/scripts/ds_train,sha256=qL3qc3TcedBCw98UZUjW07ONcErRawLE1HymW2AmscA,265
|
|
28
|
+
project_llm_trainer-0.4.10.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
|
|
29
|
+
project_llm_trainer-0.4.10.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
|
|
30
|
+
project_llm_trainer-0.4.10.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
|
|
31
|
+
project_llm_trainer-0.4.10.data/scripts/smart_train,sha256=Pmt4Q0to4Hoz82iB9uFPZuz7uahNUbfE7FR1940EBy8,716
|
|
32
|
+
project_llm_trainer-0.4.10.dist-info/METADATA,sha256=zrHUkQPm7Zox2CSeYN5HBqedZebXuZAQgZVj0O24U6I,196
|
|
33
|
+
project_llm_trainer-0.4.10.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
|
|
34
|
+
project_llm_trainer-0.4.10.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
|
|
35
|
+
project_llm_trainer-0.4.10.dist-info/RECORD,,
|
{project_llm_trainer-0.4.9.data → project_llm_trainer-0.4.10.data}/scripts/calc_intermediate_size
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|