project-llm-trainer 0.7.9__py3-none-any.whl → 0.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of project-llm-trainer might be problematic.
- llm_trainer/dpo_trainer.py +3 -3
- llm_trainer/grpo_trainer.py +3 -3
- llm_trainer/sft_trainer.py +3 -3
- llm_trainer/train_configs.py +5 -4
- llm_trainer/trainer.py +53 -40
- project_llm_trainer-0.8.2.data/scripts/ddp_train +21 -0
- project_llm_trainer-0.8.2.data/scripts/ds_train +17 -0
- {project_llm_trainer-0.7.9.data → project_llm_trainer-0.8.2.data}/scripts/smart_train +7 -16
- {project_llm_trainer-0.7.9.dist-info → project_llm_trainer-0.8.2.dist-info}/METADATA +1 -1
- {project_llm_trainer-0.7.9.dist-info → project_llm_trainer-0.8.2.dist-info}/RECORD +16 -16
- project_llm_trainer-0.7.9.data/scripts/ddp_train +0 -24
- project_llm_trainer-0.7.9.data/scripts/ds_train +0 -30
- {project_llm_trainer-0.7.9.data → project_llm_trainer-0.8.2.data}/scripts/calc_intermediate_size +0 -0
- {project_llm_trainer-0.7.9.data → project_llm_trainer-0.8.2.data}/scripts/plot_loss +0 -0
- {project_llm_trainer-0.7.9.data → project_llm_trainer-0.8.2.data}/scripts/plot_lr +0 -0
- {project_llm_trainer-0.7.9.data → project_llm_trainer-0.8.2.data}/scripts/py_train +0 -0
- {project_llm_trainer-0.7.9.dist-info → project_llm_trainer-0.8.2.dist-info}/WHEEL +0 -0
- {project_llm_trainer-0.7.9.dist-info → project_llm_trainer-0.8.2.dist-info}/top_level.txt +0 -0
llm_trainer/dpo_trainer.py
CHANGED
@@ -70,12 +70,12 @@ class DPOTrainer(Trainer):
 
         return criterion, None
 
-    def _convert_train_args(self) -> Tuple[dict, dict, dict, bool]:
+    def _convert_train_args(self) -> Tuple[dict, dict, dict]:
         dpo_collate_fn = get_dpo_collate_fn(self.train_config.mask_prompt)
-        parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim = super()._convert_train_args()
+        parallel_kwargs, data_loader_kwargs, sampler_kwargs = super()._convert_train_args()
         data_loader_kwargs.update({"collate_fn": dpo_collate_fn})
 
-        return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
+        return parallel_kwargs, data_loader_kwargs, sampler_kwargs
 
     def _create_dataset(self, file_idx) -> Tuple[Dataset, str]:
         file_path = self.train_config.file_dataset[file_idx]
llm_trainer/grpo_trainer.py
CHANGED
@@ -82,11 +82,11 @@ class GRPOTrainer(Trainer):
 
         return criterion, None
 
-    def _convert_train_args(self) -> Tuple[dict, dict, dict, bool]:
-        parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim = super()._convert_train_args()
+    def _convert_train_args(self) -> Tuple[dict, dict, dict]:
+        parallel_kwargs, data_loader_kwargs, sampler_kwargs = super()._convert_train_args()
         data_loader_kwargs.update({"collate_fn": lambda x: x})
 
-        return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
+        return parallel_kwargs, data_loader_kwargs, sampler_kwargs
 
     def _create_dataset(self, file_idx) -> Tuple[Dataset, str]:
         file_path = self.train_config.file_dataset[file_idx]
llm_trainer/sft_trainer.py
CHANGED
@@ -23,12 +23,12 @@ class SFTTrainer(Trainer):
         )
         self.packed_sequences = False
 
-    def _convert_train_args(self) -> Tuple[dict, dict, dict, bool]:
+    def _convert_train_args(self) -> Tuple[dict, dict, dict]:
         sft_collate_fn = get_sft_collate_fn(self.train_config.mask_prompt)
-        parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim = super()._convert_train_args()
+        parallel_kwargs, data_loader_kwargs, sampler_kwargs = super()._convert_train_args()
         data_loader_kwargs.update({"collate_fn": sft_collate_fn})
 
-        return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
+        return parallel_kwargs, data_loader_kwargs, sampler_kwargs
 
     def _create_dataset(self, file_idx) -> Tuple[Dataset, str]:
         file_path = self.train_config.file_dataset[file_idx]
llm_trainer/train_configs.py
CHANGED
@@ -107,7 +107,8 @@ class DataLoaderConfig:
 
 
 @dataclass(kw_only=True)
-class LrConfig:
+class OptimConfig:
+    optim_type: str = 'adam'  # or 'lion'
     enable_lr_scheduler: bool = False
     initial_lr: float
     weight_decay: float = 0.1
@@ -195,8 +196,8 @@ class TrainConfig:
             This configuration has no effect during GRPO training!
         eval_batch_interval (`int`, default is 100):
             Run a model eval every this many batches
-        lr_config (`LrConfig`):
-            lr configuration options
+        optim_config (`OptimConfig`):
+            optim configuration options
         data_loader_config: (`DataLoaderConfig`):
             data loader configuration options
         kd_config: (`KDConfig`, *Optional*, default is None):
@@ -213,7 +214,7 @@ class TrainConfig:
     image_tags_file_dataset: Optional[FileDataset] = None
 
     loss_config: LossConfig = field(default_factory=LossConfig)
-    lr_config: LrConfig = field(default_factory=LrConfig)
+    optim_config: OptimConfig = field(default_factory=OptimConfig)
 
     ds_config: DsConfig = field(default_factory=DsConfig)
 
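Migration note: with this rename, code that configured the optimizer through the 0.7.9 lr_config field now passes optim_config instead, and can opt into the Lion optimizer via the new optim_type field. A minimal runnable sketch of the new dataclass, reconstructed from the hunks above (scheduler fields that trainer.py reads below, such as warmup_iters, min_lr and max_lr, are omitted here):

    from dataclasses import dataclass

    # Illustrative stand-in for llm_trainer.train_configs.OptimConfig;
    # only the fields visible in this diff are reproduced.
    @dataclass(kw_only=True)
    class OptimConfig:
        optim_type: str = 'adam'           # new in 0.8.2: 'adam' or 'lion'
        enable_lr_scheduler: bool = False
        initial_lr: float                  # non-default after defaults is fine with kw_only=True
        weight_decay: float = 0.1

    cfg = OptimConfig(initial_lr=3e-4, optim_type='lion')
    print(cfg)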
llm_trainer/trainer.py
CHANGED
@@ -77,19 +77,15 @@ class Trainer:
         if self.eval_image_tags:
             assert len(self.eval_prompts) == len(self.eval_image_tags)
 
-        parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim = self._convert_train_args()
-        self.parallel_kwargs = parallel_kwargs
-        self.data_loader_kwargs: dict[str, Any] = data_loader_kwargs
-        self.sampler_kwargs: dict[str, Any] = sampler_kwargs
-
+        self.parallel_kwargs, self.data_loader_kwargs, self.sampler_kwargs = self._convert_train_args()
         # initialize a GradScaler. If enabled=False scaler is a no-op
         self.scalar = torch.GradScaler(enabled=TrainerTools().use_amp)
 
         # Note: the learning rate must be scaled with the number of GPUs:
         # during training the loss gradient sets the descent direction and the learning rate sets the step size; with two GPUs the combined step is: average learning rate * 2
-        initial_lr = train_config.lr_config.initial_lr
+        initial_lr = train_config.optim_config.initial_lr
 
-        self.train_model, self.optimizer = self._init_train_model_and_optim(initial_lr, parallel_kwargs, use_ds_optim)
+        self.train_model, self.optimizer = self._init_train_model_and_optim(initial_lr)
         self.lr_scheduler = self._init_lr_scheduler(initial_lr)
 
         self.criterion, self.kd_loss = self._init_loss()
@@ -127,12 +123,7 @@ class Trainer:
         freeze_llm_model = self.train_config.freeze_llm_model
         return model.parameters() if not freeze_llm_model else filter(lambda p: p.requires_grad, model.parameters())
 
-    def _init_train_model_and_optim(
-            self,
-            initial_lr: float,
-            parallel_kwargs: dict,
-            use_ds_optim: bool
-    ):
+    def _init_train_model_and_optim(self, initial_lr: float):
         model = self._new_model(self.train_config)
 
         if self.train_config.init_state_dict:
@@ -161,34 +152,58 @@ class Trainer:
         total_size_mb = total_size_bytes / (1024 * 1024)
         log(f"Total size of the model: {total_size_mb:.2f} MB")
 
-        if use_ds_optim:
-            import deepspeed
-            origin_optim = deepspeed.ops.adam.DeepSpeedCPUAdam(
-                self._get_trainable_params(model),
-                lr=initial_lr,
-                weight_decay=self.train_config.lr_config.weight_decay
-            )
-        else:
-            origin_optim = torch.optim.AdamW(
-                self._get_trainable_params(model),
-                lr=initial_lr,
-                weight_decay=self.train_config.lr_config.weight_decay
-            )
         model, optim = TrainerTools().parallel.process(
             model=model,
-            optimizer=origin_optim,
-            kwargs=parallel_kwargs
+            optimizer=self._get_optim(model, initial_lr),
+            kwargs=self.parallel_kwargs
         )
 
         return model, optim
 
+    def _get_optim(self, model, initial_lr):
+        optimizer = None
+
+        if isinstance(TrainerTools().parallel, DsParallel) and self.parallel_kwargs:
+            import deepspeed
+            if ('zero_optimization' in self.parallel_kwargs
+                    and 'offload_optimizer' in self.parallel_kwargs['zero_optimization']
+                    and self.parallel_kwargs['zero_optimization']['offload_optimizer']['device'] == 'cpu'):
+                # offload the optimizer to cpu
+                # deepspeed.ops.lion.cpu_lion.DeepSpeedCPULion cannot be used???
+                # so the lion check is skipped here
+                optimizer = deepspeed.ops.adam.DeepSpeedCPUAdam
+                if self.train_config.optim_config.optim_type == 'lion':
+                    log('When offload_optimizer is set, the lion optim is unsupported, so optim falls back to adam!')
+            else:
+                if self.train_config.optim_config.optim_type == 'lion':
+                    optimizer = deepspeed.ops.lion.FusedLion
+                else:
+                    optimizer = deepspeed.ops.adam.FusedAdam
+
+        if not optimizer:
+            if self.train_config.optim_config.optim_type == 'lion':
+                try:
+                    import lion_pytorch
+                except:
+                    raise Exception('lion is not detected, please run `pip3 install lion_pytorch` to install it or set optim_type to adam')
+
+                optimizer = lion_pytorch.Lion
+            else:
+                optimizer = torch.optim.AdamW
+
+        return optimizer(
+            self._get_trainable_params(model),
+            lr=initial_lr,
+            weight_decay=self.train_config.optim_config.weight_decay
+        )
+
     def _init_lr_scheduler(self, initial_lr: float) -> LRScheduler:
-        if self.train_config.lr_config.enable_lr_scheduler:
-            warmup_iters = self.train_config.lr_config.warmup_iters
-            min_lr = self.train_config.lr_config.min_lr
-            max_lr = self.train_config.lr_config.max_lr
-            cosine_annealing_period = self.train_config.lr_config.cosine_annealing_period
-            cosine_annealing_period_mul = self.train_config.lr_config.cosine_annealing_period_mul
+        if self.train_config.optim_config.enable_lr_scheduler:
+            warmup_iters = self.train_config.optim_config.warmup_iters
+            min_lr = self.train_config.optim_config.min_lr
+            max_lr = self.train_config.optim_config.max_lr
+            cosine_annealing_period = self.train_config.optim_config.cosine_annealing_period
+            cosine_annealing_period_mul = self.train_config.optim_config.cosine_annealing_period_mul
 
         return WarmupCosineAnnealingLRScheduler(
             optimizer=self.optimizer,
@@ -220,9 +235,8 @@ class Trainer:
 
         return criterion, kd_loss
 
-    def _convert_train_args(self) -> Tuple[dict, dict, dict, bool]:
+    def _convert_train_args(self) -> Tuple[dict, dict, dict]:
         parallel_kwargs: Optional[Dict[str, Any]] = None
-        use_ds_optim: bool = False
         if isinstance(TrainerTools().parallel, DsParallel) and self.train_config.ds_config:
             parallel_kwargs = {
                 'gradient_accumulation_steps': 1,
@@ -253,7 +267,6 @@ class Trainer:
                     "device": zero_config.offload_optimizer.device,
                     "pin_memory": zero_config.offload_optimizer.pin_memory
                 }
-                use_ds_optim = True
             if zero_config.offload_param is not None:
                 zero_optimization['offload_param'] = {
                     "device": zero_config.offload_param.device,
@@ -328,10 +341,10 @@ class Trainer:
             "drop_last": dataloader_args.data_loader_drop_last,
         }
 
-        return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
+        return parallel_kwargs, data_loader_kwargs, sampler_kwargs
 
     def _init_ref_model_args(self) -> dict:
-        parallel_kwargs = copy.deepcopy(self.parallel_kwargs)
+        parallel_kwargs = copy.deepcopy(self.parallel_kwargs) if self.parallel_kwargs else None
 
         if parallel_kwargs and isinstance(TrainerTools().parallel, DsParallel):
             # reference to https://github.com/huggingface/trl/blob/main/trl/models/utils.py:prepare_deepspeed
@@ -435,7 +448,7 @@ class Trainer:
             exception_file = e.__traceback__.tb_frame.f_globals["__file__"]
             exception_line = e.__traceback__.tb_lineno
             log_msg = f"epoch: {epoch}, batch: {batch}, {e} at {exception_file} line {exception_line}\n"
-            log(log_msg, f'{log_dir}
+            log(log_msg, f'{log_dir}exception.txt')
 
             raise e
 
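The new _get_optim chooses the optimizer class in three tiers: deepspeed.ops.adam.DeepSpeedCPUAdam when ZeRO offloads the optimizer to CPU (where a requested lion falls back to adam), FusedLion/FusedAdam under DeepSpeed otherwise, and lion_pytorch.Lion or torch.optim.AdamW outside DeepSpeed. A standalone sketch of that last tier (pick_optimizer is a hypothetical helper, not part of the package):

    import torch

    def pick_optimizer(params, optim_type: str, lr: float, weight_decay: float):
        # Mirrors the final fallback branch of Trainer._get_optim above:
        # 'lion' needs the optional lion_pytorch package; anything else
        # gets torch.optim.AdamW.
        if optim_type == 'lion':
            try:
                from lion_pytorch import Lion
            except ImportError as exc:
                raise RuntimeError('pip3 install lion_pytorch, or set optim_type to adam') from exc
            return Lion(params, lr=lr, weight_decay=weight_decay)
        return torch.optim.AdamW(params, lr=lr, weight_decay=weight_decay)

    # Usage sketch:
    model = torch.nn.Linear(8, 8)
    optim = pick_optimizer(model.parameters(), 'adam', lr=3e-4, weight_decay=0.1)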
project_llm_trainer-0.8.2.data/scripts/ddp_train
ADDED
@@ -0,0 +1,21 @@
+#!python
+
+if __name__ == '__main__':
+    import os, sys
+    arguments = sys.argv[1:]
+    # file_name
+    run_file_name = arguments[0]
+
+    extra_args = ''
+    if len(arguments) > 1:
+        extra_args = f"{' '.join(arguments[1:])} "
+
+    os.environ['PARALLEL_TYPE'] = 'ddp'
+
+    if len(extra_args) == 0:
+        extra_args = '--standalone --nproc_per_node=gpu '
+
+    command = f'torchrun {extra_args}{run_file_name}'
+
+    print(f'run command {command}')
+    os.system(command)
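Usage note (inferred from the script above, not from package documentation; train.py is a hypothetical entry file): the first argument is the training script and everything after it is forwarded verbatim to torchrun; the --standalone --nproc_per_node=gpu defaults apply only when no extra arguments are given.

    ddp_train train.py                                  # torchrun --standalone --nproc_per_node=gpu train.py
    ddp_train train.py --standalone --nproc_per_node=2  # torchrun --standalone --nproc_per_node=2 train.py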
project_llm_trainer-0.8.2.data/scripts/ds_train
ADDED
@@ -0,0 +1,17 @@
+#!python
+
+if __name__ == '__main__':
+    import os, sys
+    arguments = sys.argv[1:]
+    # file_name
+    run_file_name = arguments[0]
+
+    extra_args = ''
+    if len(arguments) > 1:
+        extra_args = f"{' '.join(arguments[1:])} "
+
+    os.environ['PARALLEL_TYPE'] = 'ds'
+    command = f'deepspeed {extra_args}{run_file_name}'
+
+    print(f'run command {command}')
+    os.system(command)
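Usage note (inferred from the script above; train.py is hypothetical): arguments after the entry file are forwarded to the deepspeed launcher, replacing the positional device arguments that the 0.7.9 ds_train accepted (see the deleted script near the end of this diff).

    ds_train train.py --include localhost:0,1   # deepspeed --include localhost:0,1 train.py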
{project_llm_trainer-0.7.9.data → project_llm_trainer-0.8.2.data}/scripts/smart_train
RENAMED
@@ -7,18 +7,9 @@ if __name__ == '__main__':
     # file name
     run_file_name = arguments[0]
 
-    # cuda_visible_devive
+    extra_args = ''
     if len(arguments) > 1:
-        # 0,1,2,3
-        cuda_visible_devive = arguments[1]
-    else:
-        cuda_visible_devive = None
-
-    # cuda location
-    if len(arguments) > 2:
-        cuda_loc = arguments[2]
-    else:
-        cuda_loc = 'localhost'
+        extra_args = f"{' '.join(arguments[1:])} "
 
     try:
         import deepspeed
@@ -33,12 +24,12 @@ if __name__ == '__main__':
     os.environ['PARALLEL_TYPE'] = parallel_type
 
     if parallel_type == 'ds':
-        cuda_ctrl = f' --include {cuda_loc}:{cuda_visible_devive}' if cuda_visible_devive else ''
-        command = f'deepspeed{cuda_ctrl} {run_file_name}'
+        command = f'deepspeed {extra_args}{run_file_name}'
     elif parallel_type == 'ddp':
-        if cuda_visible_devive:
-            os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devive
-        command = f'torchrun --standalone --nproc_per_node=gpu {run_file_name}'
+        if len(extra_args) == 0:
+            extra_args = '--standalone --nproc_per_node=gpu '
+
+        command = f'torchrun {extra_args}{run_file_name}'
     else:
         command = f'python3 {run_file_name}'
 
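Like the new ddp_train and ds_train above, smart_train now forwards launcher arguments verbatim instead of taking positional CUDA-device arguments. A condensed sketch of the dispatch, reconstructed from the hunks above (build_command is a hypothetical helper; the code that sets parallel_type sits outside this diff):

    # Python sketch of smart_train's command construction in 0.8.2.
    def build_command(parallel_type: str, extra_args: str, run_file_name: str) -> str:
        if parallel_type == 'ds':
            return f'deepspeed {extra_args}{run_file_name}'
        if parallel_type == 'ddp':
            if len(extra_args) == 0:
                extra_args = '--standalone --nproc_per_node=gpu '
            return f'torchrun {extra_args}{run_file_name}'
        return f'python3 {run_file_name}'

    assert build_command('ddp', '', 'train.py') == 'torchrun --standalone --nproc_per_node=gpu train.py'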
{project_llm_trainer-0.7.9.dist-info → project_llm_trainer-0.8.2.dist-info}/RECORD
RENAMED
@@ -1,11 +1,11 @@
 llm_trainer/__init__.py,sha256=HWgtTEVeQSnZmEyYQm2K6eFEG4X2QAoigMlB5Z2tcXE,260
 llm_trainer/checkpoint.py,sha256=X5ZeUtJlxVz7pnWQLaS-y7UIZOaOAnZTt2L8rSAPzUs,4428
 llm_trainer/dataset.py,sha256=UL3fGeM4XSlyNQRZH-139u3LujqAQx3YyaxNRewk6LE,8935
-llm_trainer/dpo_trainer.py,sha256=
+llm_trainer/dpo_trainer.py,sha256=Qi7WKhFO4fdnj9W8BNIF_so6-F8g_YKUoPU9sNjWK_M,13320
 llm_trainer/ds_checkpoint.py,sha256=X2IWgpgi0yOtogph7n6DEwvK_0Ceb7juu1WMutv3HSk,2270
 llm_trainer/eval.py,sha256=ZyUfSo2Q8P-lrCdPEnGkoo5pGubd0AabREK5eMISRII,1109
 llm_trainer/generate_utils.py,sha256=8K3YFbp7IF_lCkmkzjHhqTW26EBFb2AilQmarVcfMvs,15001
-llm_trainer/grpo_trainer.py,sha256=
+llm_trainer/grpo_trainer.py,sha256=3CcV-cuyV4ZUTymN9vz3au4uf3gZdyo8SGgSj2NEofs,16774
 llm_trainer/log.py,sha256=XwychwKF6gvFPhthCIZCAEUZ0G3DY3fiQrOHqPWsxz0,463
 llm_trainer/loss.py,sha256=RhTxftLMj1Tqc5pkUvJiZumfbMEPWL8GBGxdTfQggmk,6744
 llm_trainer/parallel.py,sha256=yjStV21DJ26yM8-0O6GTMxdFAcyShY5GsQWSZmbI7HU,4543
@@ -14,20 +14,20 @@ llm_trainer/parallel_ds.py,sha256=oy8RRxHud3rACWubFlJqqd0pjPEQhKeAPGPQUSdJX2c,11
 llm_trainer/parallel_none.py,sha256=TG6Pm829Dg-yQu-97O-EHV3FCARBlNcP47KkGFAs16E,676
 llm_trainer/partition_utils.py,sha256=eEYNhfEIF4hGzZ3OLa6sEBIECz261drptEz_n7fZYtk,8396
 llm_trainer/scheduler.py,sha256=LAI_0VxClsIQkix0bRoduRD4vPfVuIZDhZgTAT_KK8k,4901
-llm_trainer/sft_trainer.py,sha256=
+llm_trainer/sft_trainer.py,sha256=rSOGZx53jMgOuJdztfxQASYJ62uD0dVaih4IAnSwGBc,1787
 llm_trainer/tokenizer.py,sha256=0-xQCMz1xiPTDAZiYsVsiECSoZ_1eIvW9XsZOoFfakQ,7250
 llm_trainer/tools.py,sha256=5op5qrjjkK-Lr9oes5VxIVnOVYOYGoAdlIJq9mPUf64,2637
-llm_trainer/train_configs.py,sha256=
-llm_trainer/trainer.py,sha256=
+llm_trainer/train_configs.py,sha256=pPZkbliRdTnWSv3TUuTM23x9RDdMhGSPrxbNAyzDklY,7636
+llm_trainer/trainer.py,sha256=diP-1suOf2U5dY_R8QH5arAx4MgBrKW-GBQ2_ScGNM8,28799
 llm_trainer/utils.py,sha256=xC5plG-8-_Al5yIF5xIU5lroOcBBk98TEhtUJrazZPE,12305
-project_llm_trainer-0.7.9.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
-project_llm_trainer-0.7.9.data/scripts/ddp_train,sha256=
-project_llm_trainer-0.7.9.data/scripts/ds_train,sha256=
-project_llm_trainer-0.7.9.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
-project_llm_trainer-0.7.9.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
-project_llm_trainer-0.7.9.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
-project_llm_trainer-0.7.9.data/scripts/smart_train,sha256=
-project_llm_trainer-0.7.9.dist-info/METADATA,sha256=
-project_llm_trainer-0.7.9.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
-project_llm_trainer-0.7.9.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
-project_llm_trainer-0.7.9.dist-info/RECORD,,
+project_llm_trainer-0.8.2.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
+project_llm_trainer-0.8.2.data/scripts/ddp_train,sha256=eZSud6KYQAoKLsYB5QB-FI2zq5AZm6Apq1azKdupV3o,477
+project_llm_trainer-0.8.2.data/scripts/ds_train,sha256=41q4rOxwbvZDUY0FDdAIpG13PEaUWBpthhvFvww8uOc,388
+project_llm_trainer-0.8.2.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
+project_llm_trainer-0.8.2.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
+project_llm_trainer-0.8.2.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
+project_llm_trainer-0.8.2.data/scripts/smart_train,sha256=N8dp2n7k6bghGczedBVwOdtf1O66oM_cNPh9QmZt0bM,914
+project_llm_trainer-0.8.2.dist-info/METADATA,sha256=XlNe-d24OrjYkzrJMiQCjiZPT70QOFRd4K2XrVDWZiY,195
+project_llm_trainer-0.8.2.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+project_llm_trainer-0.8.2.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
+project_llm_trainer-0.8.2.dist-info/RECORD,,
project_llm_trainer-0.7.9.data/scripts/ddp_train
DELETED
@@ -1,24 +0,0 @@
-#!python
-
-if __name__ == '__main__':
-    import os, sys
-    arguments = sys.argv[1:]
-    # file_name
-    run_file_name = arguments[0]
-
-    # cuda_visible_devive
-    if len(arguments) > 1:
-        # 0,1,2,3
-        cuda_visible_devive = arguments[1]
-    else:
-        cuda_visible_devive = None
-
-    os.environ['PARALLEL_TYPE'] = 'ddp'
-
-    if cuda_visible_devive:
-        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devive
-
-    command = f'torchrun --standalone --nproc_per_node=gpu {run_file_name}'
-
-    print(f'run command {command}')
-    os.system(command)
project_llm_trainer-0.7.9.data/scripts/ds_train
DELETED
@@ -1,30 +0,0 @@
-#!python
-
-if __name__ == '__main__':
-    import os, sys
-    arguments = sys.argv[1:]
-    # file_name
-    run_file_name = arguments[0]
-
-    # cuda_visible_devive
-    if len(arguments) > 1:
-        # 0,1,2,3
-        cuda_visible_devive = arguments[1]
-
-        # cuda location
-        if len(arguments) > 2:
-            cuda_loc = arguments[2]
-        else:
-            cuda_loc = 'localhost'
-    else:
-        cuda_visible_devive = None
-        cuda_loc = None
-
-    os.environ['PARALLEL_TYPE'] = 'ds'
-
-    cuda_ctrl = f' --include {cuda_loc}:{cuda_visible_devive}' if cuda_visible_devive else ''
-
-    command = f'deepspeed{cuda_ctrl} {run_file_name}'
-
-    print(f'run command {command}')
-    os.system(command)
{project_llm_trainer-0.7.9.data → project_llm_trainer-0.8.2.data}/scripts/calc_intermediate_size
RENAMED
File without changes
{project_llm_trainer-0.7.9.data → project_llm_trainer-0.8.2.data}/scripts/plot_loss
RENAMED
File without changes
{project_llm_trainer-0.7.9.data → project_llm_trainer-0.8.2.data}/scripts/plot_lr
RENAMED
File without changes
{project_llm_trainer-0.7.9.data → project_llm_trainer-0.8.2.data}/scripts/py_train
RENAMED
File without changes
{project_llm_trainer-0.7.9.dist-info → project_llm_trainer-0.8.2.dist-info}/WHEEL
RENAMED
File without changes
{project_llm_trainer-0.7.9.dist-info → project_llm_trainer-0.8.2.dist-info}/top_level.txt
RENAMED
File without changes