project-llm-trainer 0.7.9__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

llm_trainer/dpo_trainer.py CHANGED

@@ -70,12 +70,12 @@ class DPOTrainer(Trainer):
 
         return criterion, None
 
-    def _convert_train_args(self) -> Tuple[dict, dict, dict, bool]:
+    def _convert_train_args(self) -> Tuple[dict, dict, dict]:
         dpo_collate_fn = get_dpo_collate_fn(self.train_config.mask_prompt)
-        parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim = super()._convert_train_args()
+        parallel_kwargs, data_loader_kwargs, sampler_kwargs = super()._convert_train_args()
         data_loader_kwargs.update({"collate_fn": dpo_collate_fn})
 
-        return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
+        return parallel_kwargs, data_loader_kwargs, sampler_kwargs
 
     def _create_dataset(self, file_idx) -> Tuple[Dataset, str]:
         file_path = self.train_config.file_dataset[file_idx]
llm_trainer/grpo_trainer.py CHANGED

@@ -82,11 +82,11 @@ class GRPOTrainer(Trainer):
 
         return criterion, None
 
-    def _convert_train_args(self) -> Tuple[dict, dict, dict, bool]:
-        parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim = super()._convert_train_args()
+    def _convert_train_args(self) -> Tuple[dict, dict, dict]:
+        parallel_kwargs, data_loader_kwargs, sampler_kwargs = super()._convert_train_args()
         data_loader_kwargs.update({"collate_fn": lambda x: x})
 
-        return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
+        return parallel_kwargs, data_loader_kwargs, sampler_kwargs
 
     def _create_dataset(self, file_idx) -> Tuple[Dataset, str]:
         file_path = self.train_config.file_dataset[file_idx]
llm_trainer/sft_trainer.py CHANGED

@@ -23,12 +23,12 @@ class SFTTrainer(Trainer):
         )
         self.packed_sequences = False
 
-    def _convert_train_args(self) -> Tuple[dict, dict, dict, bool]:
+    def _convert_train_args(self) -> Tuple[dict, dict, dict]:
         sft_collate_fn = get_sft_collate_fn(self.train_config.mask_prompt)
-        parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim = super()._convert_train_args()
+        parallel_kwargs, data_loader_kwargs, sampler_kwargs = super()._convert_train_args()
         data_loader_kwargs.update({"collate_fn": sft_collate_fn})
 
-        return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
+        return parallel_kwargs, data_loader_kwargs, sampler_kwargs
 
     def _create_dataset(self, file_idx) -> Tuple[Dataset, str]:
         file_path = self.train_config.file_dataset[file_idx]
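
All three trainers make the same change: `_convert_train_args` now returns a plain three-tuple and the old `use_ds_optim` flag is gone. A minimal sketch of a custom subclass written against the new signature (`MyTrainer` and `my_collate_fn` are hypothetical, and the `from llm_trainer.trainer import Trainer` path is assumed from the package layout):

    from typing import Tuple
    from llm_trainer.trainer import Trainer  # assumed import path

    def my_collate_fn(batch):
        # hypothetical collate fn; the real trainers use get_dpo_collate_fn / get_sft_collate_fn
        return batch

    class MyTrainer(Trainer):
        def _convert_train_args(self) -> Tuple[dict, dict, dict]:
            # 0.8.x: the base class returns three dicts; the fourth use_ds_optim flag was removed
            parallel_kwargs, data_loader_kwargs, sampler_kwargs = super()._convert_train_args()
            data_loader_kwargs.update({"collate_fn": my_collate_fn})
            return parallel_kwargs, data_loader_kwargs, sampler_kwargs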
llm_trainer/train_configs.py CHANGED

@@ -107,7 +107,8 @@ class DataLoaderConfig:
 
 
 @dataclass(kw_only=True)
-class LrConfig:
+class OptimConfig:
+    optim_type: str = 'adam'  # or 'lion'
     enable_lr_scheduler: bool = False
     initial_lr: float
     weight_decay: float = 0.1
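
The renamed dataclass also gains an `optim_type` switch. A minimal sketch of constructing it under the new names (assuming `OptimConfig` is importable from `llm_trainer.train_configs` and that the scheduler-related fields keep their defaults; the values are illustrative only):

    from llm_trainer.train_configs import OptimConfig  # assumed import path

    # illustrative values; scheduler-related fields are left at their defaults
    optim_config = OptimConfig(
        optim_type='lion',   # 'adam' is the default
        initial_lr=1e-4,
        weight_decay=0.1,
    )
    # TrainConfig now takes optim_config=... where it previously took lr_config=LrConfig(...)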
@@ -195,8 +196,8 @@ class TrainConfig:
            This option has no effect during GRPO training!
        eval_batch_interval (`int`, default is 100):
            Run a model eval every this many batches
-        lr_config (`LrConfig`):
-            LR settings
+        optim_config (`OptimConfig`):
+            Optimizer settings
        data_loader_config: (`DataLoaderConfig`):
            Data loader settings
        kd_config: (`KDConfig`, *Optional*, default is None):
@@ -213,7 +214,7 @@ class TrainConfig:
     image_tags_file_dataset: Optional[FileDataset] = None
 
     loss_config: LossConfig = field(default_factory=LossConfig)
-    lr_config: LrConfig = field(default_factory=LrConfig)
+    optim_config: OptimConfig = field(default_factory=OptimConfig)
 
     ds_config: DsConfig = field(default_factory=DsConfig)
 
llm_trainer/trainer.py CHANGED
@@ -77,19 +77,15 @@ class Trainer:
         if self.eval_image_tags:
             assert len(self.eval_prompts) == len(self.eval_image_tags)
 
-        parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim = self._convert_train_args()
-        self.parallel_kwargs = parallel_kwargs
-        self.data_loader_kwargs: dict[str, Any] = data_loader_kwargs
-        self.sampler_kwargs: dict[str, Any] = sampler_kwargs
-
+        self.parallel_kwargs, self.data_loader_kwargs, self.sampler_kwargs = self._convert_train_args()
         # initialize a GradScaler. If enabled=False scaler is a no-op
         self.scalar = torch.GradScaler(enabled=TrainerTools().use_amp)
 
         # Note: the learning rate must be scaled with the number of GPUs:
         # during training, the loss gradient sets the descent direction and the learning rate sets the step size; with two GPUs the combined step is: average lr * 2
-        initial_lr = train_config.lr_config.initial_lr
+        initial_lr = train_config.optim_config.initial_lr
 
-        self.train_model, self.optimizer = self._init_train_model_and_optim(initial_lr, parallel_kwargs, use_ds_optim)
+        self.train_model, self.optimizer = self._init_train_model_and_optim(initial_lr)
         self.lr_scheduler = self._init_lr_scheduler(initial_lr)
 
         self.criterion, self.kd_loss = self._init_loss()
@@ -127,12 +123,7 @@ class Trainer:
         freeze_llm_model = self.train_config.freeze_llm_model
         return model.parameters() if not freeze_llm_model else filter(lambda p: p.requires_grad, model.parameters())
 
-    def _init_train_model_and_optim(
-        self,
-        initial_lr: float,
-        parallel_kwargs: dict,
-        use_ds_optim: bool
-    ):
+    def _init_train_model_and_optim(self, initial_lr: float):
         model = self._new_model(self.train_config)
 
         if self.train_config.init_state_dict:
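
Subclasses that override this hook need the same signature change: `parallel_kwargs` and the removed `use_ds_optim` flag are no longer passed in; parallel settings are read from `self.parallel_kwargs` instead. A hypothetical override, for illustration only:

    class MyTrainer(Trainer):
        def _init_train_model_and_optim(self, initial_lr: float):
            # 0.8.x signature: only initial_lr; DeepSpeed/parallel kwargs come from self.parallel_kwargs
            model, optim = super()._init_train_model_and_optim(initial_lr)
            return model, optim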
@@ -161,34 +152,58 @@ class Trainer:
         total_size_mb = total_size_bytes / (1024 * 1024)
         log(f"Total size of the model: {total_size_mb:.2f} MB")
 
-        if use_ds_optim:
-            import deepspeed
-            origin_optim = deepspeed.ops.adam.DeepSpeedCPUAdam(
-                self._get_trainable_params(model),
-                lr=initial_lr,
-                weight_decay=self.train_config.lr_config.weight_decay
-            )
-        else:
-            origin_optim = torch.optim.AdamW(
-                self._get_trainable_params(model),
-                lr=initial_lr,
-                weight_decay=self.train_config.lr_config.weight_decay
-            )
         model, optim = TrainerTools().parallel.process(
             model=model,
-            optimizer=origin_optim,
-            kwargs=parallel_kwargs
+            optimizer=self._get_optim(model, initial_lr),
+            kwargs=self.parallel_kwargs
         )
 
         return model, optim
 
+    def _get_optim(self, model, initial_lr):
+        optimizer = None
+
+        if isinstance(TrainerTools().parallel, DsParallel) and self.parallel_kwargs:
+            import deepspeed
+            if ('zero_optimization' in self.parallel_kwargs
+                    and 'offload_optimizer' in self.parallel_kwargs['zero_optimization']
+                    and self.parallel_kwargs['zero_optimization']['offload_optimizer']['device'] == 'cpu'):
+                # offload optimizer to cpu
+                # deepspeed.ops.lion.cpu_lion.DeepSpeedCPULion cannot be used here???
+                # so the lion check is ignored in this branch
+                optimizer = deepspeed.ops.adam.DeepSpeedCPUAdam
+                if self.train_config.optim_config.optim_type == 'lion':
+                    log('When set offload_optimizer, lion optim is unsupported, so set optim to adam!!!!!')
+            else:
+                if self.train_config.optim_config.optim_type == 'lion':
+                    optimizer = deepspeed.ops.lion.FusedLion
+                else:
+                    optimizer = deepspeed.ops.adam.FusedAdam
+
+        if not optimizer:
+            if self.train_config.optim_config.optim_type == 'lion':
+                try:
+                    import lion_pytorch
+                except:
+                    raise Exception('lion is not detected, please use `pip3 install lion_pytorch` to install or set optim_type to adam')
+
+                optimizer = lion_pytorch.Lion
+            else:
+                optimizer = torch.optim.AdamW
+
+        return optimizer(
+            self._get_trainable_params(model),
+            lr=initial_lr,
+            weight_decay=self.train_config.optim_config.weight_decay
+        )
+
     def _init_lr_scheduler(self, initial_lr: float) -> LRScheduler:
-        if self.train_config.lr_config.enable_lr_scheduler:
-            warmup_iters = self.train_config.lr_config.warmup_iters
-            min_lr = self.train_config.lr_config.min_lr
-            max_lr = self.train_config.lr_config.max_lr
-            cosine_annealing_period = self.train_config.lr_config.cosine_annealing_period
-            cosine_annealing_period_mul = self.train_config.lr_config.cosine_annealing_period_mul
+        if self.train_config.optim_config.enable_lr_scheduler:
+            warmup_iters = self.train_config.optim_config.warmup_iters
+            min_lr = self.train_config.optim_config.min_lr
+            max_lr = self.train_config.optim_config.max_lr
+            cosine_annealing_period = self.train_config.optim_config.cosine_annealing_period
+            cosine_annealing_period_mul = self.train_config.optim_config.cosine_annealing_period_mul
 
         return WarmupCosineAnnealingLRScheduler(
             optimizer=self.optimizer,
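
Outside DeepSpeed, the new `_get_optim` reduces to picking a constructor that shares the `(params, lr=..., weight_decay=...)` signature. A small standalone sketch of that fallback path (the `build_optimizer` helper and the toy model are illustrative, not part of the package):

    import torch
    import torch.nn as nn

    def build_optimizer(params, optim_type: str, lr: float, weight_decay: float):
        # mirrors the non-DeepSpeed branch: lion_pytorch.Lion when requested, torch.optim.AdamW otherwise
        if optim_type == 'lion':
            try:
                from lion_pytorch import Lion
            except ImportError:
                raise RuntimeError("install lion_pytorch or set optim_type to 'adam'")
            return Lion(params, lr=lr, weight_decay=weight_decay)
        return torch.optim.AdamW(params, lr=lr, weight_decay=weight_decay)

    model = nn.Linear(8, 8)
    optimizer = build_optimizer(model.parameters(), 'adam', lr=1e-4, weight_decay=0.1)

Under ZeRO with the optimizer offloaded to CPU, the method instead falls back to DeepSpeedCPUAdam even when 'lion' is configured, as the comment and log message above note.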
@@ -220,9 +235,8 @@ class Trainer:
 
         return criterion, kd_loss
 
-    def _convert_train_args(self) -> Tuple[dict, dict, dict, bool]:
+    def _convert_train_args(self) -> Tuple[dict, dict, dict]:
         parallel_kwargs: Optional[Dict[str, Any]] = None
-        use_ds_optim: bool = False
         if isinstance(TrainerTools().parallel, DsParallel) and self.train_config.ds_config:
             parallel_kwargs = {
                 'gradient_accumulation_steps': 1,
@@ -253,7 +267,6 @@ class Trainer:
                     "device": zero_config.offload_optimizer.device,
                     "pin_memory": zero_config.offload_optimizer.pin_memory
                 }
-                use_ds_optim = True
             if zero_config.offload_param is not None:
                 zero_optimization['offload_param'] = {
                     "device": zero_config.offload_param.device,
@@ -328,10 +341,10 @@ class Trainer:
             "drop_last": dataloader_args.data_loader_drop_last,
         }
 
-        return parallel_kwargs, data_loader_kwargs, sampler_kwargs, use_ds_optim
+        return parallel_kwargs, data_loader_kwargs, sampler_kwargs
 
     def _init_ref_model_args(self) -> dict:
-        parallel_kwargs = copy.deepcopy(self.parallel_kwargs)
+        parallel_kwargs = copy.deepcopy(self.parallel_kwargs) if self.parallel_kwargs else None
 
         if parallel_kwargs and isinstance(TrainerTools().parallel, DsParallel):
             # reference to https://github.com/huggingface/trl/blob/main/trl/models/utils.py:prepare_deepspeed
@@ -435,7 +448,7 @@ class Trainer:
             exception_file = e.__traceback__.tb_frame.f_globals["__file__"]
             exception_line = e.__traceback__.tb_lineno
             log_msg = f"epoch: {epoch}, batch: {batch}, {e} at {exception_file} line {exception_line}\n"
-            log(log_msg, f'{log_dir}log.txt')
+            log(log_msg, f'{log_dir}exception.txt')
 
             raise e
 
@@ -0,0 +1,21 @@
+#!python
+
+if __name__ == '__main__':
+    import os, sys
+    arguments = sys.argv[1:]
+    # file_name
+    run_file_name = arguments[0]
+
+    extra_args = ''
+    if len(arguments) > 1:
+        extra_args = f"{' '.join(arguments[1:])} "
+
+    os.environ['PARALLEL_TYPE'] = 'ddp'
+
+    if len(extra_args) == 0:
+        extra_args = '--standalone --nproc_per_node=gpu '
+
+    command = f'torchrun {extra_args}{run_file_name}'
+
+    print(f'run command {command}')
+    os.system(command)
@@ -0,0 +1,17 @@
+#!python
+
+if __name__ == '__main__':
+    import os, sys
+    arguments = sys.argv[1:]
+    # file_name
+    run_file_name = arguments[0]
+
+    extra_args = ''
+    if len(arguments) > 1:
+        extra_args = f"{' '.join(arguments[1:])} "
+
+    os.environ['PARALLEL_TYPE'] = 'ds'
+    command = f'deepspeed {extra_args}{run_file_name}'
+
+    print(f'run command {command}')
+    os.system(command)
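
The rewritten launch scripts no longer interpret positional CUDA-device arguments; everything after the training file is forwarded verbatim to the underlying launcher. A rough sketch of the resulting command assembly (the file name and flags are example values only):

    # e.g. invoking a launch script as: <script> train.py --include localhost:0,1
    arguments = ['train.py', '--include', 'localhost:0,1']
    run_file_name = arguments[0]

    extra_args = ''
    if len(arguments) > 1:
        extra_args = f"{' '.join(arguments[1:])} "

    # yields: 'deepspeed --include localhost:0,1 train.py'
    command = f'deepspeed {extra_args}{run_file_name}'
    print(command)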
@@ -7,18 +7,9 @@ if __name__ == '__main__':
     # file name
     run_file_name = arguments[0]
 
-    # cuda_visible_devive
+    extra_args = ''
     if len(arguments) > 1:
-        # 0,1,2,3
-        cuda_visible_devive = arguments[1]
-    else:
-        cuda_visible_devive = None
-
-    # cuda location
-    if len(arguments) > 2:
-        cuda_loc = arguments[2]
-    else:
-        cuda_loc = 'localhost'
+        extra_args = f"{' '.join(arguments[1:])} "
 
     try:
         import deepspeed
@@ -33,12 +24,12 @@ if __name__ == '__main__':
     os.environ['PARALLEL_TYPE'] = parallel_type
 
     if parallel_type == 'ds':
-        cuda_ctrl = f' --include {cuda_loc}:{cuda_visible_devive}' if cuda_visible_devive else ''
-        command = f'deepspeed{cuda_ctrl} {run_file_name}'
+        command = f'deepspeed {extra_args}{run_file_name}'
     elif parallel_type == 'ddp':
-        if cuda_visible_devive:
-            os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devive
-        command = f'torchrun --standalone --nproc_per_node=gpu {run_file_name}'
+        if len(extra_args) == 0:
+            extra_args = '--standalone --nproc_per_node=gpu '
+
+        command = f'torchrun {extra_args}{run_file_name}'
     else:
         command = f'python3 {run_file_name}'
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: project_llm_trainer
-Version: 0.7.9
+Version: 0.8.2
 Summary: LLM and VLM trainer
 Author: qibin
 Author-email: qibin0506@gmail.com
@@ -1,11 +1,11 @@
 llm_trainer/__init__.py,sha256=HWgtTEVeQSnZmEyYQm2K6eFEG4X2QAoigMlB5Z2tcXE,260
 llm_trainer/checkpoint.py,sha256=X5ZeUtJlxVz7pnWQLaS-y7UIZOaOAnZTt2L8rSAPzUs,4428
 llm_trainer/dataset.py,sha256=UL3fGeM4XSlyNQRZH-139u3LujqAQx3YyaxNRewk6LE,8935
-llm_trainer/dpo_trainer.py,sha256=Bgds18UWFhzf_UNCFN-iBCdhKf9pcXJBFPEc32oJeXA,13354
+llm_trainer/dpo_trainer.py,sha256=Qi7WKhFO4fdnj9W8BNIF_so6-F8g_YKUoPU9sNjWK_M,13320
 llm_trainer/ds_checkpoint.py,sha256=X2IWgpgi0yOtogph7n6DEwvK_0Ceb7juu1WMutv3HSk,2270
 llm_trainer/eval.py,sha256=ZyUfSo2Q8P-lrCdPEnGkoo5pGubd0AabREK5eMISRII,1109
 llm_trainer/generate_utils.py,sha256=8K3YFbp7IF_lCkmkzjHhqTW26EBFb2AilQmarVcfMvs,15001
-llm_trainer/grpo_trainer.py,sha256=MXnP8Kc9CQJw0CB3uMbHxIYwvpuujai4hgbbpUut_K4,16808
+llm_trainer/grpo_trainer.py,sha256=3CcV-cuyV4ZUTymN9vz3au4uf3gZdyo8SGgSj2NEofs,16774
 llm_trainer/log.py,sha256=XwychwKF6gvFPhthCIZCAEUZ0G3DY3fiQrOHqPWsxz0,463
 llm_trainer/loss.py,sha256=RhTxftLMj1Tqc5pkUvJiZumfbMEPWL8GBGxdTfQggmk,6744
 llm_trainer/parallel.py,sha256=yjStV21DJ26yM8-0O6GTMxdFAcyShY5GsQWSZmbI7HU,4543
@@ -14,20 +14,20 @@ llm_trainer/parallel_ds.py,sha256=oy8RRxHud3rACWubFlJqqd0pjPEQhKeAPGPQUSdJX2c,11
 llm_trainer/parallel_none.py,sha256=TG6Pm829Dg-yQu-97O-EHV3FCARBlNcP47KkGFAs16E,676
 llm_trainer/partition_utils.py,sha256=eEYNhfEIF4hGzZ3OLa6sEBIECz261drptEz_n7fZYtk,8396
 llm_trainer/scheduler.py,sha256=LAI_0VxClsIQkix0bRoduRD4vPfVuIZDhZgTAT_KK8k,4901
-llm_trainer/sft_trainer.py,sha256=LudTRIaqLQYy6ym6jjMX7v9xtFBJelrR3nnPCwb48nM,1821
+llm_trainer/sft_trainer.py,sha256=rSOGZx53jMgOuJdztfxQASYJ62uD0dVaih4IAnSwGBc,1787
 llm_trainer/tokenizer.py,sha256=0-xQCMz1xiPTDAZiYsVsiECSoZ_1eIvW9XsZOoFfakQ,7250
 llm_trainer/tools.py,sha256=5op5qrjjkK-Lr9oes5VxIVnOVYOYGoAdlIJq9mPUf64,2637
-llm_trainer/train_configs.py,sha256=N3ykM1uaLHcSNRC8ErYIxp9VYhSP7voJyAP-2D4ZJe0,7574
-llm_trainer/trainer.py,sha256=jS31zEXIIj9BoPTPlmaGYq61x72HGCjKfS2u3_gOkDk,27924
+llm_trainer/train_configs.py,sha256=pPZkbliRdTnWSv3TUuTM23x9RDdMhGSPrxbNAyzDklY,7636
+llm_trainer/trainer.py,sha256=diP-1suOf2U5dY_R8QH5arAx4MgBrKW-GBQ2_ScGNM8,28799
 llm_trainer/utils.py,sha256=xC5plG-8-_Al5yIF5xIU5lroOcBBk98TEhtUJrazZPE,12305
-project_llm_trainer-0.7.9.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
-project_llm_trainer-0.7.9.data/scripts/ddp_train,sha256=Z-309mM56CN0m3bxoeC5us4LUuwuNnoiOm3-fDdLMjQ,566
-project_llm_trainer-0.7.9.data/scripts/ds_train,sha256=tME0xmMdX1D9XuVo07D9dilW5VIWavBS3UK9DoY67WI,709
-project_llm_trainer-0.7.9.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
-project_llm_trainer-0.7.9.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
-project_llm_trainer-0.7.9.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
-project_llm_trainer-0.7.9.data/scripts/smart_train,sha256=3oLIDuuqb4U4TU1lXy9V8lw_0gIf7i8tGsxlQ_s6bro,1220
-project_llm_trainer-0.7.9.dist-info/METADATA,sha256=mDGLc1BjmIlOPz85JYB5bFnlXJgJ5VaNesW4z0HDZCA,195
-project_llm_trainer-0.7.9.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
-project_llm_trainer-0.7.9.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
-project_llm_trainer-0.7.9.dist-info/RECORD,,
+project_llm_trainer-0.8.2.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
+project_llm_trainer-0.8.2.data/scripts/ddp_train,sha256=eZSud6KYQAoKLsYB5QB-FI2zq5AZm6Apq1azKdupV3o,477
+project_llm_trainer-0.8.2.data/scripts/ds_train,sha256=41q4rOxwbvZDUY0FDdAIpG13PEaUWBpthhvFvww8uOc,388
+project_llm_trainer-0.8.2.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
+project_llm_trainer-0.8.2.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
+project_llm_trainer-0.8.2.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
+project_llm_trainer-0.8.2.data/scripts/smart_train,sha256=N8dp2n7k6bghGczedBVwOdtf1O66oM_cNPh9QmZt0bM,914
+project_llm_trainer-0.8.2.dist-info/METADATA,sha256=XlNe-d24OrjYkzrJMiQCjiZPT70QOFRd4K2XrVDWZiY,195
+project_llm_trainer-0.8.2.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+project_llm_trainer-0.8.2.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
+project_llm_trainer-0.8.2.dist-info/RECORD,,
@@ -1,24 +0,0 @@
-#!python
-
-if __name__ == '__main__':
-    import os, sys
-    arguments = sys.argv[1:]
-    # file_name
-    run_file_name = arguments[0]
-
-    # cuda_visible_devive
-    if len(arguments) > 1:
-        # 0,1,2,3
-        cuda_visible_devive = arguments[1]
-    else:
-        cuda_visible_devive = None
-
-    os.environ['PARALLEL_TYPE'] = 'ddp'
-
-    if cuda_visible_devive:
-        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devive
-
-    command = f'torchrun --standalone --nproc_per_node=gpu {run_file_name}'
-
-    print(f'run command {command}')
-    os.system(command)
@@ -1,30 +0,0 @@
-#!python
-
-if __name__ == '__main__':
-    import os, sys
-    arguments = sys.argv[1:]
-    # file_name
-    run_file_name = arguments[0]
-
-    # cuda_visible_devive
-    if len(arguments) > 1:
-        # 0,1,2,3
-        cuda_visible_devive = arguments[1]
-
-        # cuda location
-        if len(arguments) > 2:
-            cuda_loc = arguments[2]
-        else:
-            cuda_loc = 'localhost'
-    else:
-        cuda_visible_devive = None
-        cuda_loc = None
-
-    os.environ['PARALLEL_TYPE'] = 'ds'
-
-    cuda_ctrl = f' --include {cuda_loc}:{cuda_visible_devive}' if cuda_visible_devive else ''
-
-    command = f'deepspeed{cuda_ctrl} {run_file_name}'
-
-    print(f'run command {command}')
-    os.system(command)