project-llm-trainer 0.4__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of project-llm-trainer might be problematic; review the version diff below for details.

llm_trainer/checkpoint.py CHANGED
@@ -121,8 +121,12 @@ def load_checkpoint_for_eval(
121
121
 
122
122
  def copy_model_params(
123
123
  _from: nn.Module,
124
- _to: nn.Module
124
+ _to: Optional[nn.Module]
125
125
  ):
126
+ """
127
+ 必须在所有rank上调用,非rank0, _to可以设置为None
128
+ """
129
+
126
130
  if isinstance(TrainerTools().parallel, DsParallel):
127
131
  from .ds_checkpoint import get_ds_model_params
128
132
  state_dict = get_ds_model_params(_from)
@@ -134,7 +138,7 @@ def copy_model_params(
134
138
  else:
135
139
  state_dict = _from.state_dict()
136
140
 
137
- if state_dict:
141
+ if _to and state_dict:
138
142
  _to.load_state_dict(state_dict)
139
143
 
140
144
 
@@ -67,45 +67,47 @@ def load_ds_checkpoint_for_eval(model: nn.Module):
67
67
  model.load_state_dict(state_dict)
68
68
 
69
69
 
70
+ def _get_ds_full_state_dict_on_rank0(model: DeepSpeedEngine) -> Optional[dict]:
71
+ """
72
+ 可以在任意rank上调用,然后只有rank0有值
73
+ """
74
+
75
+ if model.zero_optimization_stage() != 3:
76
+ if TrainerTools().parallel.is_main_process:
77
+ return {k: v.cpu().clone() for k, v in model.module.state_dict().items()}
78
+ return None
79
+
80
+ # ZeRO-3
81
+ state_dict_on_rank_0 = {}
82
+ for param_name, param in model.module.named_parameters():
83
+ if hasattr(param, 'ds_id'):
84
+ with deepspeed.zero.GatheredParameters(param, modifier_rank=0):
85
+ if TrainerTools().parallel.is_main_process:
86
+ state_dict_on_rank_0[param_name] = param.data.to(torch.float32).cpu().clone()
87
+ else:
88
+ if TrainerTools().parallel.is_main_process:
89
+ state_dict_on_rank_0[param_name] = param.data.to(torch.float32).cpu().clone()
90
+
91
+ return state_dict_on_rank_0 if TrainerTools().parallel.is_main_process else None
92
+
93
+
70
94
  def get_ds_model_params(model: nn.Module):
71
95
  """
72
96
  从一个正在运行的 DeepSpeedEngine 中高效地提取完整的 FP32 state_dict,
73
97
  兼容 ZeRO Stages 0, 1, 2, 3。
74
- 这个版本包含了对 ZeRO-3 中非分片参数的正确处理。
98
+ 包含了对 ZeRO-3 中分片参数的正确处理。
75
99
  """
76
100
 
77
101
  assert isinstance(model, DeepSpeedEngine)
78
- zero_stage = model.zero_optimization_stage()
79
- state_dict = None
80
-
81
- if TrainerTools().parallel.is_main_process:
82
- if zero_stage == 3:
83
- # ZeRO-3: Rank 0 聚合参数来构建完整的 state_dict
84
- state_dict = {}
85
- for param in model.module.parameters():
86
- # 关键检查:判断参数是否被 ZeRO-3 分片管理
87
- if hasattr(param, 'ds_id'):
88
- # 这是被分片的参数,使用 GatheredParameters 聚合
89
- with deepspeed.zero.GatheredParameters(param, modifier_rank=0):
90
- # .clone() 创建一个独立副本, .to('cpu') 移动到CPU, .to(torch.float32) 确保类型
91
- state_dict[param.ds_name] = param.data.to(torch.float32).cpu().clone()
92
- else:
93
- # 这是未被分片的参数 (e.g., tied weights, buffers), 直接从 Rank 0 复制
94
- state_dict[param.ds_name] = param.data.to(torch.float32).cpu().clone()
95
- else: # zero_stage in [0, 1, 2]
96
- # 在这些 stage,rank 0 已经有完整的模型。
97
- # 我们从 model_engine.module 获取原始模型状态。
98
- state_dict = {k: v.cpu().clone() for k, v in model.module.state_dict().items()}
102
+ state_dict = _get_ds_full_state_dict_on_rank0(model)
99
103
 
100
104
  # 现在,只有 rank 0 上的 state_dict 是一个有效的字典,其他 rank 上是 None。
101
105
  # 我们需要将其广播给所有进程。
102
106
  if TrainerTools().parallel.world_size > 1:
103
107
  # 准备一个列表,rank 0 有数据,其他 rank 是占位符
104
108
  object_list = [state_dict] if TrainerTools().parallel.is_main_process else [None]
105
-
106
109
  # 执行广播,这个操作是阻塞的,会同步所有进程
107
110
  dist.broadcast_object_list(object_list, src=0)
108
-
109
111
  # 所有进程从列表中获取广播后的 state_dict 副本
110
112
  state_dict = object_list[0]
111
113
 
llm_trainer/eval.py CHANGED
@@ -1,7 +1,6 @@
1
1
  import torch
2
2
 
3
3
  from .generate_utils import generate
4
- from .checkpoint import copy_model_params
5
4
  from .log import get_log_dir
6
5
  from .tools import TrainerTools
7
6
  from .train_configs import EvalConfig
@@ -37,7 +36,6 @@ def _eval_task(
37
36
 
38
37
 
39
38
  def submit_gen_task(
40
- train_model: torch.nn.Module,
41
39
  eval_model: torch.nn.Module,
42
40
  eval_config: EvalConfig,
43
41
  tag,
@@ -46,13 +44,6 @@ def submit_gen_task(
46
44
  max_position_embeddings,
47
45
  tokens_per_image
48
46
  ):
49
- try:
50
- copy_model_params(_from=train_model, _to=eval_model)
51
- except Exception as e:
52
- if isinstance(e, KeyboardInterrupt):
53
- raise e
54
- return
55
-
56
47
  eval_model.to(TrainerTools().parallel.device)
57
48
  _eval_task(
58
49
  eval_model=eval_model,
@@ -53,6 +53,18 @@ def load_fsdp_checkpoint(
53
53
  optimizer.load_state_dict(state_dict['optim_state_dict'])
54
54
 
55
55
 
56
+ def _get_fsdp_full_state_dict_on_rank0(model: nn.Module) -> Optional[dict]:
57
+ """
58
+ 可以在任意rank上调用,然后只有rank0有值
59
+ """
60
+
61
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
62
+ with FSDP.summon_full_params(model, writeback=False, offload_to_cpu=True):
63
+ if TrainerTools().parallel.is_main_process:
64
+ return {k: v.clone() for k, v in model.state_dict().items()}
65
+
66
+ return None
67
+
56
68
 
57
69
  def get_fsdp_model_params(model: nn.Module):
58
70
  """
@@ -60,31 +72,15 @@ def get_fsdp_model_params(model: nn.Module):
60
72
  这个函数会聚合所有分片的参数,并确保所有 rank 都收到一个完整的副本。
61
73
  """
62
74
 
63
- # FSDP 要求在所有 rank 上都调用 summon_full_params,即使我们只在 rank 0 上操作。
64
- # writeback=False: 我们只读取参数,不写回,可以节省开销。
65
- # offload_to_cpu=True: 直接将聚合后的参数卸载到 CPU,避免在 GPU 上产生大的峰值内存,
66
- # 并为我们省去了 .cpu() 的步骤。这是一个非常有用的优化。
67
- # rank0_only=False: 为了让 offload_to_cpu 在所有 rank 上都生效,这里通常设为 False。
68
- # 我们稍后通过 get_rank() 来确保只有 rank 0 实际构建字典。
69
- with FSDP.summon_full_params(model, writeback=False, offload_to_cpu=True):
70
-
71
- state_dict = None
72
- if TrainerTools().parallel.is_main_process:
73
- # 在这个 with 块内部, model.state_dict() 会返回一个在 CPU 上的、完整的状态字典。
74
- # 因为我们设置了 offload_to_cpu=True。
75
- # 我们使用 .clone() 来确保我们得到的是一个独立的副本,
76
- # 尽管 offload_to_cpu 已经帮我们处理了大部分情况。
77
- state_dict = {k: v.clone() for k, v in model.state_dict().items()}
75
+ state_dict = _get_fsdp_full_state_dict_on_rank0(model)
78
76
 
79
77
  # 现在,只有 rank 0 上的 state_dict 是一个有效的字典,其他 rank 上是 None。
80
78
  # 我们需要将其广播给所有进程。
81
79
  if TrainerTools().parallel.world_size > 1:
82
80
  # 准备一个列表,rank 0 有数据,其他 rank 是占位符
83
81
  object_list = [state_dict] if TrainerTools().parallel.is_main_process else [None]
84
-
85
82
  # 执行广播,这个操作是阻塞的,会同步所有进程
86
83
  dist.broadcast_object_list(object_list, src=0)
87
-
88
84
  # 所有进程从列表中获取广播后的 state_dict 副本
89
85
  state_dict = object_list[0]
90
86
 
@@ -1,6 +1,7 @@
1
1
  from typing import Union, Optional, List
2
2
  from contextlib import nullcontext
3
3
  import torch
4
+ import torch.distributed as dist
4
5
  from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
5
6
  from llm_model import VlmModel, KVCache
6
7
  from .tools import TrainerTools
@@ -14,6 +14,7 @@ from .dataset import GRPORolloutDataset
14
14
  from .loss import GRPOLoss
15
15
  from .tools import TrainerTools
16
16
  from .generate_utils import batch_generate
17
+ from .log import log
17
18
 
18
19
  from .checkpoint import (
19
20
  save_checkpoint,
@@ -46,12 +47,9 @@ class GRPOTrainer(Trainer):
46
47
 
47
48
  def _init_reference_model(self):
48
49
  reference_model = self._new_model(self.train_config)
49
-
50
- device = 'cpu' # TrainerTools().parallel.device
51
- reference_model.to(device)
52
- # load_checkpoint_for_eval(model=reference_model, device=device)
53
-
50
+ reference_model.to('cpu')
54
51
  reference_model.eval()
52
+
55
53
  for param in reference_model.parameters():
56
54
  param.requires_grad = False
57
55
 
@@ -59,17 +57,6 @@ class GRPOTrainer(Trainer):
59
57
 
60
58
  def _init_generate_model(self):
61
59
  return copy.deepcopy(self.reference_model)
62
- # generate_model = self._new_model(self.train_config)
63
- #
64
- # device = 'cpu' #TrainerTools().parallel.device
65
- # generate_model.to(device)
66
- # # load_checkpoint_for_eval(model=generate_model, device=device)
67
- #
68
- # generate_model.eval()
69
- # for param in generate_model.parameters():
70
- # param.requires_grad = False
71
- #
72
- # return generate_model
73
60
 
74
61
  def _init_loss(self):
75
62
  criterion = GRPOLoss(
@@ -194,7 +181,6 @@ class GRPOTrainer(Trainer):
194
181
 
195
182
  # [batch*group_size, max_prompt_len+max_gen_len]
196
183
  outputs: torch.Tensor = batch_generate(
197
- # model=self.train_model,
198
184
  model=self.generate_model,
199
185
  tokens=prompt_ids,
200
186
  pad_token_id=pad_token_id,
@@ -325,10 +311,14 @@ class GRPOTrainer(Trainer):
325
311
  self.generate_model.to(device)
326
312
  self.reference_model.to(device)
327
313
 
328
- # 保存了train_model checkpoint后,这里保证生成模型使用的参数是最新
329
- copy_model_params(_from=self.train_model, _to=self.generate_model)
314
+ if TrainerTools().parallel.is_main_process:
315
+ log(f'start generate for batch {batch}/{batch_count_per_file}')
316
+
330
317
  # 生成数据
331
- rollout_data = self._generate_rollout_data(batch_data)
318
+ with torch.no_grad():
319
+ # 保存了train_model checkpoint后,这里保证生成模型使用的参数是最新
320
+ copy_model_params(_from=self.train_model, _to=self.generate_model)
321
+ rollout_data = self._generate_rollout_data(batch_data)
332
322
 
333
323
  # 卸载到cpu上,等待下次使用时再to gpu
334
324
  self.generate_model.to('cpu')
@@ -337,6 +327,9 @@ class GRPOTrainer(Trainer):
337
327
  # end generate
338
328
 
339
329
  try:
330
+ if TrainerTools().parallel.is_main_process:
331
+ log(f'start train for batch {batch}/{batch_count_per_file}')
332
+
340
333
  for grpo_step in range(self.train_config.grpo_config.grpo_steps):
341
334
  with self.ctx:
342
335
  loss, aux_loss = self._maximize_grpo_objective(rollout_data)
llm_trainer/trainer.py CHANGED
@@ -31,6 +31,7 @@ from .scheduler import (
31
31
  from .checkpoint import (
32
32
  load_checkpoint,
33
33
  save_checkpoint,
34
+ copy_model_params,
34
35
  load_steps,
35
36
  save_steps,
36
37
  )
@@ -416,6 +417,8 @@ class Trainer:
416
417
  self,
417
418
  tag: str
418
419
  ):
420
+ copy_model_params(_from=self.train_model, _to=self.eval_model)
421
+
419
422
  if TrainerTools().parallel.is_main_process:
420
423
  eval_prompt, eval_image_tag = self._get_eval_data()
421
424
  if isinstance(self.train_model, VlmModel) and self.pixel_values_provider and eval_image_tag:
@@ -424,7 +427,6 @@ class Trainer:
424
427
  eval_pixel_values = None
425
428
 
426
429
  submit_gen_task(
427
- self.train_model,
428
430
  self.eval_model,
429
431
  self.train_config.eval_config,
430
432
  tag=f'sign:batch/{tag}',
@@ -439,6 +441,8 @@ class Trainer:
439
441
  self,
440
442
  tag: str
441
443
  ):
444
+ copy_model_params(_from=self.train_model, _to=self.eval_model)
445
+
442
446
  if TrainerTools().parallel.is_main_process:
443
447
  eval_prompt, eval_image_tag = self._get_eval_data()
444
448
  if isinstance(self.train_model, VlmModel) and self.pixel_values_provider and eval_image_tag:
@@ -447,7 +451,6 @@ class Trainer:
447
451
  eval_pixel_values = None
448
452
 
449
453
  submit_gen_task(
450
- self.train_model,
451
454
  self.eval_model,
452
455
  self.train_config.eval_config,
453
456
  tag=f'sign:epoch/{tag}',
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: project_llm_trainer
3
- Version: 0.4
3
+ Version: 0.4.2
4
4
  Summary: LLM and VLM trainer
5
5
  Author: qibin
6
6
  Author-email: qibin0506@gmail.com
@@ -0,0 +1,35 @@
1
+ llm_trainer/__init__.py,sha256=HWgtTEVeQSnZmEyYQm2K6eFEG4X2QAoigMlB5Z2tcXE,260
2
+ llm_trainer/checkpoint.py,sha256=yZcExxneN2yzvWxRiK-pstMWs35LV7GiOfqcLq-S6vc,5745
3
+ llm_trainer/dataset.py,sha256=4QlOo0SFB5816BUYegQjgobUqTUMQvdmZMM_OEAMSjE,4347
4
+ llm_trainer/dcp.py,sha256=PkD97DyrOtoTKn4FJsfL3VqAy4dxufgjdzJEz8-Cnoc,3635
5
+ llm_trainer/dpo_trainer.py,sha256=rC_I5ipesSlP3gFK_SG2GB8NbgJAMu4K7KLxkAS-aRY,13406
6
+ llm_trainer/ds_checkpoint.py,sha256=nchGocJE2oJnQ_KNN1kw-BkOAEIyTtO8SJt41cuN_xM,4232
7
+ llm_trainer/eval.py,sha256=NDm8PbXLch7xT81xPYPRCNrcrB_Xj5GDJSCxyVwUOp4,1524
8
+ llm_trainer/fsdp_checkpoint.py,sha256=lqZFzHyWyfzuCq_81kQNtJd2qaiMeY1N5BCEMnrJTBw,3192
9
+ llm_trainer/generate_utils.py,sha256=RpAIjN0fvyTkMk9b9x7YE6c5GiiE3x5YGyPaa4R_BjA,15191
10
+ llm_trainer/grpo_trainer.py,sha256=bZPrxhyPQLAnFzWhI7hhA6fpuKVNwj7nOm9k0ku9aK4,15977
11
+ llm_trainer/log.py,sha256=LxqTGRNZUGMTSQCePRpk-rYyxSnSIbT4kOdP8Fbzr0M,462
12
+ llm_trainer/loss.py,sha256=Yv3fsaVuZ5AhnGPJOr5vEMb_tM2urR6mCb4DBbrHHI8,6030
13
+ llm_trainer/parallel.py,sha256=DQu8GqEFxD99HQ6hKuIxxyKi-05dMO33eMhImYlPuOI,4468
14
+ llm_trainer/parallel_ddp.py,sha256=Pob9vUlBZnkL4oP1Re11kFob7nufMSE96pn7m7fuOEM,1345
15
+ llm_trainer/parallel_ds.py,sha256=oy8RRxHud3rACWubFlJqqd0pjPEQhKeAPGPQUSdJX2c,1145
16
+ llm_trainer/parallel_fsdp.py,sha256=cQOdY8ou6m8OsR06PpFVn6GiyZlK9nefkcGyszUOIJk,4055
17
+ llm_trainer/parallel_none.py,sha256=TG6Pm829Dg-yQu-97O-EHV3FCARBlNcP47KkGFAs16E,676
18
+ llm_trainer/scheduler.py,sha256=Xz8HhwoRMjRe41sf_NHhpZfkTlEs0I2MYusvMY6hCVw,3531
19
+ llm_trainer/sft_trainer.py,sha256=gxQA7T1o1QGUsHp2CX1Qb_fO5LppBJuNbc0H4ixCYUA,1783
20
+ llm_trainer/tokenizer.py,sha256=A7TYYUbtPf75kjCvWP7yBui4xZBObMk2aPem62YpwpY,6776
21
+ llm_trainer/tools.py,sha256=O45-20wRmh-nyTfU-U-XtjbKAoe7boEIsUvWT_NaKx4,3041
22
+ llm_trainer/train_configs.py,sha256=arnet3tIzgVnwshod08F1jE7r4I7e-SIgMy55IagPnE,15971
23
+ llm_trainer/trainer.py,sha256=hOn-z8kOd67RTuaaNMmdQjlw7N5LIZRHjSt5frpA1xI,25355
24
+ llm_trainer/utils.py,sha256=-ivhMF0d999va13S1wt2uBvtVw8Nvr3uBzhaUFKL04Q,6826
25
+ project_llm_trainer-0.4.2.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
26
+ project_llm_trainer-0.4.2.data/scripts/ddp_train,sha256=x81AasaN2-9TwARFFF1l7iV1LmfMQ0bLw0i_CGbOwSw,299
27
+ project_llm_trainer-0.4.2.data/scripts/ds_train,sha256=qL3qc3TcedBCw98UZUjW07ONcErRawLE1HymW2AmscA,265
28
+ project_llm_trainer-0.4.2.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
29
+ project_llm_trainer-0.4.2.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
30
+ project_llm_trainer-0.4.2.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
31
+ project_llm_trainer-0.4.2.data/scripts/smart_train,sha256=Pmt4Q0to4Hoz82iB9uFPZuz7uahNUbfE7FR1940EBy8,716
32
+ project_llm_trainer-0.4.2.dist-info/METADATA,sha256=CBDzoyiYlDPzgmffGgQIMy134eDLLOyRwBoLeUzgQ2g,195
33
+ project_llm_trainer-0.4.2.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
34
+ project_llm_trainer-0.4.2.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
35
+ project_llm_trainer-0.4.2.dist-info/RECORD,,
@@ -1,35 +0,0 @@
1
- llm_trainer/__init__.py,sha256=HWgtTEVeQSnZmEyYQm2K6eFEG4X2QAoigMlB5Z2tcXE,260
2
- llm_trainer/checkpoint.py,sha256=GPaSGvnLCGMgsIA_vfjuw34tTQY26EuNwu7c08fhJHQ,5638
3
- llm_trainer/dataset.py,sha256=4QlOo0SFB5816BUYegQjgobUqTUMQvdmZMM_OEAMSjE,4347
4
- llm_trainer/dcp.py,sha256=PkD97DyrOtoTKn4FJsfL3VqAy4dxufgjdzJEz8-Cnoc,3635
5
- llm_trainer/dpo_trainer.py,sha256=rC_I5ipesSlP3gFK_SG2GB8NbgJAMu4K7KLxkAS-aRY,13406
6
- llm_trainer/ds_checkpoint.py,sha256=H0BxYQixOWKRC20t55cFqNDTPzalD3AGTVt-owIB0_4,4488
7
- llm_trainer/eval.py,sha256=CsB3TpSVwhYVS9SP4Kuj_JhFUUvLcZUkvd8hvEIkPDU,1782
8
- llm_trainer/fsdp_checkpoint.py,sha256=xPQnAfXbx1SRKcVDLLgOtVrqjk0CjIRleVY0ZrwOAJU,3876
9
- llm_trainer/generate_utils.py,sha256=4iM0vyc_1C_iTL31GlS9PR4eZtYaELPRZ02KDSPZA9U,15158
10
- llm_trainer/grpo_trainer.py,sha256=fqLT48ORSCece_e8dpyt8J7EarDuTnGoJ_eHk7Oy-1k,16177
11
- llm_trainer/log.py,sha256=LxqTGRNZUGMTSQCePRpk-rYyxSnSIbT4kOdP8Fbzr0M,462
12
- llm_trainer/loss.py,sha256=Yv3fsaVuZ5AhnGPJOr5vEMb_tM2urR6mCb4DBbrHHI8,6030
13
- llm_trainer/parallel.py,sha256=DQu8GqEFxD99HQ6hKuIxxyKi-05dMO33eMhImYlPuOI,4468
14
- llm_trainer/parallel_ddp.py,sha256=Pob9vUlBZnkL4oP1Re11kFob7nufMSE96pn7m7fuOEM,1345
15
- llm_trainer/parallel_ds.py,sha256=oy8RRxHud3rACWubFlJqqd0pjPEQhKeAPGPQUSdJX2c,1145
16
- llm_trainer/parallel_fsdp.py,sha256=cQOdY8ou6m8OsR06PpFVn6GiyZlK9nefkcGyszUOIJk,4055
17
- llm_trainer/parallel_none.py,sha256=TG6Pm829Dg-yQu-97O-EHV3FCARBlNcP47KkGFAs16E,676
18
- llm_trainer/scheduler.py,sha256=Xz8HhwoRMjRe41sf_NHhpZfkTlEs0I2MYusvMY6hCVw,3531
19
- llm_trainer/sft_trainer.py,sha256=gxQA7T1o1QGUsHp2CX1Qb_fO5LppBJuNbc0H4ixCYUA,1783
20
- llm_trainer/tokenizer.py,sha256=A7TYYUbtPf75kjCvWP7yBui4xZBObMk2aPem62YpwpY,6776
21
- llm_trainer/tools.py,sha256=O45-20wRmh-nyTfU-U-XtjbKAoe7boEIsUvWT_NaKx4,3041
22
- llm_trainer/train_configs.py,sha256=arnet3tIzgVnwshod08F1jE7r4I7e-SIgMy55IagPnE,15971
23
- llm_trainer/trainer.py,sha256=DujZR1KOHyP3EHR8uIQPEsnX_5b7YC9Cto_eH7zxWqc,25256
24
- llm_trainer/utils.py,sha256=-ivhMF0d999va13S1wt2uBvtVw8Nvr3uBzhaUFKL04Q,6826
25
- project_llm_trainer-0.4.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
26
- project_llm_trainer-0.4.data/scripts/ddp_train,sha256=x81AasaN2-9TwARFFF1l7iV1LmfMQ0bLw0i_CGbOwSw,299
27
- project_llm_trainer-0.4.data/scripts/ds_train,sha256=qL3qc3TcedBCw98UZUjW07ONcErRawLE1HymW2AmscA,265
28
- project_llm_trainer-0.4.data/scripts/plot_loss,sha256=MzFcdJESlVr1srj4Td6-AxPGUKkfB_QEcJwm0Bd-5fU,910
29
- project_llm_trainer-0.4.data/scripts/plot_lr,sha256=w_7XR_x3KYYyboeOVAeu_I4fveLFI-C0wBmRrNlmWUI,894
30
- project_llm_trainer-0.4.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
31
- project_llm_trainer-0.4.data/scripts/smart_train,sha256=Pmt4Q0to4Hoz82iB9uFPZuz7uahNUbfE7FR1940EBy8,716
32
- project_llm_trainer-0.4.dist-info/METADATA,sha256=-xxg-UyXn5MhW5OdYGFUcL5DtbIkgnQoUZS5b5bcEio,193
33
- project_llm_trainer-0.4.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
34
- project_llm_trainer-0.4.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
35
- project_llm_trainer-0.4.dist-info/RECORD,,