project-llm-trainer 0.13.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of project-llm-trainer might be problematic; consult the registry's advisory page for this release for details.
- llm_trainer/__init__.py +13 -0
- llm_trainer/base_trainer.py +707 -0
- llm_trainer/checkpoint.py +114 -0
- llm_trainer/dataset.py +335 -0
- llm_trainer/dpo_trainer.py +311 -0
- llm_trainer/ds_checkpoint.py +72 -0
- llm_trainer/eval.py +33 -0
- llm_trainer/generate_utils.py +463 -0
- llm_trainer/grpo_trainer.py +410 -0
- llm_trainer/log.py +65 -0
- llm_trainer/loss.py +266 -0
- llm_trainer/parallel.py +220 -0
- llm_trainer/partition_utils.py +219 -0
- llm_trainer/ppo_trainer.py +686 -0
- llm_trainer/scheduler.py +220 -0
- llm_trainer/sft_trainer.py +97 -0
- llm_trainer/tokenizer.py +162 -0
- llm_trainer/tools.py +116 -0
- llm_trainer/train_configs.py +327 -0
- llm_trainer/trainer.py +34 -0
- llm_trainer/utils.py +630 -0
- project_llm_trainer-0.13.4.data/scripts/calc_intermediate_size +15 -0
- project_llm_trainer-0.13.4.data/scripts/ddp_train +21 -0
- project_llm_trainer-0.13.4.data/scripts/ds_train +17 -0
- project_llm_trainer-0.13.4.data/scripts/py_train +12 -0
- project_llm_trainer-0.13.4.data/scripts/smart_train +37 -0
- project_llm_trainer-0.13.4.data/scripts/vis_log +98 -0
- project_llm_trainer-0.13.4.data/scripts/vis_lr +46 -0
- project_llm_trainer-0.13.4.dist-info/METADATA +9 -0
- project_llm_trainer-0.13.4.dist-info/RECORD +32 -0
- project_llm_trainer-0.13.4.dist-info/WHEEL +5 -0
- project_llm_trainer-0.13.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
if __name__ == '__main__':
|
|
4
|
+
import os, sys, torch
|
|
5
|
+
|
|
6
|
+
arguments = sys.argv[1:]
|
|
7
|
+
# file name
|
|
8
|
+
run_file_name = arguments[0]
|
|
9
|
+
|
|
10
|
+
extra_args = ''
|
|
11
|
+
if len(arguments) > 1:
|
|
12
|
+
extra_args = f"{' '.join(arguments[1:])} "
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
import deepspeed
|
|
16
|
+
parallel_type = 'ds'
|
|
17
|
+
except:
|
|
18
|
+
gpu_count = torch.cuda.device_count()
|
|
19
|
+
if gpu_count <= 1:
|
|
20
|
+
parallel_type = 'none'
|
|
21
|
+
else:
|
|
22
|
+
parallel_type = 'ddp'
|
|
23
|
+
|
|
24
|
+
os.environ['PARALLEL_TYPE'] = parallel_type
|
|
25
|
+
|
|
26
|
+
if parallel_type == 'ds':
|
|
27
|
+
command = f'deepspeed {extra_args}{run_file_name}'
|
|
28
|
+
elif parallel_type == 'ddp':
|
|
29
|
+
if len(extra_args) == 0:
|
|
30
|
+
extra_args = '--standalone --nproc_per_node=gpu '
|
|
31
|
+
|
|
32
|
+
command = f'torchrun {extra_args}{run_file_name}'
|
|
33
|
+
else:
|
|
34
|
+
command = f'python3 {run_file_name}'
|
|
35
|
+
|
|
36
|
+
print(f'run command {command}')
|
|
37
|
+
os.system(command)
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
import os, sys
|
|
5
|
+
import matplotlib.pyplot as plt
|
|
6
|
+
from numpy import ndarray
|
|
7
|
+
from matplotlib.ticker import MaxNLocator
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
if __name__ == '__main__':
|
|
11
|
+
arguments = sys.argv[1:]
|
|
12
|
+
loss_file = arguments[0]
|
|
13
|
+
|
|
14
|
+
if not os.path.exists(loss_file):
|
|
15
|
+
print(f'{loss_file} not found')
|
|
16
|
+
exit(0)
|
|
17
|
+
|
|
18
|
+
data_map = {}
|
|
19
|
+
all_metric_keys = []
|
|
20
|
+
|
|
21
|
+
with open(loss_file, 'r') as f:
|
|
22
|
+
for line in f:
|
|
23
|
+
if '====' in line:
|
|
24
|
+
continue
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
meta_part, values_part = line.split(' -> ')
|
|
28
|
+
|
|
29
|
+
epoch = int(re.search(r'epoch:\s*(\d+)', meta_part).group(1))
|
|
30
|
+
file_str = re.search(r'file:\s*(\d+)', meta_part).group(1)
|
|
31
|
+
file_idx = int(file_str)
|
|
32
|
+
batch_str = re.search(r'batch:\s*(\d+)', meta_part).group(1)
|
|
33
|
+
batch_idx = int(batch_str)
|
|
34
|
+
|
|
35
|
+
sort_key = (epoch, file_idx, batch_idx)
|
|
36
|
+
|
|
37
|
+
current_metrics = {}
|
|
38
|
+
values_kvs = values_part.split(', ')
|
|
39
|
+
for values_kv in values_kvs:
|
|
40
|
+
k, v = values_kv.split(': ')
|
|
41
|
+
val = float(v.strip())
|
|
42
|
+
current_metrics[k] = val
|
|
43
|
+
|
|
44
|
+
if k not in all_metric_keys:
|
|
45
|
+
all_metric_keys.append(k)
|
|
46
|
+
|
|
47
|
+
data_map[sort_key] = current_metrics
|
|
48
|
+
|
|
49
|
+
except Exception as e:
|
|
50
|
+
continue
|
|
51
|
+
|
|
52
|
+
sorted_keys = sorted(data_map.keys())
|
|
53
|
+
results = {k: [] for k in all_metric_keys}
|
|
54
|
+
|
|
55
|
+
for key in sorted_keys:
|
|
56
|
+
metrics = data_map[key]
|
|
57
|
+
for k in all_metric_keys:
|
|
58
|
+
if k in metrics:
|
|
59
|
+
results[k].append(metrics[k])
|
|
60
|
+
|
|
61
|
+
if not results:
|
|
62
|
+
print("No valid data found.")
|
|
63
|
+
exit(0)
|
|
64
|
+
|
|
65
|
+
results_size = len(results.keys())
|
|
66
|
+
if results_size <= 4:
|
|
67
|
+
rows = 1
|
|
68
|
+
cols = results_size
|
|
69
|
+
else:
|
|
70
|
+
rows = math.ceil(results_size / 4)
|
|
71
|
+
cols = 4
|
|
72
|
+
|
|
73
|
+
fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(4 * cols, 4 * rows))
|
|
74
|
+
|
|
75
|
+
if isinstance(axes, ndarray):
|
|
76
|
+
axes = axes.flatten()
|
|
77
|
+
else:
|
|
78
|
+
axes = [axes]
|
|
79
|
+
|
|
80
|
+
for idx, title in enumerate(results.keys()):
|
|
81
|
+
ax = axes[idx]
|
|
82
|
+
y = results[title]
|
|
83
|
+
x = list(range(len(y)))
|
|
84
|
+
|
|
85
|
+
ax.plot(x, y)
|
|
86
|
+
ax.set_title(title)
|
|
87
|
+
|
|
88
|
+
ax.xaxis.set_major_locator(MaxNLocator(nbins=10))
|
|
89
|
+
ax.tick_params(axis='x', rotation=30)
|
|
90
|
+
ax.set_xlabel("Step")
|
|
91
|
+
ax.set_ylabel(title)
|
|
92
|
+
|
|
93
|
+
total_plots = len(results.keys())
|
|
94
|
+
for i in range(total_plots, len(axes)):
|
|
95
|
+
axes[i].set_visible(False)
|
|
96
|
+
|
|
97
|
+
plt.tight_layout()
|
|
98
|
+
plt.show()
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
import os, sys
|
|
4
|
+
import matplotlib.pyplot as plt
|
|
5
|
+
from matplotlib.ticker import MaxNLocator
|
|
6
|
+
|
|
7
|
+
if __name__ == '__main__':
|
|
8
|
+
arguments = sys.argv[1:]
|
|
9
|
+
lr_file = arguments[0]
|
|
10
|
+
|
|
11
|
+
if not os.path.exists(lr_file):
|
|
12
|
+
print(f'{lr_file} not found')
|
|
13
|
+
exit(0)
|
|
14
|
+
|
|
15
|
+
lrs = {}
|
|
16
|
+
# [time] step: {self.cur_steps}, lr: {lr}
|
|
17
|
+
with open(lr_file, 'r') as f:
|
|
18
|
+
for line in f:
|
|
19
|
+
if not line:
|
|
20
|
+
continue
|
|
21
|
+
|
|
22
|
+
data = line.split('step: ')[-1]
|
|
23
|
+
data = data.split(', lr:')
|
|
24
|
+
|
|
25
|
+
step = int(data[0].strip())
|
|
26
|
+
lr = float(data[1].strip())
|
|
27
|
+
|
|
28
|
+
lrs[step] = lr
|
|
29
|
+
|
|
30
|
+
plt.title('lr')
|
|
31
|
+
plt.xlabel("Step")
|
|
32
|
+
plt.ylabel("Learning Rate")
|
|
33
|
+
|
|
34
|
+
y = lrs.values()
|
|
35
|
+
x = list(range(len(y)))
|
|
36
|
+
|
|
37
|
+
ax = plt.gca()
|
|
38
|
+
plt.plot(x, y)
|
|
39
|
+
ax.xaxis.set_major_locator(MaxNLocator(nbins=20))
|
|
40
|
+
|
|
41
|
+
plt.xticks(rotation=30)
|
|
42
|
+
|
|
43
|
+
plt.tight_layout()
|
|
44
|
+
plt.show()
|
|
45
|
+
|
|
46
|
+
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
llm_trainer/__init__.py,sha256=U_rFD6hqNJuNXjcKJ9QnxnAL3SXhyWdGZEcA5GbrU3s,385
|
|
2
|
+
llm_trainer/base_trainer.py,sha256=62zoWzNajK07cnSLuWovxZSlQOikvK5hGa7nW5Yy9BE,29916
|
|
3
|
+
llm_trainer/checkpoint.py,sha256=vjarm-9J-9HAklpQAxbB3Bgph2HI6gxBQvUkB3LywwI,4009
|
|
4
|
+
llm_trainer/dataset.py,sha256=obbJuFmRS3-ntjF3q7acRYkbKYNqLQFMtZij0mCfCjU,10947
|
|
5
|
+
llm_trainer/dpo_trainer.py,sha256=TI8SZxxiqS3BA8IByQl74fjyjCNe-C6OXAqBNbcO5Yw,13192
|
|
6
|
+
llm_trainer/ds_checkpoint.py,sha256=0XZEdBV50obVmAXK1dX_mNuS-yomZW6RTzt1R0TdCyw,2611
|
|
7
|
+
llm_trainer/eval.py,sha256=6qwkRZQXpWJoGm3173Tx39GbgI0gEjA0VNath5J9ekg,1004
|
|
8
|
+
llm_trainer/generate_utils.py,sha256=Yc6xqS0xIaWx4paJMIHDrvQaLCHi5_R91dKvoEtMXgw,16388
|
|
9
|
+
llm_trainer/grpo_trainer.py,sha256=3dSxFSzxzTciGYjUZ_7VN6SdHZx71RIILq0c7Ph6QfU,15962
|
|
10
|
+
llm_trainer/log.py,sha256=BCb8qzs2TGltBFHNuDeEibT6FgBZZTZ-Ijuu1XNOSes,1746
|
|
11
|
+
llm_trainer/loss.py,sha256=AeiUSIkUV6JqyhH3M5CSrXFY9Y_EscG-kE3aOw4bMBE,10140
|
|
12
|
+
llm_trainer/parallel.py,sha256=eWRcqFkOfWM50Chv6gKpifAkaoxF3h8lr3592QXBmx8,6199
|
|
13
|
+
llm_trainer/partition_utils.py,sha256=EMXVGi-AN2piqbOCQei7WmddwQ07jwC5RWClaofIj9Q,8087
|
|
14
|
+
llm_trainer/ppo_trainer.py,sha256=xXgXNVKxTV1jTuz25J1BMfP6r9I0k-hRVGf-b4yJsyw,28946
|
|
15
|
+
llm_trainer/scheduler.py,sha256=cNRPeApnIrSh0fRDo9qKkrkRSYJb7JWKlWOJ30rmzoM,6448
|
|
16
|
+
llm_trainer/sft_trainer.py,sha256=yAHZp8MUlngKgciEUrcVhdEFjjQKRwQ-NqppaBmhc5Y,3687
|
|
17
|
+
llm_trainer/tokenizer.py,sha256=8Mccp4sCaYWiKVD78dEwBMHlA9uS0xf22FOiVxTVtK4,5875
|
|
18
|
+
llm_trainer/tools.py,sha256=QGYOwjabWEMyOe_N9z1yL9WNEjNrEshpZFjnv_QOZH0,3323
|
|
19
|
+
llm_trainer/train_configs.py,sha256=ZL4M5ap3ndaK8hRnBCJ3mjspBYiDyzU8rZxsu2LXJ4E,10519
|
|
20
|
+
llm_trainer/trainer.py,sha256=X0E5-mU5SZRrpevDhhCuUIVMVs0GhVnY7OwAhEgMo9w,1214
|
|
21
|
+
llm_trainer/utils.py,sha256=4SBse7AXn6R7xiRKpRGOF9xrx_ZP9SidgyANkO22CxU,23346
|
|
22
|
+
project_llm_trainer-0.13.4.data/scripts/calc_intermediate_size,sha256=AggpgNHokJiJMbEtVdOnolqr_4bH3i1UYuZNEAzC2Gc,460
|
|
23
|
+
project_llm_trainer-0.13.4.data/scripts/ddp_train,sha256=eZSud6KYQAoKLsYB5QB-FI2zq5AZm6Apq1azKdupV3o,477
|
|
24
|
+
project_llm_trainer-0.13.4.data/scripts/ds_train,sha256=41q4rOxwbvZDUY0FDdAIpG13PEaUWBpthhvFvww8uOc,388
|
|
25
|
+
project_llm_trainer-0.13.4.data/scripts/py_train,sha256=tOp9TquORQeU8XN5H7OVIk5O0Ypwi34p_GENxTwgwdk,265
|
|
26
|
+
project_llm_trainer-0.13.4.data/scripts/smart_train,sha256=N8dp2n7k6bghGczedBVwOdtf1O66oM_cNPh9QmZt0bM,914
|
|
27
|
+
project_llm_trainer-0.13.4.data/scripts/vis_log,sha256=hn3HinTbmOhn9PTby_vodAWmNHDwRA0a9yoU7DHqMjg,2626
|
|
28
|
+
project_llm_trainer-0.13.4.data/scripts/vis_lr,sha256=mgSOckQrRw_42locxk09TTBEeCqSTiu7j1OJ5_vMLDU,923
|
|
29
|
+
project_llm_trainer-0.13.4.dist-info/METADATA,sha256=TaaOytFZKGXMITWTGqPL6Dvm_v_dhLT-ejsMvQ7hsH4,196
|
|
30
|
+
project_llm_trainer-0.13.4.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
|
|
31
|
+
project_llm_trainer-0.13.4.dist-info/top_level.txt,sha256=LtRFg28i0QIG7iBCD2t095oSco99LCtkijibS9cMGik,12
|
|
32
|
+
project_llm_trainer-0.13.4.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
llm_trainer
|