gitarsenal-cli 1.9.72 → 1.9.74
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.venv_status.json +1 -1
- package/bin/gitarsenal.js +8 -31
- package/kill_claude/claude_code_agent.py +58 -37
- package/kill_claude/nanoGPT/.gitattributes +3 -0
- package/kill_claude/nanoGPT/LICENSE +21 -0
- package/kill_claude/nanoGPT/README.md +227 -0
- package/kill_claude/nanoGPT/assets/gpt2_124M_loss.png +0 -0
- package/kill_claude/nanoGPT/assets/nanogpt.jpg +0 -0
- package/kill_claude/nanoGPT/bench.py +117 -0
- package/kill_claude/nanoGPT/config/eval_gpt2.py +8 -0
- package/kill_claude/nanoGPT/config/eval_gpt2_large.py +8 -0
- package/kill_claude/nanoGPT/config/eval_gpt2_medium.py +8 -0
- package/kill_claude/nanoGPT/config/eval_gpt2_xl.py +8 -0
- package/kill_claude/nanoGPT/config/finetune_shakespeare.py +25 -0
- package/kill_claude/nanoGPT/config/train_gpt2.py +25 -0
- package/kill_claude/nanoGPT/config/train_shakespeare_char.py +37 -0
- package/kill_claude/nanoGPT/configurator.py +47 -0
- package/kill_claude/nanoGPT/data/openwebtext/prepare.py +81 -0
- package/kill_claude/nanoGPT/data/openwebtext/readme.md +15 -0
- package/kill_claude/nanoGPT/data/shakespeare/prepare.py +33 -0
- package/kill_claude/nanoGPT/data/shakespeare/readme.md +9 -0
- package/kill_claude/nanoGPT/data/shakespeare_char/prepare.py +68 -0
- package/kill_claude/nanoGPT/data/shakespeare_char/readme.md +9 -0
- package/kill_claude/nanoGPT/model.py +330 -0
- package/kill_claude/nanoGPT/sample.py +89 -0
- package/kill_claude/nanoGPT/scaling_laws.ipynb +792 -0
- package/kill_claude/nanoGPT/train.py +336 -0
- package/kill_claude/nanoGPT/transformer_sizing.ipynb +402 -0
- package/kill_claude/prompts/claude-code-tool-prompts.md +1 -0
- package/kill_claude/tools/__pycache__/bash_tool.cpython-313.pyc +0 -0
- package/kill_claude/tools/__pycache__/task_tool.cpython-313.pyc +0 -0
- package/kill_claude/tools/bash_tool.py +1 -0
- package/lib/sandbox.js +1 -8
- package/package.json +1 -1
- package/python/debug_modal_minimal.py +212 -0
- package/python/test_container.py +108 -17
- package/python/test_modalSandboxScript.py +65 -1097
package/kill_claude/nanoGPT/config/finetune_shakespeare.py
@@ -0,0 +1,25 @@
+import time
+
+out_dir = 'out-shakespeare'
+eval_interval = 5
+eval_iters = 40
+wandb_log = False # feel free to turn on
+wandb_project = 'shakespeare'
+wandb_run_name = 'ft-' + str(time.time())
+
+dataset = 'shakespeare'
+init_from = 'gpt2-xl' # this is the largest GPT-2 model
+
+# only save checkpoints if the validation loss improves
+always_save_checkpoint = False
+
+# the number of examples per iter:
+# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
+# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
+batch_size = 1
+gradient_accumulation_steps = 32
+max_iters = 20
+
+# finetune at constant LR
+learning_rate = 3e-5
+decay_lr = False
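The tokens-per-iteration comment above checks out; a quick sketch (block_size = 1024 is nanoGPT's default, not set in this config):

tokens_per_iter = 1 * 32 * 1024   # batch_size * grad_accum * block_size
print(tokens_per_iter)            # 32768
print(301966 / tokens_per_iter)   # ~9.2 iterations per epoch of Shakespeare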
package/kill_claude/nanoGPT/config/train_gpt2.py
@@ -0,0 +1,25 @@
+# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB
+# launch as the following (e.g. in a screen session) and wait ~5 days:
+# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py
+
+wandb_log = True
+wandb_project = 'owt'
+wandb_run_name = 'gpt2-124M'
+
+# these make the total batch size be ~0.5M
+# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
+batch_size = 12
+block_size = 1024
+gradient_accumulation_steps = 5 * 8
+
+# this makes total number of tokens be 300B
+max_iters = 600000
+lr_decay_iters = 600000
+
+# eval stuff
+eval_interval = 1000
+eval_iters = 200
+log_interval = 10
+
+# weight decay
+weight_decay = 1e-1
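The batch-size comments above can be verified the same way; a quick sketch with values copied from this config:

tokens_per_iter = 12 * 1024 * (5 * 8)   # batch_size * block_size * gradient_accumulation_steps
print(tokens_per_iter)                  # 491520, i.e. ~0.5M tokens per iteration
print(tokens_per_iter * 600000 / 1e9)   # ~294.9, i.e. roughly 300B tokens over max_iters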
package/kill_claude/nanoGPT/config/train_shakespeare_char.py
@@ -0,0 +1,37 @@
+# train a miniature character-level shakespeare model
+# good for debugging and playing on macbooks and such
+
+out_dir = 'out-shakespeare-char'
+eval_interval = 250 # keep frequent because we'll overfit
+eval_iters = 200
+log_interval = 10 # don't print too too often
+
+# we expect to overfit on this small dataset, so only save when val improves
+always_save_checkpoint = False
+
+wandb_log = False # override via command line if you like
+wandb_project = 'shakespeare-char'
+wandb_run_name = 'mini-gpt'
+
+dataset = 'shakespeare_char'
+gradient_accumulation_steps = 1
+batch_size = 64
+block_size = 256 # context of up to 256 previous characters
+
+# baby GPT model :)
+n_layer = 6
+n_head = 6
+n_embd = 384
+dropout = 0.2
+
+learning_rate = 1e-3 # with baby networks can afford to go a bit higher
+max_iters = 5000
+lr_decay_iters = 5000 # make equal to max_iters usually
+min_lr = 1e-4 # learning_rate / 10 usually
+beta2 = 0.99 # make a bit bigger because number of tokens per iter is small
+
+warmup_iters = 100 # not super necessary potentially
+
+# on macbook also add
+# device = 'cpu' # run on cpu only
+# compile = False # do not torch compile the model
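Per the two commented-out overrides above, a CPU-only run of this config would be launched with command-line overrides (handled by configurator.py, shown next):

$ python train.py config/train_shakespeare_char.py --device=cpu --compile=False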
package/kill_claude/nanoGPT/configurator.py
@@ -0,0 +1,47 @@
+"""
+Poor Man's Configurator. Probably a terrible idea. Example usage:
+$ python train.py config/override_file.py --batch_size=32
+this will first run config/override_file.py, then override batch_size to 32
+
+The code in this file will be run as follows from e.g. train.py:
+>>> exec(open('configurator.py').read())
+
+So it's not a Python module, it's just shuttling this code away from train.py
+The code in this script then overrides the globals()
+
+I know people are not going to love this, I just really dislike configuration
+complexity and having to prepend config. to every single variable. If someone
+comes up with a better simple Python solution I am all ears.
+"""
+
+import sys
+from ast import literal_eval
+
+for arg in sys.argv[1:]:
+    if '=' not in arg:
+        # assume it's the name of a config file
+        assert not arg.startswith('--')
+        config_file = arg
+        print(f"Overriding config with {config_file}:")
+        with open(config_file) as f:
+            print(f.read())
+        exec(open(config_file).read())
+    else:
+        # assume it's a --key=value argument
+        assert arg.startswith('--')
+        key, val = arg.split('=')
+        key = key[2:]
+        if key in globals():
+            try:
+                # attempt to eval it (e.g. if bool, number, etc.)
+                attempt = literal_eval(val)
+            except (SyntaxError, ValueError):
+                # if that goes wrong, just use the string
+                attempt = val
+            # ensure the types match ok
+            assert type(attempt) == type(globals()[key])
+            # cross fingers
+            print(f"Overriding: {key} = {attempt}")
+            globals()[key] = attempt
+        else:
+            raise ValueError(f"Unknown config key: {key}")
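To make the exec() flow concrete, here is a minimal sketch of how a training script consumes this file; the defaults here are hypothetical:

# hypothetical defaults defined at the top of a training script
batch_size = 12
learning_rate = 6e-4

# pull in any config file and --key=value overrides from the command line
exec(open('configurator.py').read())

# running `python train.py --batch_size=32` now prints
# "Overriding: batch_size = 32" and leaves batch_size == 32 here
print(batch_size, learning_rate)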
package/kill_claude/nanoGPT/data/openwebtext/prepare.py
@@ -0,0 +1,81 @@
+# saves the openwebtext dataset to a binary file for training. following was helpful:
+# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
+
+import os
+from tqdm import tqdm
+import numpy as np
+import tiktoken
+from datasets import load_dataset # huggingface datasets
+
+# number of workers in .map() call
+# good number to use is ~order number of cpu cores // 2
+num_proc = 8
+
+# number of workers in load_dataset() call
+# best number might be different from num_proc above as it also depends on NW speed.
+# it is better than 1 usually though
+num_proc_load_dataset = num_proc
+
+enc = tiktoken.get_encoding("gpt2")
+
+if __name__ == '__main__':
+    # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
+    dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
+
+    # owt by default only contains the 'train' split, so create a test split
+    split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
+    split_dataset['val'] = split_dataset.pop('test') # rename the test split to val
+
+    # this results in:
+    # >>> split_dataset
+    # DatasetDict({
+    #     train: Dataset({
+    #         features: ['text'],
+    #         num_rows: 8009762
+    #     })
+    #     val: Dataset({
+    #         features: ['text'],
+    #         num_rows: 4007
+    #     })
+    # })
+
+    # we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
+    def process(example):
+        ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
+        ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
+        # note: I think eot should be prepended not appended... hmm. it's called "eot" though...
+        out = {'ids': ids, 'len': len(ids)}
+        return out
+
+    # tokenize the dataset
+    tokenized = split_dataset.map(
+        process,
+        remove_columns=['text'],
+        desc="tokenizing the splits",
+        num_proc=num_proc,
+    )
+
+    # concatenate all the ids in each dataset into one large file we can use for training
+    for split, dset in tokenized.items():
+        arr_len = np.sum(dset['len'], dtype=np.uint64)
+        filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
+        dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
+        arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
+        total_batches = 1024
+
+        idx = 0
+        for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
+            # Batch together samples for faster write
+            batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
+            arr_batch = np.concatenate(batch['ids'])
+            # Write into mmap
+            arr[idx : idx + len(arr_batch)] = arr_batch
+            idx += len(arr_batch)
+        arr.flush()
+
+    # train.bin is ~17GB, val.bin ~8.5MB
+    # train has ~9B tokens (9,035,582,198)
+    # val has ~4M tokens (4,434,897)
+
+    # to read the bin files later, e.g. with numpy:
+    # m = np.memmap('train.bin', dtype=np.uint16, mode='r')
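The closing comment shows how to open the bin files; a slightly fuller sketch of sampling an (x, y) language-modeling batch back out of train.bin, along the lines of what train.py does (batch/block sizes here are illustrative):

import numpy as np
import torch

data = np.memmap('train.bin', dtype=np.uint16, mode='r')
block_size, batch_size = 1024, 12
ix = torch.randint(len(data) - block_size, (batch_size,))  # random window starts
x = torch.stack([torch.from_numpy(data[i:i+block_size].astype(np.int64)) for i in ix])
y = torch.stack([torch.from_numpy(data[i+1:i+1+block_size].astype(np.int64)) for i in ix])  # targets: inputs shifted by one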
package/kill_claude/nanoGPT/data/openwebtext/readme.md
@@ -0,0 +1,15 @@
+
+## openwebtext dataset
+
+after running `prepare.py` (preprocess) we get:
+
+- train.bin is ~17GB, val.bin ~8.5MB
+- train has ~9B tokens (9,035,582,198)
+- val has ~4M tokens (4,434,897)
+
+this came from 8,013,769 documents in total.
+
+references:
+
+- OpenAI's WebText dataset is discussed in [GPT-2 paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
+- [OpenWebText](https://skylion007.github.io/OpenWebTextCorpus/) dataset
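The token counts listed above can be reproduced from the written files; a one-line check per split (np.memmap infers the 1-D shape from the file size):

import numpy as np
print(len(np.memmap('train.bin', dtype=np.uint16, mode='r')))  # 9,035,582,198
print(len(np.memmap('val.bin', dtype=np.uint16, mode='r')))    # 4,434,897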
package/kill_claude/nanoGPT/data/shakespeare/prepare.py
@@ -0,0 +1,33 @@
+import os
+import requests
+import tiktoken
+import numpy as np
+
+# download the tiny shakespeare dataset
+input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
+if not os.path.exists(input_file_path):
+    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+    with open(input_file_path, 'w', encoding='utf-8') as f:
+        f.write(requests.get(data_url).text)
+
+with open(input_file_path, 'r', encoding='utf-8') as f:
+    data = f.read()
+n = len(data)
+train_data = data[:int(n*0.9)]
+val_data = data[int(n*0.9):]
+
+# encode with tiktoken gpt2 bpe
+enc = tiktoken.get_encoding("gpt2")
+train_ids = enc.encode_ordinary(train_data)
+val_ids = enc.encode_ordinary(val_data)
+print(f"train has {len(train_ids):,} tokens")
+print(f"val has {len(val_ids):,} tokens")
+
+# export to bin files
+train_ids = np.array(train_ids, dtype=np.uint16)
+val_ids = np.array(val_ids, dtype=np.uint16)
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
+
+# train.bin has 301,966 tokens
+# val.bin has 36,059 tokens
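Since the ids are GPT-2 BPE tokens, they round-trip back to text; a quick sketch decoding the start of val.bin:

import numpy as np
import tiktoken

enc = tiktoken.get_encoding("gpt2")
val_ids = np.memmap('val.bin', dtype=np.uint16, mode='r')
print(enc.decode(val_ids[:32].tolist()))  # opening of the val split as plain text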
package/kill_claude/nanoGPT/data/shakespeare_char/prepare.py
@@ -0,0 +1,68 @@
+"""
+Prepare the Shakespeare dataset for character-level language modeling.
+So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
+Will save train.bin, val.bin containing the ids, and meta.pkl containing the
+encoder and decoder and some other related info.
+"""
+import os
+import pickle
+import requests
+import numpy as np
+
+# download the tiny shakespeare dataset
+input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
+if not os.path.exists(input_file_path):
+    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+    with open(input_file_path, 'w') as f:
+        f.write(requests.get(data_url).text)
+
+with open(input_file_path, 'r') as f:
+    data = f.read()
+print(f"length of dataset in characters: {len(data):,}")
+
+# get all the unique characters that occur in this text
+chars = sorted(list(set(data)))
+vocab_size = len(chars)
+print("all the unique characters:", ''.join(chars))
+print(f"vocab size: {vocab_size:,}")
+
+# create a mapping from characters to integers
+stoi = { ch:i for i,ch in enumerate(chars) }
+itos = { i:ch for i,ch in enumerate(chars) }
+def encode(s):
+    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
+def decode(l):
+    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
+
+# create the train and test splits
+n = len(data)
+train_data = data[:int(n*0.9)]
+val_data = data[int(n*0.9):]
+
+# encode both to integers
+train_ids = encode(train_data)
+val_ids = encode(val_data)
+print(f"train has {len(train_ids):,} tokens")
+print(f"val has {len(val_ids):,} tokens")
+
+# export to bin files
+train_ids = np.array(train_ids, dtype=np.uint16)
+val_ids = np.array(val_ids, dtype=np.uint16)
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
+
+# save the meta information as well, to help us encode/decode later
+meta = {
+    'vocab_size': vocab_size,
+    'itos': itos,
+    'stoi': stoi,
+}
+with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
+    pickle.dump(meta, f)
+
+# length of dataset in characters: 1115394
+# all the unique characters:
+#  !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
+# vocab size: 65
+# train has 1003854 tokens
+# val has 111540 tokens
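meta.pkl is what lets later scripts (e.g. sample.py) map ids back to characters; a minimal sketch of reloading it:

import pickle
import numpy as np

with open('meta.pkl', 'rb') as f:
    meta = pickle.load(f)
itos = meta['itos']  # int -> char mapping saved above

val_ids = np.memmap('val.bin', dtype=np.uint16, mode='r')
print(''.join(itos[int(i)] for i in val_ids[:100]))  # first 100 characters of the val split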