gitarsenal-cli 1.9.72 → 1.9.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/.venv_status.json +1 -1
  2. package/bin/gitarsenal.js +8 -31
  3. package/kill_claude/claude_code_agent.py +58 -37
  4. package/kill_claude/nanoGPT/.gitattributes +3 -0
  5. package/kill_claude/nanoGPT/LICENSE +21 -0
  6. package/kill_claude/nanoGPT/README.md +227 -0
  7. package/kill_claude/nanoGPT/assets/gpt2_124M_loss.png +0 -0
  8. package/kill_claude/nanoGPT/assets/nanogpt.jpg +0 -0
  9. package/kill_claude/nanoGPT/bench.py +117 -0
  10. package/kill_claude/nanoGPT/config/eval_gpt2.py +8 -0
  11. package/kill_claude/nanoGPT/config/eval_gpt2_large.py +8 -0
  12. package/kill_claude/nanoGPT/config/eval_gpt2_medium.py +8 -0
  13. package/kill_claude/nanoGPT/config/eval_gpt2_xl.py +8 -0
  14. package/kill_claude/nanoGPT/config/finetune_shakespeare.py +25 -0
  15. package/kill_claude/nanoGPT/config/train_gpt2.py +25 -0
  16. package/kill_claude/nanoGPT/config/train_shakespeare_char.py +37 -0
  17. package/kill_claude/nanoGPT/configurator.py +47 -0
  18. package/kill_claude/nanoGPT/data/openwebtext/prepare.py +81 -0
  19. package/kill_claude/nanoGPT/data/openwebtext/readme.md +15 -0
  20. package/kill_claude/nanoGPT/data/shakespeare/prepare.py +33 -0
  21. package/kill_claude/nanoGPT/data/shakespeare/readme.md +9 -0
  22. package/kill_claude/nanoGPT/data/shakespeare_char/prepare.py +68 -0
  23. package/kill_claude/nanoGPT/data/shakespeare_char/readme.md +9 -0
  24. package/kill_claude/nanoGPT/model.py +330 -0
  25. package/kill_claude/nanoGPT/sample.py +89 -0
  26. package/kill_claude/nanoGPT/scaling_laws.ipynb +792 -0
  27. package/kill_claude/nanoGPT/train.py +336 -0
  28. package/kill_claude/nanoGPT/transformer_sizing.ipynb +402 -0
  29. package/kill_claude/prompts/claude-code-tool-prompts.md +1 -0
  30. package/kill_claude/tools/__pycache__/bash_tool.cpython-313.pyc +0 -0
  31. package/kill_claude/tools/__pycache__/task_tool.cpython-313.pyc +0 -0
  32. package/kill_claude/tools/bash_tool.py +1 -0
  33. package/lib/sandbox.js +1 -8
  34. package/package.json +1 -1
  35. package/python/debug_modal_minimal.py +212 -0
  36. package/python/test_container.py +108 -17
  37. package/python/test_modalSandboxScript.py +65 -1097
package/kill_claude/nanoGPT/config/eval_gpt2_medium.py
@@ -0,0 +1,8 @@
+ # evaluate the base gpt2
+ # n_layer=24, n_head=16, n_embd=1024
+ # 350M parameters
+ batch_size = 8
+ eval_iters = 500 # use more iterations to get good estimate
+ eval_only = True
+ wandb_log = False
+ init_from = 'gpt2-medium'
package/kill_claude/nanoGPT/config/eval_gpt2_xl.py
@@ -0,0 +1,8 @@
+ # evaluate the base gpt2
+ # n_layer=48, n_head=25, n_embd=1600
+ # 1558M parameters
+ batch_size = 8
+ eval_iters = 500 # use more iterations to get good estimate
+ eval_only = True
+ wandb_log = False
+ init_from = 'gpt2-xl'
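These eval_* configs carry no logic of their own; like every file under config/, they are plain Python that train.py executes through configurator.py (included further down in this diff) to override its defaults. Presumably they are run the same way the configurator docstring shows, e.g. python train.py config/eval_gpt2_xl.py, with eval_only = True making the script evaluate once and exit rather than train.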
package/kill_claude/nanoGPT/config/finetune_shakespeare.py
@@ -0,0 +1,25 @@
+ import time
+
+ out_dir = 'out-shakespeare'
+ eval_interval = 5
+ eval_iters = 40
+ wandb_log = False # feel free to turn on
+ wandb_project = 'shakespeare'
+ wandb_run_name = 'ft-' + str(time.time())
+
+ dataset = 'shakespeare'
+ init_from = 'gpt2-xl' # this is the largest GPT-2 model
+
+ # only save checkpoints if the validation loss improves
+ always_save_checkpoint = False
+
+ # the number of examples per iter:
+ # 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
+ # shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
+ batch_size = 1
+ gradient_accumulation_steps = 32
+ max_iters = 20
+
+ # finetune at constant LR
+ learning_rate = 3e-5
+ decay_lr = False
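A quick check of the arithmetic in the comments above: 1 (batch_size) × 32 (gradient_accumulation_steps) × 1024 (tokens of context) = 32,768 tokens per iteration, and 301,966 / 32,768 ≈ 9.2 iterations per epoch, so the 20 fine-tuning iterations amount to roughly 2.2 passes over the Shakespeare training split.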
package/kill_claude/nanoGPT/config/train_gpt2.py
@@ -0,0 +1,25 @@
+ # config for training GPT-2 (124M) down to a very nice loss of ~2.85 on one node of 8x A100 40GB
+ # launch as follows (e.g. in a screen session) and wait ~5 days:
+ # $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py
+
+ wandb_log = True
+ wandb_project = 'owt'
+ wandb_run_name = 'gpt2-124M'
+
+ # these make the total batch size be ~0.5M
+ # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
+ batch_size = 12
+ block_size = 1024
+ gradient_accumulation_steps = 5 * 8
+
+ # this makes the total number of tokens be 300B
+ max_iters = 600000
+ lr_decay_iters = 600000
+
+ # eval stuff
+ eval_interval = 1000
+ eval_iters = 200
+ log_interval = 10
+
+ # weight decay
+ weight_decay = 1e-1
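The "~0.5M" and "300B" comments are consistent with the other settings: 12 × 1024 × (5 × 8) = 491,520 tokens per iteration, and 491,520 × 600,000 iterations ≈ 2.95 × 10^11, i.e. roughly 300B tokens over the full run.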
package/kill_claude/nanoGPT/config/train_shakespeare_char.py
@@ -0,0 +1,37 @@
+ # train a miniature character-level shakespeare model
+ # good for debugging and playing on macbooks and such
+
+ out_dir = 'out-shakespeare-char'
+ eval_interval = 250 # keep frequent because we'll overfit
+ eval_iters = 200
+ log_interval = 10 # don't print too often
+
+ # we expect to overfit on this small dataset, so only save when val improves
+ always_save_checkpoint = False
+
+ wandb_log = False # override via command line if you like
+ wandb_project = 'shakespeare-char'
+ wandb_run_name = 'mini-gpt'
+
+ dataset = 'shakespeare_char'
+ gradient_accumulation_steps = 1
+ batch_size = 64
+ block_size = 256 # context of up to 256 previous characters
+
+ # baby GPT model :)
+ n_layer = 6
+ n_head = 6
+ n_embd = 384
+ dropout = 0.2
+
+ learning_rate = 1e-3 # with baby networks we can afford to go a bit higher
+ max_iters = 5000
+ lr_decay_iters = 5000 # make equal to max_iters usually
+ min_lr = 1e-4 # learning_rate / 10 usually
+ beta2 = 0.99 # make a bit bigger because the number of tokens per iter is small
+
+ warmup_iters = 100 # not super necessary potentially
+
+ # on macbook also add
+ # device = 'cpu' # run on cpu only
+ # compile = False # do not torch compile the model
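For scale, this baby model sees 64 × 256 × 1 = 16,384 tokens per iteration; over 5,000 iterations that is about 82M tokens, or roughly 80 passes over the 1,003,854-token character-level training split prepared below.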
package/kill_claude/nanoGPT/configurator.py
@@ -0,0 +1,47 @@
+ """
+ Poor Man's Configurator. Probably a terrible idea. Example usage:
+ $ python train.py config/override_file.py --batch_size=32
+ this will first run config/override_file.py, then override batch_size to 32
+
+ The code in this file will be run as follows from e.g. train.py:
+ >>> exec(open('configurator.py').read())
+
+ So it's not a Python module, it's just shuttling this code away from train.py
+ The code in this script then overrides the globals()
+
+ I know people are not going to love this, I just really dislike configuration
+ complexity and having to prepend config. to every single variable. If someone
+ comes up with a better simple Python solution I am all ears.
+ """
+
+ import sys
+ from ast import literal_eval
+
+ for arg in sys.argv[1:]:
+     if '=' not in arg:
+         # assume it's the name of a config file
+         assert not arg.startswith('--')
+         config_file = arg
+         print(f"Overriding config with {config_file}:")
+         with open(config_file) as f:
+             print(f.read())
+         exec(open(config_file).read())
+     else:
+         # assume it's a --key=value argument
+         assert arg.startswith('--')
+         key, val = arg.split('=')
+         key = key[2:]
+         if key in globals():
+             try:
+                 # attempt to eval it (e.g. if bool, number, etc.)
+                 attempt = literal_eval(val)
+             except (SyntaxError, ValueError):
+                 # if that goes wrong, just use the string
+                 attempt = val
+             # ensure the types match ok
+             assert type(attempt) == type(globals()[key])
+             # cross fingers
+             print(f"Overriding: {key} = {attempt}")
+             globals()[key] = attempt
+         else:
+             raise ValueError(f"Unknown config key: {key}")
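To make the exec() pattern concrete, here is a minimal sketch (not part of this package) of how a training script would consume configurator.py; the defaults and the file name toy_train.py are illustrative, only the exec line mirrors what the docstring above describes.

    # toy_train.py -- hypothetical caller, for illustration only
    batch_size = 12        # defaults live as plain module-level globals
    learning_rate = 6e-4

    # configurator.py runs in this module's global namespace, so config files
    # and --key=value arguments override the globals defined above
    exec(open('configurator.py').read())

    print(batch_size, learning_rate)
    # $ python toy_train.py --batch_size=32   ->  32 0.0006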
package/kill_claude/nanoGPT/data/openwebtext/prepare.py
@@ -0,0 +1,81 @@
+ # saves the openwebtext dataset to a binary file for training. following was helpful:
+ # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
+
+ import os
+ from tqdm import tqdm
+ import numpy as np
+ import tiktoken
+ from datasets import load_dataset # huggingface datasets
+
+ # number of workers in .map() call
+ # a good number to use is roughly the number of cpu cores // 2
+ num_proc = 8
+
+ # number of workers in load_dataset() call
+ # the best number might be different from num_proc above as it also depends on network speed.
+ # it is better than 1 usually though
+ num_proc_load_dataset = num_proc
+
+ enc = tiktoken.get_encoding("gpt2")
+
+ if __name__ == '__main__':
+     # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
+     dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
+
+     # owt by default only contains the 'train' split, so create a test split
+     split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
+     split_dataset['val'] = split_dataset.pop('test') # rename the test split to val
+
+     # this results in:
+     # >>> split_dataset
+     # DatasetDict({
+     #     train: Dataset({
+     #         features: ['text'],
+     #         num_rows: 8009762
+     #     })
+     #     val: Dataset({
+     #         features: ['text'],
+     #         num_rows: 4007
+     #     })
+     # })
+
+     # we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
+     def process(example):
+         ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
+         ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
+         # note: I think eot should be prepended not appended... hmm. it's called "eot" though...
+         out = {'ids': ids, 'len': len(ids)}
+         return out
+
+     # tokenize the dataset
+     tokenized = split_dataset.map(
+         process,
+         remove_columns=['text'],
+         desc="tokenizing the splits",
+         num_proc=num_proc,
+     )
+
+     # concatenate all the ids in each dataset into one large file we can use for training
+     for split, dset in tokenized.items():
+         arr_len = np.sum(dset['len'], dtype=np.uint64)
+         filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
+         dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
+         arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
+         total_batches = 1024
+
+         idx = 0
+         for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
+             # Batch together samples for faster write
+             batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
+             arr_batch = np.concatenate(batch['ids'])
+             # Write into mmap
+             arr[idx : idx + len(arr_batch)] = arr_batch
+             idx += len(arr_batch)
+         arr.flush()
+
+     # train.bin is ~17GB, val.bin ~8.5MB
+     # train has ~9B tokens (9,035,582,198)
+     # val has ~4M tokens (4,434,897)
+
+     # to read the bin files later, e.g. with numpy:
+     # m = np.memmap('train.bin', dtype=np.uint16, mode='r')
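As a sanity check on the output, the .bin files written above can be memmapped back and decoded with the same GPT-2 encoding; a small sketch, not part of the diff, assuming it is run next to the generated val.bin:

    import numpy as np
    import tiktoken

    enc = tiktoken.get_encoding("gpt2")
    m = np.memmap('val.bin', dtype=np.uint16, mode='r')  # uint16 matches the dtype chosen in prepare.py
    print(f"{len(m):,} tokens")
    print(enc.decode(m[:300].tolist()))  # decode the first few hundred tokens back into text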
package/kill_claude/nanoGPT/data/openwebtext/readme.md
@@ -0,0 +1,15 @@
+
+ ## openwebtext dataset
+
+ after running `prepare.py` (preprocess) we get:
+
+ - train.bin is ~17GB, val.bin ~8.5MB
+ - train has ~9B tokens (9,035,582,198)
+ - val has ~4M tokens (4,434,897)
+
+ this came from 8,013,769 documents in total.
+
+ references:
+
+ - OpenAI's WebText dataset is discussed in [GPT-2 paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
+ - [OpenWebText](https://skylion007.github.io/OpenWebTextCorpus/) dataset
package/kill_claude/nanoGPT/data/shakespeare/prepare.py
@@ -0,0 +1,33 @@
+ import os
+ import requests
+ import tiktoken
+ import numpy as np
+
+ # download the tiny shakespeare dataset
+ input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
+ if not os.path.exists(input_file_path):
+     data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+     with open(input_file_path, 'w', encoding='utf-8') as f:
+         f.write(requests.get(data_url).text)
+
+ with open(input_file_path, 'r', encoding='utf-8') as f:
+     data = f.read()
+ n = len(data)
+ train_data = data[:int(n*0.9)]
+ val_data = data[int(n*0.9):]
+
+ # encode with tiktoken gpt2 bpe
+ enc = tiktoken.get_encoding("gpt2")
+ train_ids = enc.encode_ordinary(train_data)
+ val_ids = enc.encode_ordinary(val_data)
+ print(f"train has {len(train_ids):,} tokens")
+ print(f"val has {len(val_ids):,} tokens")
+
+ # export to bin files
+ train_ids = np.array(train_ids, dtype=np.uint16)
+ val_ids = np.array(val_ids, dtype=np.uint16)
+ train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+ val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
+
+ # train.bin has 301,966 tokens
+ # val.bin has 36,059 tokens
package/kill_claude/nanoGPT/data/shakespeare/readme.md
@@ -0,0 +1,9 @@
+
+ # tiny shakespeare
+
+ Tiny shakespeare, of the good old char-rnn fame :)
+
+ After running `prepare.py`:
+
+ - train.bin has 301,966 tokens
+ - val.bin has 36,059 tokens
package/kill_claude/nanoGPT/data/shakespeare_char/prepare.py
@@ -0,0 +1,68 @@
+ """
+ Prepare the Shakespeare dataset for character-level language modeling.
+ So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
+ Will save train.bin, val.bin containing the ids, and meta.pkl containing the
+ encoder and decoder and some other related info.
+ """
+ import os
+ import pickle
+ import requests
+ import numpy as np
+
+ # download the tiny shakespeare dataset
+ input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
+ if not os.path.exists(input_file_path):
+     data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+     with open(input_file_path, 'w') as f:
+         f.write(requests.get(data_url).text)
+
+ with open(input_file_path, 'r') as f:
+     data = f.read()
+ print(f"length of dataset in characters: {len(data):,}")
+
+ # get all the unique characters that occur in this text
+ chars = sorted(list(set(data)))
+ vocab_size = len(chars)
+ print("all the unique characters:", ''.join(chars))
+ print(f"vocab size: {vocab_size:,}")
+
+ # create a mapping from characters to integers
+ stoi = { ch:i for i,ch in enumerate(chars) }
+ itos = { i:ch for i,ch in enumerate(chars) }
+ def encode(s):
+     return [stoi[c] for c in s] # encoder: take a string, output a list of integers
+ def decode(l):
+     return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
+
+ # create the train and test splits
+ n = len(data)
+ train_data = data[:int(n*0.9)]
+ val_data = data[int(n*0.9):]
+
+ # encode both to integers
+ train_ids = encode(train_data)
+ val_ids = encode(val_data)
+ print(f"train has {len(train_ids):,} tokens")
+ print(f"val has {len(val_ids):,} tokens")
+
+ # export to bin files
+ train_ids = np.array(train_ids, dtype=np.uint16)
+ val_ids = np.array(val_ids, dtype=np.uint16)
+ train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+ val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
+
+ # save the meta information as well, to help us encode/decode later
+ meta = {
+     'vocab_size': vocab_size,
+     'itos': itos,
+     'stoi': stoi,
+ }
+ with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
+     pickle.dump(meta, f)
+
+ # length of dataset in characters: 1115394
+ # all the unique characters:
+ # !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
+ # vocab size: 65
+ # train has 1003854 tokens
+ # val has 111540 tokens
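Since this variant uses its own character vocabulary rather than the GPT-2 BPE, reading the bins back goes through meta.pkl; a small illustrative sketch (paths assumed relative to the dataset directory, not part of the package):

    import pickle
    import numpy as np

    with open('meta.pkl', 'rb') as f:
        meta = pickle.load(f)
    itos = meta['itos']  # int -> char mapping saved by prepare.py

    val = np.memmap('val.bin', dtype=np.uint16, mode='r')
    print(''.join(itos[int(i)] for i in val[:200]))  # first 200 characters of the val split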
package/kill_claude/nanoGPT/data/shakespeare_char/readme.md
@@ -0,0 +1,9 @@
+
+ # tiny shakespeare, character-level
+
+ Tiny shakespeare, of the good old char-rnn fame :) Treated at the character level.
+
+ After running `prepare.py`:
+
+ - train.bin has 1,003,854 tokens
+ - val.bin has 111,540 tokens