gitarsenal-cli 1.9.73 → 1.9.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. package/.venv_status.json +1 -1
  2. package/kill_claude/claude_code_agent.py +58 -37
  3. package/kill_claude/nanoGPT/.gitattributes +3 -0
  4. package/kill_claude/nanoGPT/LICENSE +21 -0
  5. package/kill_claude/nanoGPT/README.md +227 -0
  6. package/kill_claude/nanoGPT/assets/gpt2_124M_loss.png +0 -0
  7. package/kill_claude/nanoGPT/assets/nanogpt.jpg +0 -0
  8. package/kill_claude/nanoGPT/bench.py +117 -0
  9. package/kill_claude/nanoGPT/config/eval_gpt2.py +8 -0
  10. package/kill_claude/nanoGPT/config/eval_gpt2_large.py +8 -0
  11. package/kill_claude/nanoGPT/config/eval_gpt2_medium.py +8 -0
  12. package/kill_claude/nanoGPT/config/eval_gpt2_xl.py +8 -0
  13. package/kill_claude/nanoGPT/config/finetune_shakespeare.py +25 -0
  14. package/kill_claude/nanoGPT/config/train_gpt2.py +25 -0
  15. package/kill_claude/nanoGPT/config/train_shakespeare_char.py +37 -0
  16. package/kill_claude/nanoGPT/configurator.py +47 -0
  17. package/kill_claude/nanoGPT/data/openwebtext/prepare.py +81 -0
  18. package/kill_claude/nanoGPT/data/openwebtext/readme.md +15 -0
  19. package/kill_claude/nanoGPT/data/shakespeare/prepare.py +33 -0
  20. package/kill_claude/nanoGPT/data/shakespeare/readme.md +9 -0
  21. package/kill_claude/nanoGPT/data/shakespeare_char/prepare.py +68 -0
  22. package/kill_claude/nanoGPT/data/shakespeare_char/readme.md +9 -0
  23. package/kill_claude/nanoGPT/model.py +330 -0
  24. package/kill_claude/nanoGPT/sample.py +89 -0
  25. package/kill_claude/nanoGPT/scaling_laws.ipynb +792 -0
  26. package/kill_claude/nanoGPT/train.py +336 -0
  27. package/kill_claude/nanoGPT/transformer_sizing.ipynb +402 -0
  28. package/kill_claude/tools/__pycache__/bash_tool.cpython-313.pyc +0 -0
  29. package/package.json +1 -1
package/.venv_status.json CHANGED
@@ -1 +1 @@
1
- {"created":"2025-08-15T09:00:00.169Z","packages":["modal","gitingest","requests","anthropic"],"uv_version":"uv 0.8.4 (Homebrew 2025-07-30)"}
1
+ {"created":"2025-08-15T09:19:17.108Z","packages":["modal","gitingest","requests","anthropic"],"uv_version":"uv 0.8.4 (Homebrew 2025-07-30)"}
@@ -372,21 +372,10 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
372
372
 
373
373
  # Execute tool calls
374
374
  if tool_calls:
375
- print(f"🤖 Agent requested {len(tool_calls)} tool call(s)")
376
375
  tool_results = []
377
376
 
378
377
  for i, tool_call in enumerate(tool_calls, 1):
379
- print(f"--- Tool Call {i}/{len(tool_calls)} ---")
380
- result = self.execute_tool_call(tool_call)
381
-
382
- # Display the tool result
383
- if result and result.strip():
384
- print(f"📋 Tool Result:")
385
- print(result)
386
- print() # Empty line for readability
387
- else:
388
- print("📋 Tool Result: (empty or no output)")
389
- print()
378
+ result = self.execute_tool_call(tool_call, i, len(tool_calls))
390
379
 
391
380
  tool_results.append({
392
381
  "tool_use_id": tool_call.id,
@@ -408,12 +397,14 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
408
397
  except Exception as e:
409
398
  return f"Error calling Anthropic API: {str(e)}"
410
399
 
411
- def execute_tool_call(self, tool_call) -> str:
400
+ def execute_tool_call(self, tool_call, call_num: int = 1, total_calls: int = 1) -> str:
412
401
  """
413
402
  Execute a tool call using the loaded tool implementations.
414
403
 
415
404
  Args:
416
405
  tool_call: The tool call object from Claude
406
+ call_num: Current tool call number
407
+ total_calls: Total number of tool calls
417
408
 
418
409
  Returns:
419
410
  Tool execution result as string
@@ -421,27 +412,48 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
421
412
  tool_name = tool_call.name
422
413
  tool_input = tool_call.input
423
414
 
424
- # Print tool usage information
425
- print(f"🛠️ Using tool: {tool_name}")
415
+ # Print clean tool usage information like Claude Code
416
+ if total_calls > 1:
417
+ print(f"🛠️ Using tool: {tool_name}")
418
+ else:
419
+ print(f"🛠️ Using tool: {tool_name}")
420
+
421
+ # Show parameters in a clean format
426
422
  if tool_input:
427
- # Show key parameters (truncate long values)
428
423
  params_display = []
429
424
  for key, value in tool_input.items():
430
- if isinstance(value, str) and len(value) > 50:
431
- params_display.append(f"{key}='{value[:50]}...'")
425
+ if isinstance(value, str) and len(value) > 80:
426
+ params_display.append(f"{key}={value[:80]}...")
427
+ elif isinstance(value, list) and len(value) > 3:
428
+ params_display.append(f"{key}=[{len(value)} items]")
432
429
  else:
433
- params_display.append(f"{key}={repr(value)}")
430
+ params_display.append(f"{key}={value}")
434
431
  print(f" Parameters: {', '.join(params_display)}")
435
- print() # Empty line for readability
436
432
 
437
433
  try:
438
- # Always use built-in implementations
434
+ # Execute the tool
439
435
  result = self._execute_builtin_tool(tool_name, tool_input)
440
- print(f"✅ Tool {tool_name} completed successfully\n")
436
+
437
+ # Print success message
438
+ print(f"\n✅ Tool {tool_name} completed successfully")
439
+
440
+ # Print result in a clean format
441
+ if result and result.strip():
442
+ print(f"\n📋 Tool Result:")
443
+ # Truncate very long results for readability
444
+ if len(result) > 5000:
445
+ print(result[:5000] + f"\n\n[Output truncated - showing first 5000 characters of {len(result)} total]")
446
+ else:
447
+ print(result)
448
+ else:
449
+ print(f"\n📋 Tool Result: (no output)")
450
+
451
+ print() # Empty line for readability
441
452
  return result if result is not None else ""
442
453
 
443
454
  except Exception as e:
444
- print(f"❌ Tool {tool_name} failed: {str(e)}\n")
455
+ print(f"\n❌ Tool {tool_name} failed: {str(e)}")
456
+ print()
445
457
  return f"Error executing {tool_name}: {str(e)}"
446
458
 
447
459
  def _execute_builtin_tool(self, tool_name: str, tool_input: Dict[str, Any]) -> str:
@@ -511,19 +523,30 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
511
523
  # Apply offset and limit
512
524
  start_idx = (offset or 1) - 1
513
525
  end_idx = start_idx + (limit or len(lines))
514
- lines = lines[start_idx:end_idx]
526
+ selected_lines = lines[start_idx:end_idx]
527
+
528
+ # If file is empty, return empty indicator
529
+ if not lines:
530
+ return "<system-reminder>File exists but has empty contents</system-reminder>"
515
531
 
516
532
  # Format with line numbers like Claude Code
517
533
  formatted_lines = []
518
- for i, line in enumerate(lines):
534
+ for i, line in enumerate(selected_lines):
519
535
  line_num = start_idx + i + 1
520
536
  # Remove trailing newline and truncate if too long
521
- clean_line = line.rstrip('\n')
537
+ clean_line = line.rstrip('\n\r')
522
538
  if len(clean_line) > 2000:
523
539
  clean_line = clean_line[:2000] + "..."
524
540
  formatted_lines.append(f"{line_num:>5}→{clean_line}")
525
541
 
526
- return '\n'.join(formatted_lines)
542
+ # Add truncation note if we limited the output
543
+ result = '\n'.join(formatted_lines)
544
+ if limit and len(lines) > end_idx:
545
+ result += f"\n... (showing lines {start_idx + 1}-{end_idx} of {len(lines)} total lines)"
546
+
547
+ return result
548
+ except UnicodeDecodeError:
549
+ return f"Cannot read file (binary or non-UTF-8 encoding): {file_path}"
527
550
  except Exception as e:
528
551
  return f"Error reading file: {str(e)}"
529
552
 
@@ -573,6 +596,10 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
573
596
 
574
597
  output_parts = []
575
598
 
599
+ # If command failed, show exit code first
600
+ if result.returncode != 0:
601
+ output_parts.append(f"Exit code: {result.returncode}")
602
+
576
603
  # Add stdout if present
577
604
  if result.stdout and result.stdout.strip():
578
605
  output_parts.append(result.stdout.strip())
@@ -581,16 +608,9 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
581
608
  if result.stderr and result.stderr.strip():
582
609
  output_parts.append(result.stderr.strip())
583
610
 
584
- # If command failed, include exit code
585
- if result.returncode != 0:
586
- if not output_parts:
587
- output_parts.append(f"Command failed with exit code {result.returncode}")
588
- else:
589
- output_parts.insert(0, f"Exit code: {result.returncode}")
590
-
591
- # If no output at all but success, indicate success
611
+ # If no output at all but success, indicate success like Claude Code
592
612
  if not output_parts and result.returncode == 0:
593
- output_parts.append("<system>Tool ran without output or errors</system>")
613
+ return "<system>Tool ran without output or errors</system>"
594
614
 
595
615
  return "\n".join(output_parts)
596
616
 
@@ -744,7 +764,8 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
744
764
 
745
765
  print("🤖 Claude Code Agent:")
746
766
  response = self.process_query(user_input)
747
- print(response)
767
+ if response.strip():
768
+ print(response)
748
769
 
749
770
  except KeyboardInterrupt:
750
771
  print("\n\n👋 Goodbye!")
@@ -0,0 +1,3 @@
1
+ # Override jupyter in Github language stats for more accurate estimate of repo code languages
2
+ # reference: https://github.com/github/linguist/blob/master/docs/overrides.md#generated-code
3
+ *.ipynb linguist-generated
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Andrej Karpathy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,227 @@
1
+
2
+ # nanoGPT
3
+
4
+ ![nanoGPT](assets/nanogpt.jpg)
5
+
6
+ The simplest, fastest repository for training/finetuning medium-sized GPTs. It is a rewrite of [minGPT](https://github.com/karpathy/minGPT) that prioritizes teeth over education. Still under active development, but currently the file `train.py` reproduces GPT-2 (124M) on OpenWebText, running on a single 8XA100 40GB node in about 4 days of training. The code itself is plain and readable: `train.py` is a ~300-line boilerplate training loop and `model.py` a ~300-line GPT model definition, which can optionally load the GPT-2 weights from OpenAI. That's it.
7
+
8
+ ![repro124m](assets/gpt2_124M_loss.png)
9
+
10
+ Because the code is so simple, it is very easy to hack to your needs, train new models from scratch, or finetune pretrained checkpoints (e.g. biggest one currently available as a starting point would be the GPT-2 1.3B model from OpenAI).
11
+
12
+ ## install
13
+
14
+ ```
15
+ pip install torch numpy transformers datasets tiktoken wandb tqdm
16
+ ```
17
+
18
+ Dependencies:
19
+
20
+ - [pytorch](https://pytorch.org) <3
21
+ - [numpy](https://numpy.org/install/) <3
22
+ - `transformers` for huggingface transformers <3 (to load GPT-2 checkpoints)
23
+ - `datasets` for huggingface datasets <3 (if you want to download + preprocess OpenWebText)
24
+ - `tiktoken` for OpenAI's fast BPE code <3
25
+ - `wandb` for optional logging <3
26
+ - `tqdm` for progress bars <3
27
+
28
+ ## quick start
29
+
30
+ If you are not a deep learning professional and you just want to feel the magic and get your feet wet, the fastest way to get started is to train a character-level GPT on the works of Shakespeare. First, we download it as a single (1MB) file and turn it from raw text into one large stream of integers:
31
+
32
+ ```sh
33
+ python data/shakespeare_char/prepare.py
34
+ ```
35
+
36
+ This creates a `train.bin` and `val.bin` in that data directory. Now it is time to train your GPT. The size of it very much depends on the computational resources of your system:
37
+
38
+ **I have a GPU**. Great, we can quickly train a baby GPT with the settings provided in the [config/train_shakespeare_char.py](config/train_shakespeare_char.py) config file:
39
+
40
+ ```sh
41
+ python train.py config/train_shakespeare_char.py
42
+ ```
43
+
44
+ If you peek inside it, you'll see that we're training a GPT with a context size of up to 256 characters, 384 feature channels, and it is a 6-layer Transformer with 6 heads in each layer. On one A100 GPU this training run takes about 3 minutes and the best validation loss is 1.4697. Based on the configuration, the model checkpoints are being written into the `--out_dir` directory `out-shakespeare-char`. So once the training finishes we can sample from the best model by pointing the sampling script at this directory:
45
+
46
+ ```sh
47
+ python sample.py --out_dir=out-shakespeare-char
48
+ ```
49
+
50
+ This generates a few samples, for example:
51
+
52
+ ```
53
+ ANGELO:
54
+ And cowards it be strawn to my bed,
55
+ And thrust the gates of my threats,
56
+ Because he that ale away, and hang'd
57
+ An one with him.
58
+
59
+ DUKE VINCENTIO:
60
+ I thank your eyes against it.
61
+
62
+ DUKE VINCENTIO:
63
+ Then will answer him to save the malm:
64
+ And what have you tyrannous shall do this?
65
+
66
+ DUKE VINCENTIO:
67
+ If you have done evils of all disposition
68
+ To end his power, the day of thrust for a common men
69
+ That I leave, to fight with over-liking
70
+ Hasting in a roseman.
71
+ ```
72
+
73
+ lol `¯\_(ツ)_/¯`. Not bad for a character-level model after 3 minutes of training on a GPU. Better results are quite likely obtainable by instead finetuning a pretrained GPT-2 model on this dataset (see finetuning section later).
74
+
75
+ **I only have a macbook** (or other cheap computer). No worries, we can still train a GPT but we want to dial things down a notch. I recommend getting the bleeding edge PyTorch nightly ([select it here](https://pytorch.org/get-started/locally/) when installing) as it is currently quite likely to make your code more efficient. But even without it, a simple train run could look as follows:
76
+
77
+ ```sh
78
+ python train.py config/train_shakespeare_char.py --device=cpu --compile=False --eval_iters=20 --log_interval=1 --block_size=64 --batch_size=12 --n_layer=4 --n_head=4 --n_embd=128 --max_iters=2000 --lr_decay_iters=2000 --dropout=0.0
79
+ ```
80
+
81
+ Here, since we are running on CPU instead of GPU we must set both `--device=cpu` and also turn off PyTorch 2.0 compile with `--compile=False`. Then when we evaluate we get a bit more noisy but faster estimate (`--eval_iters=20`, down from 200), our context size is only 64 characters instead of 256, and the batch size only 12 examples per iteration, not 64. We'll also use a much smaller Transformer (4 layers, 4 heads, 128 embedding size), and decrease the number of iterations to 2000 (and correspondingly usually decay the learning rate to around max_iters with `--lr_decay_iters`). Because our network is so small we also ease down on regularization (`--dropout=0.0`). This still runs in about ~3 minutes, but gets us a loss of only 1.88 and therefore also worse samples, but it's still good fun:
82
+
83
+ ```sh
84
+ python sample.py --out_dir=out-shakespeare-char --device=cpu
85
+ ```
86
+ Generates samples like this:
87
+
88
+ ```
89
+ GLEORKEN VINGHARD III:
90
+ Whell's the couse, the came light gacks,
91
+ And the for mought you in Aut fries the not high shee
92
+ bot thou the sought bechive in that to doth groan you,
93
+ No relving thee post mose the wear
94
+ ```
95
+
96
+ Not bad for ~3 minutes on a CPU, for a hint of the right character gestalt. If you're willing to wait longer, feel free to tune the hyperparameters, increase the size of the network, the context length (`--block_size`), the length of training, etc.
97
+
98
+ Finally, on Apple Silicon Macbooks and with a recent PyTorch version make sure to add `--device=mps` (short for "Metal Performance Shaders"); PyTorch then uses the on-chip GPU that can *significantly* accelerate training (2-3X) and allow you to use larger networks. See [Issue 28](https://github.com/karpathy/nanoGPT/issues/28) for more.
99
+
100
+ ## reproducing GPT-2
101
+
102
+ A more serious deep learning professional may be more interested in reproducing GPT-2 results. So here we go - we first tokenize the dataset, in this case the [OpenWebText](https://openwebtext2.readthedocs.io/en/latest/), an open reproduction of OpenAI's (private) WebText:
103
+
104
+ ```sh
105
+ python data/openwebtext/prepare.py
106
+ ```
107
+
108
+ This downloads and tokenizes the [OpenWebText](https://huggingface.co/datasets/openwebtext) dataset. It will create a `train.bin` and `val.bin` which holds the GPT2 BPE token ids in one sequence, stored as raw uint16 bytes. Then we're ready to kick off training. To reproduce GPT-2 (124M) you'll want at least an 8X A100 40GB node and run:
109
+
110
+ ```sh
111
+ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py
112
+ ```
113
+
114
+ This will run for about 4 days using PyTorch Distributed Data Parallel (DDP) and go down to loss of ~2.85. Now, a GPT-2 model just evaluated on OWT gets a val loss of about 3.11, but if you finetune it it will come down to ~2.85 territory (due to an apparent domain gap), making the two models ~match.
115
+
116
+ If you're in a cluster environment and you are blessed with multiple GPU nodes you can make GPU go brrrr e.g. across 2 nodes like:
117
+
118
+ ```sh
119
+ # Run on the first (master) node with example IP 123.456.123.456:
120
+ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
121
+ # Run on the worker node:
122
+ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
123
+ ```
124
+
125
+ It is a good idea to benchmark your interconnect (e.g. iperf3). In particular, if you don't have Infiniband then also prepend `NCCL_IB_DISABLE=1` to the above launches. Your multinode training will work, but most likely _crawl_. By default checkpoints are periodically written to the `--out_dir`. We can sample from the model by simply `python sample.py`.
126
+
127
+ Finally, to train on a single GPU simply run the `python train.py` script. Have a look at all of its args, the script tries to be very readable, hackable and transparent. You'll most likely want to tune a number of those variables depending on your needs.
128
+
129
+ ## baselines
130
+
131
+ OpenAI GPT-2 checkpoints allow us to get some baselines in place for openwebtext. We can get the numbers as follows:
132
+
133
+ ```sh
134
+ $ python train.py config/eval_gpt2.py
135
+ $ python train.py config/eval_gpt2_medium.py
136
+ $ python train.py config/eval_gpt2_large.py
137
+ $ python train.py config/eval_gpt2_xl.py
138
+ ```
139
+
140
+ and observe the following losses on train and val:
141
+
142
+ | model | params | train loss | val loss |
143
+ | ------| ------ | ---------- | -------- |
144
+ | gpt2 | 124M | 3.11 | 3.12 |
145
+ | gpt2-medium | 350M | 2.85 | 2.84 |
146
+ | gpt2-large | 774M | 2.66 | 2.67 |
147
+ | gpt2-xl | 1558M | 2.56 | 2.54 |
148
+
149
+ However, we have to note that GPT-2 was trained on (closed, never released) WebText, while OpenWebText is just a best-effort open reproduction of this dataset. This means there is a dataset domain gap. Indeed, taking the GPT-2 (124M) checkpoint and finetuning on OWT directly for a while reaches loss down to ~2.85. This then becomes the more appropriate baseline w.r.t. reproduction.
150
+
151
+ ## finetuning
152
+
153
+ Finetuning is no different than training, we just make sure to initialize from a pretrained model and train with a smaller learning rate. For an example of how to finetune a GPT on new text go to `data/shakespeare` and run `prepare.py` to download the tiny shakespeare dataset and render it into a `train.bin` and `val.bin`, using the OpenAI BPE tokenizer from GPT-2. Unlike OpenWebText this will run in seconds. Finetuning can take very little time, e.g. on a single GPU just a few minutes. Run an example finetuning like:
154
+
155
+ ```sh
156
+ python train.py config/finetune_shakespeare.py
157
+ ```
158
+
159
+ This will load the config parameter overrides in `config/finetune_shakespeare.py` (I didn't tune them much though). Basically, we initialize from a GPT2 checkpoint with `init_from` and train as normal, except shorter and with a small learning rate. If you're running out of memory try decreasing the model size (they are `{'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}`) or possibly decreasing the `block_size` (context length). The best checkpoint (lowest validation loss) will be in the `out_dir` directory, e.g. in `out-shakespeare` by default, per the config file. You can then run the code in `sample.py --out_dir=out-shakespeare`:
160
+
161
+ ```
162
+ THEODORE:
163
+ Thou shalt sell me to the highest bidder: if I die,
164
+ I sell thee to the first; if I go mad,
165
+ I sell thee to the second; if I
166
+ lie, I sell thee to the third; if I slay,
167
+ I sell thee to the fourth: so buy or sell,
168
+ I tell thee again, thou shalt not sell my
169
+ possession.
170
+
171
+ JULIET:
172
+ And if thou steal, thou shalt not sell thyself.
173
+
174
+ THEODORE:
175
+ I do not steal; I sell the stolen goods.
176
+
177
+ THEODORE:
178
+ Thou know'st not what thou sell'st; thou, a woman,
179
+ Thou art ever a victim, a thing of no worth:
180
+ Thou hast no right, no right, but to be sold.
181
+ ```
182
+
183
+ Whoa there, GPT, entering some dark place over there. I didn't really tune the hyperparameters in the config too much, feel free to try!
184
+
185
+ ## sampling / inference
186
+
187
+ Use the script `sample.py` to sample either from pre-trained GPT-2 models released by OpenAI, or from a model you trained yourself. For example, here is a way to sample from the largest available `gpt2-xl` model:
188
+
189
+ ```sh
190
+ python sample.py \
191
+ --init_from=gpt2-xl \
192
+ --start="What is the answer to life, the universe, and everything?" \
193
+ --num_samples=5 --max_new_tokens=100
194
+ ```
195
+
196
+ If you'd like to sample from a model you trained, use the `--out_dir` to point the code appropriately. You can also prompt the model with some text from a file, e.g. ```python sample.py --start=FILE:prompt.txt```.
197
+
198
+ ## efficiency notes
199
+
200
+ For simple model benchmarking and profiling, `bench.py` might be useful. It's identical to what happens in the meat of the training loop of `train.py`, but omits much of the other complexities.
201
+
202
+ Note that the code by default uses [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/). At the time of writing (Dec 29, 2022) this makes `torch.compile()` available in the nightly release. The improvement from the one line of code is noticeable, e.g. cutting down iteration time from ~250ms / iter to 135ms / iter. Nice work PyTorch team!
203
+
204
+ ## todos
205
+
206
+ - Investigate and add FSDP instead of DDP
207
+ - Eval zero-shot perplexities on standard evals (e.g. LAMBADA? HELM? etc.)
208
+ - Finetune the finetuning script, I think the hyperparams are not great
209
+ - Schedule for linear batch size increase during training
210
+ - Incorporate other embeddings (rotary, alibi)
211
+ - Separate out the optim buffers from model params in checkpoints I think
212
+ - Additional logging around network health (e.g. gradient clip events, magnitudes)
213
+ - Few more investigations around better init etc.
214
+
215
+ ## troubleshooting
216
+
217
+ Note that by default this repo uses PyTorch 2.0 (i.e. `torch.compile`). This is fairly new and experimental, and not yet available on all platforms (e.g. Windows). If you're running into related error messages try to disable this by adding `--compile=False` flag. This will slow down the code but at least it will run.
218
+
219
+ For some context on this repository, GPT, and language modeling it might be helpful to watch my [Zero To Hero series](https://karpathy.ai/zero-to-hero.html). Specifically, the [GPT video](https://www.youtube.com/watch?v=kCc8FmEb1nY) is popular if you have some prior language modeling context.
220
+
221
+ For more questions/discussions feel free to stop by **#nanoGPT** on Discord:
222
+
223
+ [![](https://dcbadge.vercel.app/api/server/3zy8kqD9Cp?compact=true&style=flat)](https://discord.gg/3zy8kqD9Cp)
224
+
225
+ ## acknowledgements
226
+
227
+ All nanoGPT experiments are powered by GPUs on [Lambda labs](https://lambdalabs.com), my favorite Cloud GPU provider. Thank you Lambda labs for sponsoring nanoGPT!
@@ -0,0 +1,117 @@
1
+ """
2
+ A much shorter version of train.py for benchmarking
3
+ """
4
+ import os
5
+ from contextlib import nullcontext
6
+ import numpy as np
7
+ import time
8
+ import torch
9
+ from model import GPTConfig, GPT
10
+
11
+ # -----------------------------------------------------------------------------
12
+ batch_size = 12
13
+ block_size = 1024
14
+ bias = False
15
+ real_data = True
16
+ seed = 1337
17
+ device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
18
+ dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
19
+ compile = True # use PyTorch 2.0 to compile the model to be faster
20
+ profile = False # use pytorch profiler, or just simple benchmarking?
21
+ exec(open('configurator.py').read()) # overrides from command line or config file
22
+ # -----------------------------------------------------------------------------
23
+
24
+ torch.manual_seed(seed)
25
+ torch.cuda.manual_seed(seed)
26
+ torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
27
+ torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
28
+ device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
29
+ ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
30
+ ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
31
+
32
+ # data loading init
33
+ if real_data:
34
+ dataset = 'openwebtext'
35
+ data_dir = os.path.join('data', dataset)
36
+ train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
37
+ def get_batch(split):
38
+ data = train_data # note ignore split in benchmarking script
39
+ ix = torch.randint(len(data) - block_size, (batch_size,))
40
+ x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
41
+ y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
42
+ x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
43
+ return x, y
44
+ else:
45
+ # alternatively, if fixed data is desired to not care about data loading
46
+ x = torch.randint(50304, (batch_size, block_size), device=device)
47
+ y = torch.randint(50304, (batch_size, block_size), device=device)
48
+ get_batch = lambda split: (x, y)
49
+
50
+ # model init
51
+ gptconf = GPTConfig(
52
+ block_size = block_size, # how far back does the model look? i.e. context size
53
+ n_layer = 12, n_head = 12, n_embd = 768, # size of the model
54
+ dropout = 0, # for determinism
55
+ bias = bias,
56
+ )
57
+ model = GPT(gptconf)
58
+ model.to(device)
59
+
60
+ optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)
61
+
62
+ if compile:
63
+ print("Compiling model...")
64
+ model = torch.compile(model) # pytorch 2.0
65
+
66
+ if profile:
67
+ # useful docs on pytorch profiler:
68
+ # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
69
+ # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile
70
+ wait, warmup, active = 5, 5, 5
71
+ num_steps = wait + warmup + active
72
+ with torch.profiler.profile(
73
+ activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
74
+ schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
75
+ on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
76
+ record_shapes=False,
77
+ profile_memory=False,
78
+ with_stack=False, # incurs an additional overhead, disable if not needed
79
+ with_flops=True,
80
+ with_modules=False, # only for torchscript models atm
81
+ ) as prof:
82
+
83
+ X, Y = get_batch('train')
84
+ for k in range(num_steps):
85
+ with ctx:
86
+ logits, loss = model(X, Y)
87
+ X, Y = get_batch('train')
88
+ optimizer.zero_grad(set_to_none=True)
89
+ loss.backward()
90
+ optimizer.step()
91
+ lossf = loss.item()
92
+ print(f"{k}/{num_steps} loss: {lossf:.4f}")
93
+
94
+ prof.step() # notify the profiler at end of each step
95
+
96
+ else:
97
+
98
+ # simple benchmarking
99
+ torch.cuda.synchronize()
100
+ for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark
101
+ t0 = time.time()
102
+ X, Y = get_batch('train')
103
+ for k in range(num_steps):
104
+ with ctx:
105
+ logits, loss = model(X, Y)
106
+ X, Y = get_batch('train')
107
+ optimizer.zero_grad(set_to_none=True)
108
+ loss.backward()
109
+ optimizer.step()
110
+ lossf = loss.item()
111
+ print(f"{k}/{num_steps} loss: {lossf:.4f}")
112
+ torch.cuda.synchronize()
113
+ t1 = time.time()
114
+ dt = t1-t0
115
+ mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
116
+ if stage == 1:
117
+ print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")
@@ -0,0 +1,8 @@
1
+ # evaluate the base gpt2
2
+ # n_layer=12, n_head=12, n_embd=768
3
+ # 124M parameters
4
+ batch_size = 8
5
+ eval_iters = 500 # use more iterations to get good estimate
6
+ eval_only = True
7
+ wandb_log = False
8
+ init_from = 'gpt2'
@@ -0,0 +1,8 @@
1
+ # evaluate the base gpt2
2
+ # n_layer=36, n_head=20, n_embd=1280
3
+ # 774M parameters
4
+ batch_size = 8
5
+ eval_iters = 500 # use more iterations to get good estimate
6
+ eval_only = True
7
+ wandb_log = False
8
+ init_from = 'gpt2-large'
@@ -0,0 +1,8 @@
1
+ # evaluate the base gpt2
2
+ # n_layer=24, n_head=16, n_embd=1024
3
+ # 350M parameters
4
+ batch_size = 8
5
+ eval_iters = 500 # use more iterations to get good estimate
6
+ eval_only = True
7
+ wandb_log = False
8
+ init_from = 'gpt2-medium'
@@ -0,0 +1,8 @@
1
+ # evaluate the base gpt2
2
+ # n_layer=48, n_head=25, n_embd=1600
3
+ # 1558M parameters
4
+ batch_size = 8
5
+ eval_iters = 500 # use more iterations to get good estimate
6
+ eval_only = True
7
+ wandb_log = False
8
+ init_from = 'gpt2-xl'
@@ -0,0 +1,25 @@
1
+ import time
2
+
3
+ out_dir = 'out-shakespeare'
4
+ eval_interval = 5
5
+ eval_iters = 40
6
+ wandb_log = False # feel free to turn on
7
+ wandb_project = 'shakespeare'
8
+ wandb_run_name = 'ft-' + str(time.time())
9
+
10
+ dataset = 'shakespeare'
11
+ init_from = 'gpt2-xl' # this is the largest GPT-2 model
12
+
13
+ # only save checkpoints if the validation loss improves
14
+ always_save_checkpoint = False
15
+
16
+ # the number of examples per iter:
17
+ # 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
18
+ # shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
19
+ batch_size = 1
20
+ gradient_accumulation_steps = 32
21
+ max_iters = 20
22
+
23
+ # finetune at constant LR
24
+ learning_rate = 3e-5
25
+ decay_lr = False
@@ -0,0 +1,25 @@
1
+ # config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB
2
+ # launch as the following (e.g. in a screen session) and wait ~5 days:
3
+ # $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py
4
+
5
+ wandb_log = True
6
+ wandb_project = 'owt'
7
+ wandb_run_name='gpt2-124M'
8
+
9
+ # these make the total batch size be ~0.5M
10
+ # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
11
+ batch_size = 12
12
+ block_size = 1024
13
+ gradient_accumulation_steps = 5 * 8
14
+
15
+ # this makes total number of tokens be 300B
16
+ max_iters = 600000
17
+ lr_decay_iters = 600000
18
+
19
+ # eval stuff
20
+ eval_interval = 1000
21
+ eval_iters = 200
22
+ log_interval = 10
23
+
24
+ # weight decay
25
+ weight_decay = 1e-1