gitarsenal-cli 1.9.72 → 1.9.74
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.venv_status.json +1 -1
- package/bin/gitarsenal.js +8 -31
- package/kill_claude/claude_code_agent.py +58 -37
- package/kill_claude/nanoGPT/.gitattributes +3 -0
- package/kill_claude/nanoGPT/LICENSE +21 -0
- package/kill_claude/nanoGPT/README.md +227 -0
- package/kill_claude/nanoGPT/assets/gpt2_124M_loss.png +0 -0
- package/kill_claude/nanoGPT/assets/nanogpt.jpg +0 -0
- package/kill_claude/nanoGPT/bench.py +117 -0
- package/kill_claude/nanoGPT/config/eval_gpt2.py +8 -0
- package/kill_claude/nanoGPT/config/eval_gpt2_large.py +8 -0
- package/kill_claude/nanoGPT/config/eval_gpt2_medium.py +8 -0
- package/kill_claude/nanoGPT/config/eval_gpt2_xl.py +8 -0
- package/kill_claude/nanoGPT/config/finetune_shakespeare.py +25 -0
- package/kill_claude/nanoGPT/config/train_gpt2.py +25 -0
- package/kill_claude/nanoGPT/config/train_shakespeare_char.py +37 -0
- package/kill_claude/nanoGPT/configurator.py +47 -0
- package/kill_claude/nanoGPT/data/openwebtext/prepare.py +81 -0
- package/kill_claude/nanoGPT/data/openwebtext/readme.md +15 -0
- package/kill_claude/nanoGPT/data/shakespeare/prepare.py +33 -0
- package/kill_claude/nanoGPT/data/shakespeare/readme.md +9 -0
- package/kill_claude/nanoGPT/data/shakespeare_char/prepare.py +68 -0
- package/kill_claude/nanoGPT/data/shakespeare_char/readme.md +9 -0
- package/kill_claude/nanoGPT/model.py +330 -0
- package/kill_claude/nanoGPT/sample.py +89 -0
- package/kill_claude/nanoGPT/scaling_laws.ipynb +792 -0
- package/kill_claude/nanoGPT/train.py +336 -0
- package/kill_claude/nanoGPT/transformer_sizing.ipynb +402 -0
- package/kill_claude/prompts/claude-code-tool-prompts.md +1 -0
- package/kill_claude/tools/__pycache__/bash_tool.cpython-313.pyc +0 -0
- package/kill_claude/tools/__pycache__/task_tool.cpython-313.pyc +0 -0
- package/kill_claude/tools/bash_tool.py +1 -0
- package/lib/sandbox.js +1 -8
- package/package.json +1 -1
- package/python/debug_modal_minimal.py +212 -0
- package/python/test_container.py +108 -17
- package/python/test_modalSandboxScript.py +65 -1097
package/.venv_status.json
CHANGED
@@ -1 +1 @@
-{"created":"2025-08-
+{"created":"2025-08-15T09:19:17.108Z","packages":["modal","gitingest","requests","anthropic"],"uv_version":"uv 0.8.4 (Homebrew 2025-07-30)"}
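For reference, the new status file records the environment's creation time, the packages installed, and the uv version. A minimal sketch of how such a file could be loaded and inspected (the field names follow the JSON shown above; the helper itself is hypothetical and not part of the package):

```python
import json
from pathlib import Path

def load_venv_status(path: str = ".venv_status.json") -> dict:
    """Load the venv status file written during setup (hypothetical helper)."""
    data = json.loads(Path(path).read_text())
    # Fields observed in the diff above: created, packages, uv_version
    print(f"created:    {data.get('created')}")
    print(f"packages:   {', '.join(data.get('packages', []))}")
    print(f"uv version: {data.get('uv_version')}")
    return data
```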
package/bin/gitarsenal.js
CHANGED
@@ -747,7 +747,6 @@ async function runContainerCommand(options) {
 let gpuCount = parseInt(options.gpuCount) || 1;
 let volumeName = options.volumeName || options.volume;
 let skipConfirmation = options.yes;
-let useApi = !options.manual;
 let setupCommands = options.setupCommands || [];
 
 if (!repoUrl) {
@@ -762,8 +761,8 @@ async function runContainerCommand(options) {
 repoUrl = answers.repoUrl;
 }
 
-//
-if (
+// Analyze repository for GPU recommendations (repository setup is now handled by Agent)
+if (repoUrl) {
 // Start a main spinner that will show overall progress
 const mainSpinner = ora('Analyzing repository...').start();
@@ -783,11 +782,7 @@ async function runContainerCommand(options) {
 previewAbort.abort();
 mainSpinner.succeed('Analysis complete!');
 printGpuTorchCudaSummary(fullData);
-
-setupCommands = fullData.commands;
-// Disable auto-detection since we already have commands
-useApi = false;
-}
+// Repository setup will be handled by Agent in container
 } else {
 // Full fetch failed, wait for preview and show its results
 mainSpinner.text = 'Waiting for preview analysis to complete...';
@@ -798,13 +793,13 @@ async function runContainerCommand(options) {
 } else {
 mainSpinner.fail('Analysis failed - both preview and full analysis timed out or failed');
 console.log(chalk.yellow('⚠️ Unable to analyze repository automatically.'));
-console.log(chalk.gray('
+console.log(chalk.gray('Repository setup will still be handled by Agent in container.'));
 }
 }
 } catch (error) {
 mainSpinner.fail(`Analysis failed: ${error.message}`);
 console.log(chalk.yellow('⚠️ Unable to analyze repository automatically.'));
-console.log(chalk.gray('
+console.log(chalk.gray('Repository setup will still be handled by Agent in container.'));
 }
 }
 
@@ -879,31 +874,14 @@ async function runContainerCommand(options) {
 volumeName = getDefaultVolumeName(repoUrl);
 }
 
-//
-if (!
-const apiAnswers = await inquirer.prompt([
-{
-type: 'confirm',
-name: 'useApi',
-message: 'Automatically detect setup commands for this repository?',
-default: true
-}
-]);
-
-useApi = apiAnswers.useApi;
-} else if (options.yes) {
-// If --yes flag is used, default to using API for setup command detection
-useApi = true;
-}
-
-// Only prompt for custom commands if auto-detection is disabled and no commands provided
-if (!useApi && setupCommands.length === 0) {
+// Prompt for custom setup commands only if no repo URL provided and no commands specified
+if (!repoUrl && !options.manual && setupCommands.length === 0) {
 const setupAnswers = await inquirer.prompt([
 {
 type: 'confirm',
 name: 'useCustomCommands',
 message: 'Provide custom setup commands?',
-default:
+default: false
 }
 ]);
 
@@ -953,7 +931,6 @@ async function runContainerCommand(options) {
 gpuCount,
 volumeName,
 setupCommands,
-useApi,
 yes: skipConfirmation,
 userId,
 userName,
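The net effect of these changes is that the separate "auto-detect setup commands" prompt and the useApi flag are gone: repository setup is delegated to the Agent inside the container, and the CLI only asks for custom commands when it has nothing else to go on. A minimal sketch of that decision, transliterated to Python for illustration (the option names mirror the JavaScript above; the function itself is hypothetical):

```python
def should_prompt_for_custom_commands(repo_url: str | None,
                                       manual: bool,
                                       setup_commands: list[str]) -> bool:
    """Mirror of the new gitarsenal.js condition: only prompt when there is
    no repo URL, --manual was not passed, and no setup commands were supplied."""
    return not repo_url and not manual and len(setup_commands) == 0

# Examples of the new behaviour:
assert should_prompt_for_custom_commands(None, False, [])
assert not should_prompt_for_custom_commands("https://github.com/karpathy/nanoGPT", False, [])
```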
package/kill_claude/claude_code_agent.py
CHANGED
@@ -372,21 +372,10 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
 
 # Execute tool calls
 if tool_calls:
-print(f"🤖 Agent requested {len(tool_calls)} tool call(s)")
 tool_results = []
 
 for i, tool_call in enumerate(tool_calls, 1):
-
-result = self.execute_tool_call(tool_call)
-
-# Display the tool result
-if result and result.strip():
-print(f"📋 Tool Result:")
-print(result)
-print() # Empty line for readability
-else:
-print("📋 Tool Result: (empty or no output)")
-print()
+result = self.execute_tool_call(tool_call, i, len(tool_calls))
 
 tool_results.append({
 "tool_use_id": tool_call.id,
@@ -408,12 +397,14 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
 except Exception as e:
 return f"Error calling Anthropic API: {str(e)}"
 
-def execute_tool_call(self, tool_call) -> str:
+def execute_tool_call(self, tool_call, call_num: int = 1, total_calls: int = 1) -> str:
 """
 Execute a tool call using the loaded tool implementations.
 
 Args:
 tool_call: The tool call object from Claude
+call_num: Current tool call number
+total_calls: Total number of tool calls
 
 Returns:
 Tool execution result as string
@@ -421,27 +412,48 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
 tool_name = tool_call.name
 tool_input = tool_call.input
 
-# Print tool usage information
-
+# Print clean tool usage information like Claude Code
+if total_calls > 1:
+print(f"🛠️ Using tool: {tool_name}")
+else:
+print(f"🛠️ Using tool: {tool_name}")
+
+# Show parameters in a clean format
 if tool_input:
-# Show key parameters (truncate long values)
 params_display = []
 for key, value in tool_input.items():
-if isinstance(value, str) and len(value) >
-params_display.append(f"{key}=
+if isinstance(value, str) and len(value) > 80:
+params_display.append(f"{key}={value[:80]}...")
+elif isinstance(value, list) and len(value) > 3:
+params_display.append(f"{key}=[{len(value)} items]")
 else:
-params_display.append(f"{key}={
+params_display.append(f"{key}={value}")
 print(f" Parameters: {', '.join(params_display)}")
-print() # Empty line for readability
 
 try:
-#
+# Execute the tool
 result = self._execute_builtin_tool(tool_name, tool_input)
-
+
+# Print success message
+print(f"\n✅ Tool {tool_name} completed successfully")
+
+# Print result in a clean format
+if result and result.strip():
+print(f"\n📋 Tool Result:")
+# Truncate very long results for readability
+if len(result) > 5000:
+print(result[:5000] + f"\n\n[Output truncated - showing first 5000 characters of {len(result)} total]")
+else:
+print(result)
+else:
+print(f"\n📋 Tool Result: (no output)")
+
+print() # Empty line for readability
 return result if result is not None else ""
 
 except Exception as e:
-print(f"❌ Tool {tool_name} failed: {str(e)}
+print(f"\n❌ Tool {tool_name} failed: {str(e)}")
+print()
 return f"Error executing {tool_name}: {str(e)}"
 
 def _execute_builtin_tool(self, tool_name: str, tool_input: Dict[str, Any]) -> str:
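The parameter display and result truncation above are self-contained rules: strings over 80 characters are clipped, lists longer than 3 items are summarized, and results over 5000 characters are cut. A minimal standalone sketch of the same formatting, using the thresholds shown in the hunk (the helper names are hypothetical):

```python
from typing import Any, Dict

def format_tool_params(tool_input: Dict[str, Any]) -> str:
    """Summarize tool parameters the way the diff above displays them."""
    parts = []
    for key, value in tool_input.items():
        if isinstance(value, str) and len(value) > 80:
            parts.append(f"{key}={value[:80]}...")        # clip long strings
        elif isinstance(value, list) and len(value) > 3:
            parts.append(f"{key}=[{len(value)} items]")    # summarize long lists
        else:
            parts.append(f"{key}={value}")
    return ", ".join(parts)

def truncate_result(result: str, limit: int = 5000) -> str:
    """Clip very long tool output, matching the 5000-character rule above."""
    if len(result) <= limit:
        return result
    return result[:limit] + f"\n\n[Output truncated - showing first {limit} characters of {len(result)} total]"

print(format_tool_params({"command": "ls -la", "paths": ["a", "b", "c", "d"]}))
```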
@@ -511,19 +523,30 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
 # Apply offset and limit
 start_idx = (offset or 1) - 1
 end_idx = start_idx + (limit or len(lines))
-
+selected_lines = lines[start_idx:end_idx]
+
+# If file is empty, return empty indicator
+if not lines:
+return "<system-reminder>File exists but has empty contents</system-reminder>"
 
 # Format with line numbers like Claude Code
 formatted_lines = []
-for i, line in enumerate(
+for i, line in enumerate(selected_lines):
 line_num = start_idx + i + 1
 # Remove trailing newline and truncate if too long
-clean_line = line.rstrip('\n')
+clean_line = line.rstrip('\n\r')
 if len(clean_line) > 2000:
 clean_line = clean_line[:2000] + "..."
 formatted_lines.append(f"{line_num:>5}→{clean_line}")
 
-
+# Add truncation note if we limited the output
+result = '\n'.join(formatted_lines)
+if limit and len(lines) > end_idx:
+result += f"\n... (showing lines {start_idx + 1}-{end_idx} of {len(lines)} total lines)"
+
+return result
+except UnicodeDecodeError:
+return f"Cannot read file (binary or non-UTF-8 encoding): {file_path}"
 except Exception as e:
 return f"Error reading file: {str(e)}"
 
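The file-reading branch now slices the requested window before formatting, numbers each line, clips lines over 2000 characters, and appends a note when the window is smaller than the file. A minimal sketch of that formatting in isolation, assuming the same 1-based offset/limit semantics as the hunk (the function name is hypothetical):

```python
def format_file_window(lines: list[str], offset: int | None = None, limit: int | None = None) -> str:
    """Render a slice of a file with right-aligned line numbers, like the diff above."""
    if not lines:
        return "<system-reminder>File exists but has empty contents</system-reminder>"
    start_idx = (offset or 1) - 1
    end_idx = start_idx + (limit or len(lines))
    formatted = []
    for i, line in enumerate(lines[start_idx:end_idx]):
        clean = line.rstrip('\n\r')
        if len(clean) > 2000:
            clean = clean[:2000] + "..."          # keep very long lines readable
        formatted.append(f"{start_idx + i + 1:>5}→{clean}")
    result = '\n'.join(formatted)
    if limit and len(lines) > end_idx:
        result += f"\n... (showing lines {start_idx + 1}-{end_idx} of {len(lines)} total lines)"
    return result

print(format_file_window(["first\n", "second\n", "third\n"], offset=1, limit=2))
```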
@@ -573,6 +596,10 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
 
 output_parts = []
 
+# If command failed, show exit code first
+if result.returncode != 0:
+output_parts.append(f"Exit code: {result.returncode}")
+
 # Add stdout if present
 if result.stdout and result.stdout.strip():
 output_parts.append(result.stdout.strip())
@@ -581,16 +608,9 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
 if result.stderr and result.stderr.strip():
 output_parts.append(result.stderr.strip())
 
-# If
-if result.returncode != 0:
-if not output_parts:
-output_parts.append(f"Command failed with exit code {result.returncode}")
-else:
-output_parts.insert(0, f"Exit code: {result.returncode}")
-
-# If no output at all but success, indicate success
+# If no output at all but success, indicate success like Claude Code
 if not output_parts and result.returncode == 0:
-
+return "<system>Tool ran without output or errors</system>"
 
 return "\n".join(output_parts)
 
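Taken together with the previous hunk, failed commands now report their exit code before any stdout/stderr, and a successful command with no output returns a sentinel string. A minimal sketch of the resulting output assembly, assuming a subprocess.CompletedProcess-like result object (the function name is hypothetical):

```python
import subprocess

def format_command_output(result: subprocess.CompletedProcess) -> str:
    """Assemble command output the way the updated bash tool does."""
    output_parts = []
    if result.returncode != 0:                        # failures lead with the exit code
        output_parts.append(f"Exit code: {result.returncode}")
    if result.stdout and result.stdout.strip():
        output_parts.append(result.stdout.strip())
    if result.stderr and result.stderr.strip():
        output_parts.append(result.stderr.strip())
    if not output_parts and result.returncode == 0:   # quiet success gets a sentinel
        return "<system>Tool ran without output or errors</system>"
    return "\n".join(output_parts)

print(format_command_output(subprocess.run(["echo", "hi"], capture_output=True, text=True)))
```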
@@ -744,7 +764,8 @@ The following {len(TOOL_SCHEMAS)} tools are loaded and available:
 
 print("🤖 Claude Code Agent:")
 response = self.process_query(user_input)
-
+if response.strip():
+print(response)
 
 except KeyboardInterrupt:
 print("\n\n👋 Goodbye!")
package/kill_claude/nanoGPT/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Andrej Karpathy
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
package/kill_claude/nanoGPT/README.md
ADDED
@@ -0,0 +1,227 @@
+
+# nanoGPT
+
+
+
+The simplest, fastest repository for training/finetuning medium-sized GPTs. It is a rewrite of [minGPT](https://github.com/karpathy/minGPT) that prioritizes teeth over education. Still under active development, but currently the file `train.py` reproduces GPT-2 (124M) on OpenWebText, running on a single 8XA100 40GB node in about 4 days of training. The code itself is plain and readable: `train.py` is a ~300-line boilerplate training loop and `model.py` a ~300-line GPT model definition, which can optionally load the GPT-2 weights from OpenAI. That's it.
+
+
+
+Because the code is so simple, it is very easy to hack to your needs, train new models from scratch, or finetune pretrained checkpoints (e.g. biggest one currently available as a starting point would be the GPT-2 1.3B model from OpenAI).
+
+## install
+
+```
+pip install torch numpy transformers datasets tiktoken wandb tqdm
+```
+
+Dependencies:
+
+- [pytorch](https://pytorch.org) <3
+- [numpy](https://numpy.org/install/) <3
+- `transformers` for huggingface transformers <3 (to load GPT-2 checkpoints)
+- `datasets` for huggingface datasets <3 (if you want to download + preprocess OpenWebText)
+- `tiktoken` for OpenAI's fast BPE code <3
+- `wandb` for optional logging <3
+- `tqdm` for progress bars <3
+
+## quick start
+
+If you are not a deep learning professional and you just want to feel the magic and get your feet wet, the fastest way to get started is to train a character-level GPT on the works of Shakespeare. First, we download it as a single (1MB) file and turn it from raw text into one large stream of integers:
+
+```sh
+python data/shakespeare_char/prepare.py
+```
+
+This creates a `train.bin` and `val.bin` in that data directory. Now it is time to train your GPT. The size of it very much depends on the computational resources of your system:
+
+**I have a GPU**. Great, we can quickly train a baby GPT with the settings provided in the [config/train_shakespeare_char.py](config/train_shakespeare_char.py) config file:
+
+```sh
+python train.py config/train_shakespeare_char.py
+```
+
+If you peek inside it, you'll see that we're training a GPT with a context size of up to 256 characters, 384 feature channels, and it is a 6-layer Transformer with 6 heads in each layer. On one A100 GPU this training run takes about 3 minutes and the best validation loss is 1.4697. Based on the configuration, the model checkpoints are being written into the `--out_dir` directory `out-shakespeare-char`. So once the training finishes we can sample from the best model by pointing the sampling script at this directory:
+
+```sh
+python sample.py --out_dir=out-shakespeare-char
+```
+
+This generates a few samples, for example:
+
+```
+ANGELO:
+And cowards it be strawn to my bed,
+And thrust the gates of my threats,
+Because he that ale away, and hang'd
+An one with him.
+
+DUKE VINCENTIO:
+I thank your eyes against it.
+
+DUKE VINCENTIO:
+Then will answer him to save the malm:
+And what have you tyrannous shall do this?
+
+DUKE VINCENTIO:
+If you have done evils of all disposition
+To end his power, the day of thrust for a common men
+That I leave, to fight with over-liking
+Hasting in a roseman.
+```
+
+lol `¯\_(ツ)_/¯`. Not bad for a character-level model after 3 minutes of training on a GPU. Better results are quite likely obtainable by instead finetuning a pretrained GPT-2 model on this dataset (see finetuning section later).
+
+**I only have a macbook** (or other cheap computer). No worries, we can still train a GPT but we want to dial things down a notch. I recommend getting the bleeding edge PyTorch nightly ([select it here](https://pytorch.org/get-started/locally/) when installing) as it is currently quite likely to make your code more efficient. But even without it, a simple train run could look as follows:
+
+```sh
+python train.py config/train_shakespeare_char.py --device=cpu --compile=False --eval_iters=20 --log_interval=1 --block_size=64 --batch_size=12 --n_layer=4 --n_head=4 --n_embd=128 --max_iters=2000 --lr_decay_iters=2000 --dropout=0.0
+```
+
+Here, since we are running on CPU instead of GPU we must set both `--device=cpu` and also turn off PyTorch 2.0 compile with `--compile=False`. Then when we evaluate we get a bit more noisy but faster estimate (`--eval_iters=20`, down from 200), our context size is only 64 characters instead of 256, and the batch size only 12 examples per iteration, not 64. We'll also use a much smaller Transformer (4 layers, 4 heads, 128 embedding size), and decrease the number of iterations to 2000 (and correspondingly usually decay the learning rate to around max_iters with `--lr_decay_iters`). Because our network is so small we also ease down on regularization (`--dropout=0.0`). This still runs in about ~3 minutes, but gets us a loss of only 1.88 and therefore also worse samples, but it's still good fun:
+
+```sh
+python sample.py --out_dir=out-shakespeare-char --device=cpu
+```
+Generates samples like this:
+
+```
+GLEORKEN VINGHARD III:
+Whell's the couse, the came light gacks,
+And the for mought you in Aut fries the not high shee
+bot thou the sought bechive in that to doth groan you,
+No relving thee post mose the wear
+```
+
+Not bad for ~3 minutes on a CPU, for a hint of the right character gestalt. If you're willing to wait longer, feel free to tune the hyperparameters, increase the size of the network, the context length (`--block_size`), the length of training, etc.
+
+Finally, on Apple Silicon Macbooks and with a recent PyTorch version make sure to add `--device=mps` (short for "Metal Performance Shaders"); PyTorch then uses the on-chip GPU that can *significantly* accelerate training (2-3X) and allow you to use larger networks. See [Issue 28](https://github.com/karpathy/nanoGPT/issues/28) for more.
+
+## reproducing GPT-2
+
+A more serious deep learning professional may be more interested in reproducing GPT-2 results. So here we go - we first tokenize the dataset, in this case the [OpenWebText](https://openwebtext2.readthedocs.io/en/latest/), an open reproduction of OpenAI's (private) WebText:
+
+```sh
+python data/openwebtext/prepare.py
+```
+
+This downloads and tokenizes the [OpenWebText](https://huggingface.co/datasets/openwebtext) dataset. It will create a `train.bin` and `val.bin` which holds the GPT2 BPE token ids in one sequence, stored as raw uint16 bytes. Then we're ready to kick off training. To reproduce GPT-2 (124M) you'll want at least an 8X A100 40GB node and run:
+
+```sh
+torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py
+```
+
+This will run for about 4 days using PyTorch Distributed Data Parallel (DDP) and go down to loss of ~2.85. Now, a GPT-2 model just evaluated on OWT gets a val loss of about 3.11, but if you finetune it it will come down to ~2.85 territory (due to an apparent domain gap), making the two models ~match.
+
+If you're in a cluster environment and you are blessed with multiple GPU nodes you can make GPU go brrrr e.g. across 2 nodes like:
+
+```sh
+# Run on the first (master) node with example IP 123.456.123.456:
+torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
+# Run on the worker node:
+torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
+```
+
+It is a good idea to benchmark your interconnect (e.g. iperf3). In particular, if you don't have Infiniband then also prepend `NCCL_IB_DISABLE=1` to the above launches. Your multinode training will work, but most likely _crawl_. By default checkpoints are periodically written to the `--out_dir`. We can sample from the model by simply `python sample.py`.
+
+Finally, to train on a single GPU simply run the `python train.py` script. Have a look at all of its args, the script tries to be very readable, hackable and transparent. You'll most likely want to tune a number of those variables depending on your needs.
+
+## baselines
+
+OpenAI GPT-2 checkpoints allow us to get some baselines in place for openwebtext. We can get the numbers as follows:
+
+```sh
+$ python train.py config/eval_gpt2.py
+$ python train.py config/eval_gpt2_medium.py
+$ python train.py config/eval_gpt2_large.py
+$ python train.py config/eval_gpt2_xl.py
+```
+
+and observe the following losses on train and val:
+
+| model | params | train loss | val loss |
+| ------| ------ | ---------- | -------- |
+| gpt2 | 124M | 3.11 | 3.12 |
+| gpt2-medium | 350M | 2.85 | 2.84 |
+| gpt2-large | 774M | 2.66 | 2.67 |
+| gpt2-xl | 1558M | 2.56 | 2.54 |
+
+However, we have to note that GPT-2 was trained on (closed, never released) WebText, while OpenWebText is just a best-effort open reproduction of this dataset. This means there is a dataset domain gap. Indeed, taking the GPT-2 (124M) checkpoint and finetuning on OWT directly for a while reaches loss down to ~2.85. This then becomes the more appropriate baseline w.r.t. reproduction.
+
+## finetuning
+
+Finetuning is no different than training, we just make sure to initialize from a pretrained model and train with a smaller learning rate. For an example of how to finetune a GPT on new text go to `data/shakespeare` and run `prepare.py` to download the tiny shakespeare dataset and render it into a `train.bin` and `val.bin`, using the OpenAI BPE tokenizer from GPT-2. Unlike OpenWebText this will run in seconds. Finetuning can take very little time, e.g. on a single GPU just a few minutes. Run an example finetuning like:
+
+```sh
+python train.py config/finetune_shakespeare.py
+```
+
+This will load the config parameter overrides in `config/finetune_shakespeare.py` (I didn't tune them much though). Basically, we initialize from a GPT2 checkpoint with `init_from` and train as normal, except shorter and with a small learning rate. If you're running out of memory try decreasing the model size (they are `{'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}`) or possibly decreasing the `block_size` (context length). The best checkpoint (lowest validation loss) will be in the `out_dir` directory, e.g. in `out-shakespeare` by default, per the config file. You can then run the code in `sample.py --out_dir=out-shakespeare`:
+
+```
+THEODORE:
+Thou shalt sell me to the highest bidder: if I die,
+I sell thee to the first; if I go mad,
+I sell thee to the second; if I
+lie, I sell thee to the third; if I slay,
+I sell thee to the fourth: so buy or sell,
+I tell thee again, thou shalt not sell my
+possession.
+
+JULIET:
+And if thou steal, thou shalt not sell thyself.
+
+THEODORE:
+I do not steal; I sell the stolen goods.
+
+THEODORE:
+Thou know'st not what thou sell'st; thou, a woman,
+Thou art ever a victim, a thing of no worth:
+Thou hast no right, no right, but to be sold.
+```
+
+Whoa there, GPT, entering some dark place over there. I didn't really tune the hyperparameters in the config too much, feel free to try!
+
+## sampling / inference
+
+Use the script `sample.py` to sample either from pre-trained GPT-2 models released by OpenAI, or from a model you trained yourself. For example, here is a way to sample from the largest available `gpt2-xl` model:
+
+```sh
+python sample.py \
+--init_from=gpt2-xl \
+--start="What is the answer to life, the universe, and everything?" \
+--num_samples=5 --max_new_tokens=100
+```
+
+If you'd like to sample from a model you trained, use the `--out_dir` to point the code appropriately. You can also prompt the model with some text from a file, e.g. ```python sample.py --start=FILE:prompt.txt```.
+
+## efficiency notes
+
+For simple model benchmarking and profiling, `bench.py` might be useful. It's identical to what happens in the meat of the training loop of `train.py`, but omits much of the other complexities.
+
+Note that the code by default uses [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/). At the time of writing (Dec 29, 2022) this makes `torch.compile()` available in the nightly release. The improvement from the one line of code is noticeable, e.g. cutting down iteration time from ~250ms / iter to 135ms / iter. Nice work PyTorch team!
+
+## todos
+
+- Investigate and add FSDP instead of DDP
+- Eval zero-shot perplexities on standard evals (e.g. LAMBADA? HELM? etc.)
+- Finetune the finetuning script, I think the hyperparams are not great
+- Schedule for linear batch size increase during training
+- Incorporate other embeddings (rotary, alibi)
+- Separate out the optim buffers from model params in checkpoints I think
+- Additional logging around network health (e.g. gradient clip events, magnitudes)
+- Few more investigations around better init etc.
+
+## troubleshooting
+
+Note that by default this repo uses PyTorch 2.0 (i.e. `torch.compile`). This is fairly new and experimental, and not yet available on all platforms (e.g. Windows). If you're running into related error messages try to disable this by adding `--compile=False` flag. This will slow down the code but at least it will run.
+
+For some context on this repository, GPT, and language modeling it might be helpful to watch my [Zero To Hero series](https://karpathy.ai/zero-to-hero.html). Specifically, the [GPT video](https://www.youtube.com/watch?v=kCc8FmEb1nY) is popular if you have some prior language modeling context.
+
+For more questions/discussions feel free to stop by **#nanoGPT** on Discord:
+
+[](https://discord.gg/3zy8kqD9Cp)
+
+## acknowledgements
+
+All nanoGPT experiments are powered by GPUs on [Lambda labs](https://lambdalabs.com), my favorite Cloud GPU provider. Thank you Lambda labs for sponsoring nanoGPT!
package/kill_claude/nanoGPT/assets/gpt2_124M_loss.png
ADDED
Binary file
package/kill_claude/nanoGPT/assets/nanogpt.jpg
ADDED
Binary file
package/kill_claude/nanoGPT/bench.py
ADDED
@@ -0,0 +1,117 @@
+"""
+A much shorter version of train.py for benchmarking
+"""
+import os
+from contextlib import nullcontext
+import numpy as np
+import time
+import torch
+from model import GPTConfig, GPT
+
+# -----------------------------------------------------------------------------
+batch_size = 12
+block_size = 1024
+bias = False
+real_data = True
+seed = 1337
+device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
+compile = True # use PyTorch 2.0 to compile the model to be faster
+profile = False # use pytorch profiler, or just simple benchmarking?
+exec(open('configurator.py').read()) # overrides from command line or config file
+# -----------------------------------------------------------------------------
+
+torch.manual_seed(seed)
+torch.cuda.manual_seed(seed)
+torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
+torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
+device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
+ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+
+# data loading init
+if real_data:
+    dataset = 'openwebtext'
+    data_dir = os.path.join('data', dataset)
+    train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+    def get_batch(split):
+        data = train_data # note ignore split in benchmarking script
+        ix = torch.randint(len(data) - block_size, (batch_size,))
+        x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
+        y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
+        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
+        return x, y
+else:
+    # alternatively, if fixed data is desired to not care about data loading
+    x = torch.randint(50304, (batch_size, block_size), device=device)
+    y = torch.randint(50304, (batch_size, block_size), device=device)
+    get_batch = lambda split: (x, y)
+
+# model init
+gptconf = GPTConfig(
+    block_size = block_size, # how far back does the model look? i.e. context size
+    n_layer = 12, n_head = 12, n_embd = 768, # size of the model
+    dropout = 0, # for determinism
+    bias = bias,
+)
+model = GPT(gptconf)
+model.to(device)
+
+optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)
+
+if compile:
+    print("Compiling model...")
+    model = torch.compile(model) # pytorch 2.0
+
+if profile:
+    # useful docs on pytorch profiler:
+    # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
+    # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile
+    wait, warmup, active = 5, 5, 5
+    num_steps = wait + warmup + active
+    with torch.profiler.profile(
+        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
+        schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
+        on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
+        record_shapes=False,
+        profile_memory=False,
+        with_stack=False, # incurs an additional overhead, disable if not needed
+        with_flops=True,
+        with_modules=False, # only for torchscript models atm
+    ) as prof:
+
+        X, Y = get_batch('train')
+        for k in range(num_steps):
+            with ctx:
+                logits, loss = model(X, Y)
+            X, Y = get_batch('train')
+            optimizer.zero_grad(set_to_none=True)
+            loss.backward()
+            optimizer.step()
+            lossf = loss.item()
+            print(f"{k}/{num_steps} loss: {lossf:.4f}")
+
+            prof.step() # notify the profiler at end of each step
+
+else:
+
+    # simple benchmarking
+    torch.cuda.synchronize()
+    for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark
+        t0 = time.time()
+        X, Y = get_batch('train')
+        for k in range(num_steps):
+            with ctx:
+                logits, loss = model(X, Y)
+            X, Y = get_batch('train')
+            optimizer.zero_grad(set_to_none=True)
+            loss.backward()
+            optimizer.step()
+            lossf = loss.item()
+            print(f"{k}/{num_steps} loss: {lossf:.4f}")
+        torch.cuda.synchronize()
+        t1 = time.time()
+        dt = t1-t0
+        mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
+        if stage == 1:
+            print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")