nanogpt 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +42 -0
- data/README.md +102 -0
- data/bin/bench +210 -0
- data/bin/sample +76 -0
- data/bin/train +82 -0
- data/config/train_gpt2.json +19 -0
- data/config/train_shakespeare.json +14 -0
- data/config/train_shakespeare_char.json +14 -0
- data/data/openwebtext/prepare.rb +287 -0
- data/data/shakespeare/prepare.rb +61 -0
- data/data/shakespeare_char/input.txt +40000 -0
- data/data/shakespeare_char/prepare.rb +73 -0
- data/exe/nanogpt +338 -0
- data/lib/nano_gpt/config.rb +42 -0
- data/lib/nano_gpt/data_loader.rb +74 -0
- data/lib/nano_gpt/device.rb +56 -0
- data/lib/nano_gpt/layers/block.rb +25 -0
- data/lib/nano_gpt/layers/causal_self_attention.rb +73 -0
- data/lib/nano_gpt/layers/layer_norm.rb +21 -0
- data/lib/nano_gpt/layers/mlp.rb +23 -0
- data/lib/nano_gpt/lr_scheduler.rb +42 -0
- data/lib/nano_gpt/model.rb +218 -0
- data/lib/nano_gpt/tokenizer.rb +106 -0
- data/lib/nano_gpt/train_config.rb +259 -0
- data/lib/nano_gpt/trainer.rb +221 -0
- data/lib/nano_gpt/version.rb +5 -0
- data/lib/nano_gpt.rb +18 -0
- data/nanogpt.gemspec +37 -0
- metadata +133 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: b10e62747a63be10b519fdd8eb1eaabfc63214b01ab553d4230af865a0a8f327
+  data.tar.gz: 69db126e3d02c897045e543981223d1d29d72f7625bd381e3ba4badee88b6d20
+SHA512:
+  metadata.gz: 97f5846373224b889f22b80302a05271c7ab0276037e519c19c58cc0954aa982a4d3ac9992766aa73f6d1eec1dfe2b96b8a750414dd02818529e37789671cd87
+  data.tar.gz: accd2112564f004cc763dbaac418f35ce24426d4d86e53935a32bdf6411659463ed6cb8e9415146136ee78fa79748e39a1e506b5c44848a7b6e503dfee86e310
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,42 @@
+PATH
+  remote: .
+  specs:
+    nanogpt (0.1.0)
+      numo-narray (~> 0.9)
+      tiktoken_ruby (~> 0.0)
+      torch-rb (~> 0.14)
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    diff-lcs (1.6.2)
+    numo-narray (0.9.2.1)
+    parquet (0.7.3-arm64-darwin)
+    rice (4.7.1)
+    rspec (3.13.2)
+      rspec-core (~> 3.13.0)
+      rspec-expectations (~> 3.13.0)
+      rspec-mocks (~> 3.13.0)
+    rspec-core (3.13.6)
+      rspec-support (~> 3.13.0)
+    rspec-expectations (3.13.5)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-mocks (3.13.7)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-support (3.13.6)
+    tiktoken_ruby (0.0.13-arm64-darwin)
+    torch-rb (0.22.2)
+      rice (>= 4.7)
+
+PLATFORMS
+  arm64-darwin-24
+
+DEPENDENCIES
+  nanogpt!
+  parquet (~> 0.5)
+  rspec (~> 3.12)
+
+BUNDLED WITH
+   2.7.2
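The body of data/Gemfile is collapsed in this view, but the lockfile above pins down how to read it: the `nanogpt!` entry under DEPENDENCIES means the Gemfile defers to the gemspec for runtime dependencies, while `parquet` and `rspec` are declared directly. A plausible reconstruction, offered as an illustration of that mapping rather than as the published file:

```ruby
# Hypothetical data/Gemfile inferred from Gemfile.lock; not the actual file.
source "https://rubygems.org"

# Pull the runtime dependencies (numo-narray, tiktoken_ruby, torch-rb)
# from nanogpt.gemspec, which produces the "nanogpt!" lockfile entry.
gemspec

# Extras matching the DEPENDENCIES section of the lockfile.
gem "parquet", "~> 0.5"
gem "rspec", "~> 3.12"
```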
data/README.md
ADDED
@@ -0,0 +1,102 @@
+# nanoGPT
+
+A Ruby port of Karpathy's [nanoGPT](https://github.com/karpathy/nanoGPT). Train GPT-2-style language models from scratch using [torch.rb](https://github.com/ankane/torch.rb).
+
+Built for Ruby developers who want to understand how LLMs work by building one.
+
+## Quick Start
+
+```bash
+gem install nanogpt
+
+# Prepare the Shakespeare dataset with the character-level tokenizer
+nanogpt prepare shakespeare_char
+
+# Train (use MPS on Apple Silicon for a ~17x speedup)
+nanogpt train --dataset=shakespeare_char --device=mps
+
+# Generate text
+nanogpt sample --dataset=shakespeare_char
+```
+
+Or from source:
+
+```bash
+git clone https://github.com/khasinski/nanogpt-rb
+cd nanogpt-rb
+bundle install
+
+# Prepare data
+bundle exec ruby data/shakespeare_char/prepare.rb
+
+# Train
+bundle exec exe/nanogpt train --dataset=shakespeare_char --device=mps
+
+# Sample
+bundle exec exe/nanogpt sample --dataset=shakespeare_char
+```
+
+## Performance (M1 Max)
+
+Training the default 10.65M parameter model on Shakespeare:
+
+| Device | Time/iter | Notes                         |
+|--------|-----------|-------------------------------|
+| MPS    | ~500ms    | Recommended for Apple Silicon |
+| CPU    | ~8,500ms  | ~17x slower                   |
+
+After ~2000 iterations (~20 min on MPS), the model generates coherent Shakespeare-like text.
+
+## Commands
+
+```bash
+nanogpt train [options]   # Train a model
+nanogpt sample [options]  # Generate text from a trained model
+nanogpt bench [options]   # Run performance benchmarks
+```
+
+### Training Options
+
+```bash
+--dataset=NAME       # Dataset to use (default: shakespeare_char)
+--device=DEVICE      # cpu or mps; cuda might work too 🤞 (default: auto)
+--max_iters=N        # Training iterations (default: 5000)
+--batch_size=N       # Batch size (default: 64)
+--block_size=N       # Context length (default: 256)
+--n_layer=N          # Transformer layers (default: 6)
+--n_head=N           # Attention heads (default: 6)
+--n_embd=N           # Embedding dimension (default: 384)
+--learning_rate=F    # Learning rate (default: 1e-3)
+--config=FILE        # Load settings from a JSON file
+```
+
+### Sampling Options
+
+```bash
+--dataset=NAME       # Dataset (for the tokenizer)
+--out_dir=DIR        # Checkpoint directory
+--num_samples=N      # Number of samples to generate
+--max_new_tokens=N   # Tokens per sample (default: 500)
+--temperature=F      # Sampling temperature (default: 0.8)
+--top_k=N            # Top-k sampling (default: 200)
+```
+
+## Features
+
+- Full GPT-2 architecture (attention, MLP, layer norm, embeddings)
+- MPS (Metal) and CUDA GPU acceleration via torch.rb
+- Flash attention when dropout=0 (~5x faster attention)
+- Cosine learning-rate schedule with warmup
+- Gradient accumulation for larger effective batch sizes
+- Checkpointing and resumption
+- Character-level and GPT-2 BPE tokenizers
+
+## Requirements
+
+- Ruby >= 3.1
+- LibTorch (installed automatically with torch-rb)
+- For MPS: macOS 12.3+ with Apple Silicon
+
+## License
+
+MIT
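The CLI commands above wrap a small library API that data/bin/sample (below) exercises directly. A minimal sketch of driving it from Ruby, assuming a checkpoint has already been trained into out-shakespeare-char:

```ruby
require "nano_gpt"

# Rebuild the model from a saved checkpoint (the same steps bin/sample performs).
checkpoint = Torch.load("out-shakespeare-char/ckpt.pt")
model_args = checkpoint["model_args"].transform_keys(&:to_sym)
model = NanoGPT::GPT.new(NanoGPT::GPTConfig.new(**model_args))
model.load_state_dict(checkpoint["model"])
model.eval

# Encode a prompt with the dataset's tokenizer and generate 100 new tokens.
tokenizer = NanoGPT::Tokenizer.for_dataset("data/shakespeare_char")
x = Torch.tensor([tokenizer.encode("ROMEO:")], dtype: :long)
y = model.generate(x, 100, temperature: 0.8, top_k: 200)
puts tokenizer.decode(y[0].to_a)
```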
data/bin/bench
ADDED
@@ -0,0 +1,210 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+$stdout.sync = true
+
+# Benchmark script for measuring training performance.
+# A shorter version of bin/train focused on benchmarking.
+#
+# Usage:
+#   bundle exec ruby bin/bench
+#   bundle exec ruby bin/bench --batch_size=8 --block_size=512
+#   bundle exec ruby bin/bench --real_data=false # use random data
+
+require_relative "../lib/nano_gpt"
+
+# Default configuration (GPT-2 style model)
+CONFIG = {
+  batch_size: 12,
+  block_size: 1024,
+  n_layer: 12,
+  n_head: 12,
+  n_embd: 768,
+  dropout: 0.0,
+  bias: false,
+  real_data: true,
+  dataset: "openwebtext",
+  seed: 1337,
+  device: "auto"
+}.freeze
+
+def parse_args(args, config)
+  result = config.dup
+
+  args.each do |arg|
+    next unless arg.start_with?("--") && arg.include?("=")
+
+    key, val = arg[2..].split("=", 2)
+    key = key.to_sym
+
+    unless result.key?(key)
+      puts "Warning: Unknown config key: #{key}"
+      next
+    end
+
+    result[key] = case result[key]
+                  when Integer then val.to_i
+                  when Float then val.to_f
+                  when TrueClass, FalseClass then val.downcase == "true"
+                  else val
+                  end
+  end
+
+  result
+end
+
+def get_batch_from_data(data, batch_size, block_size, device)
+  max_start = data.size - block_size - 1
+  indices = Array.new(batch_size) { rand(0..max_start) }
+
+  x_arrays = []
+  y_arrays = []
+
+  indices.each do |i|
+    x_arrays << data[i, block_size]
+    y_arrays << data[i + 1, block_size]
+  end
+
+  x = Torch.tensor(x_arrays, dtype: :long)
+  y = Torch.tensor(y_arrays, dtype: :long)
+
+  if device != "cpu"
+    x = x.to(device)
+    y = y.to(device)
+  end
+
+  [x, y]
+end
+
+def main
+  config = parse_args(ARGV, CONFIG)
+
+  puts "=" * 60
+  puts "NanoGPT Benchmark"
+  puts "=" * 60
+  puts ""
+  puts "Configuration:"
+  puts "  batch_size: #{config[:batch_size]}"
+  puts "  block_size: #{config[:block_size]}"
+  puts "  n_layer: #{config[:n_layer]}"
+  puts "  n_head: #{config[:n_head]}"
+  puts "  n_embd: #{config[:n_embd]}"
+  puts "  real_data: #{config[:real_data]}"
+  puts ""
+
+  # Resolve device
+  if config[:device] == "auto"
+    config[:device] = NanoGPT::Device.auto
+  end
+  device = config[:device]
+  puts "Device: #{device}"
+
+  Torch.manual_seed(config[:seed])
+
+  # Data loading
+  if config[:real_data]
+    data_dir = File.join("data", config[:dataset])
+    train_bin = File.join(data_dir, "train.bin")
+
+    unless File.exist?(train_bin)
+      puts ""
+      puts "Warning: #{train_bin} not found, using random data instead."
+      puts "To use real data, run: bundle exec ruby data/#{config[:dataset]}/prepare.rb"
+      puts ""
+      config[:real_data] = false
+    end
+  end
+
+  if config[:real_data]
+    # Load training data
+    bytes = File.binread(File.join("data", config[:dataset], "train.bin"))
+    train_data = bytes.unpack("S<*") # uint16 little-endian
+    puts "Loaded #{train_data.size} tokens from #{config[:dataset]}"
+
+    get_batch = lambda do
+      get_batch_from_data(train_data, config[:batch_size], config[:block_size], device)
+    end
+  else
+    # Use random data
+    vocab_size = 50304 # GPT-2 vocab size rounded up for efficiency
+    puts "Using random data (vocab_size=#{vocab_size})"
+
+    get_batch = lambda do
+      x = Torch.randint(vocab_size, [config[:batch_size], config[:block_size]], dtype: :long)
+      y = Torch.randint(vocab_size, [config[:batch_size], config[:block_size]], dtype: :long)
+      x = x.to(device) if device != "cpu"
+      y = y.to(device) if device != "cpu"
+      [x, y]
+    end
+  end
+
+  # Model init
+  puts ""
+  puts "Initializing model..."
+  model_config = NanoGPT::GPTConfig.new(
+    block_size: config[:block_size],
+    vocab_size: 50304, # GPT-2 vocab rounded up
+    n_layer: config[:n_layer],
+    n_head: config[:n_head],
+    n_embd: config[:n_embd],
+    dropout: config[:dropout],
+    bias: config[:bias]
+  )
+
+  model = NanoGPT::GPT.new(model_config)
+  model.to(device) if device != "cpu"
+
+  # Optimizer
+  optimizer = model.configure_optimizers(
+    weight_decay: 1e-2,
+    learning_rate: 1e-4,
+    betas: [0.9, 0.95],
+    device_type: NanoGPT::Device.type(device)
+  )
+
+  puts ""
+  puts "Starting benchmark..."
+  puts "-" * 60
+
+  # Benchmark: burn-in phase then measurement phase
+  [{ name: "burn-in", steps: 10 }, { name: "benchmark", steps: 20 }].each do |phase|
+    puts ""
+    puts "Phase: #{phase[:name]} (#{phase[:steps]} steps)"
+
+    x, y = get_batch.call
+    t0 = Time.now
+
+    phase[:steps].times do |k|
+      # Forward pass
+      _logits, loss = model.forward(x, targets: y)
+
+      # Get next batch
+      x, y = get_batch.call
+
+      # Backward pass
+      optimizer.zero_grad
+      loss.backward
+      optimizer.step
+
+      loss_val = loss.item
+      puts "  #{k}/#{phase[:steps]} loss: #{format('%.4f', loss_val)}"
+    end
+
+    t1 = Time.now
+    dt = t1 - t0
+
+    if phase[:name] == "benchmark"
+      mfu = model.estimate_mfu(config[:batch_size] * phase[:steps], dt)
+      time_per_iter = dt / phase[:steps] * 1000
+
+      puts ""
+      puts "=" * 60
+      puts "Results:"
+      puts "  Time per iteration: #{format('%.2f', time_per_iter)}ms"
+      puts "  MFU: #{format('%.2f', mfu * 100)}%"
+      puts "=" * 60
+    end
+  end
+end
+
+main
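The `estimate_mfu` call reports model FLOPs utilization; its implementation lives in data/lib/nano_gpt/model.rb, which this diff does not show. Upstream nanoGPT derives it from the PaLM-paper FLOPs estimate, so the Ruby version presumably resembles this sketch (the method body and the 10.4 TFLOPS peak figure are assumptions, not the gem's code):

```ruby
# Sketch of an MFU estimate in the style of upstream nanoGPT.
# n_params = parameter count; l, h, q, t = layers, heads, head dim, seq length.
# Each token costs roughly 6*N + 12*l*h*q*t FLOPs for a forward+backward pass.
def estimate_mfu(fwdbwd_per_iter, dt, n_params:, n_layer:, n_head:, n_embd:, block_size:)
  l, h, q, t = n_layer, n_head, n_embd / n_head, block_size
  flops_per_token = 6 * n_params + 12 * l * h * q * t
  flops_per_iter = flops_per_token * t * fwdbwd_per_iter
  flops_achieved = flops_per_iter / dt # FLOPs per second actually achieved
  flops_promised = 10.4e12             # assumed peak, e.g. M1 Max GPU FP32
  flops_achieved / flops_promised
end
```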
data/bin/sample
ADDED
@@ -0,0 +1,76 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Generate text from a trained model
+# Usage:
+#   bundle exec ruby bin/sample [options]
+#   bundle exec ruby bin/sample --config=config/sample.json
+#   bundle exec ruby bin/sample --out_dir=out-shakespeare --num_samples=3
+
+require_relative "../lib/nano_gpt"
+
+def main
+  config = NanoGPT::SampleConfig.load(ARGV)
+
+  # Resolve device (auto-detect if "auto")
+  if config[:device] == "auto"
+    config[:device] = NanoGPT::Device.auto
+    puts "Auto-detected device: #{config[:device]}"
+  end
+  device = config[:device]
+
+  Torch.manual_seed(config[:seed])
+
+  # Load checkpoint
+  ckpt_path = File.join(config[:out_dir], "ckpt.pt")
+  unless File.exist?(ckpt_path)
+    puts "Error: Checkpoint not found at #{ckpt_path}"
+    puts "Train a model first with: bundle exec ruby bin/train"
+    exit 1
+  end
+
+  puts "Loading checkpoint from #{ckpt_path}..."
+  checkpoint = Torch.load(ckpt_path)
+
+  # Recreate model from checkpoint
+  model_args = checkpoint["model_args"].transform_keys(&:to_sym)
+  model_config = NanoGPT::GPTConfig.new(**model_args)
+  model = NanoGPT::GPT.new(model_config)
+  model.load_state_dict(checkpoint["model"])
+  model.to(device) if device != "cpu"
+  model.eval
+
+  # Load tokenizer (auto-detects character-level vs GPT-2 BPE)
+  dataset_dir = File.join("data", config[:dataset])
+  tokenizer = NanoGPT::Tokenizer.for_dataset(dataset_dir)
+  puts "number of parameters: #{model.num_params / 1e6}M"
+
+  # Handle start text
+  start_text = config[:start]
+  if start_text.start_with?("FILE:")
+    start_text = File.read(start_text[5..])
+  end
+
+  # Encode starting prompt
+  start_ids = tokenizer.encode(start_text)
+  x = Torch.tensor([start_ids], dtype: :long, device: device)
+
+  puts "Generating #{config[:num_samples]} samples..."
+  puts "=" * 50
+
+  # Generate samples
+  config[:num_samples].times do |k|
+    y = model.generate(
+      x,
+      config[:max_new_tokens],
+      temperature: config[:temperature],
+      top_k: config[:top_k]
+    )
+
+    output = tokenizer.decode(y[0].to_a)
+    puts output
+    puts "-" * 50
+  end
+end
+
+main
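`model.generate`'s `temperature` and `top_k` arguments follow the usual nanoGPT sampling loop. The gem's model.rb is not part of this diff, but the core of each sampling step typically looks like the sketch below (the torch.rb indexing details are assumptions, not the gem's code):

```ruby
# One sampling step for a batch of one: temperature-scale the last position's
# logits, mask everything below the k-th best score, sample from the softmax.
logits = logits[0, -1] / temperature # 1-D tensor of vocabulary scores
if top_k
  values, _indices = logits.topk([top_k, logits.size(0)].min)
  cutoff = values[-1]                # k-th largest logit
  logits = Torch.where(logits < cutoff, Torch.tensor(-Float::INFINITY), logits)
end
probs = Torch::NN::Functional.softmax(logits, dim: -1)
next_id = Torch.multinomial(probs, 1) # draw one token id from the distribution
```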
data/bin/train
ADDED
@@ -0,0 +1,82 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+$stdout.sync = true
+
+# Train a GPT model
+# Usage:
+#   bundle exec ruby bin/train [options]
+#   bundle exec ruby bin/train --config=config/train_shakespeare.json
+#   bundle exec ruby bin/train --config=config/train_gpt2.json --max_iters=1000
+
+require_relative "../lib/nano_gpt"
+
+def main
+  config = NanoGPT::TrainConfig.load(ARGV)
+
+  # Resolve device (auto-detect if "auto")
+  if config[:device] == "auto"
+    config[:device] = NanoGPT::Device.auto
+    puts "Auto-detected device: #{config[:device]}"
+  end
+
+  # Load tokenizer to get vocab_size (auto-detects char-level vs GPT-2 BPE)
+  data_dir = File.join("data", config[:dataset])
+  train_bin = File.join(data_dir, "train.bin")
+
+  unless File.exist?(train_bin)
+    puts "Error: #{train_bin} not found. Run the data preparation script first:"
+    puts "  bundle exec ruby data/#{config[:dataset]}/prepare.rb"
+    exit 1
+  end
+
+  tokenizer = NanoGPT::Tokenizer.for_dataset(data_dir)
+  tokenizer_type = tokenizer.is_a?(NanoGPT::GPT2Tokenizer) ? "GPT-2 BPE" : "character-level"
+  puts "Loaded #{tokenizer_type} tokenizer with vocab_size=#{tokenizer.vocab_size}"
+
+  # Create model config
+  model_config = NanoGPT::GPTConfig.new(
+    block_size: config[:block_size],
+    vocab_size: tokenizer.vocab_size,
+    n_layer: config[:n_layer],
+    n_head: config[:n_head],
+    n_embd: config[:n_embd],
+    dropout: config[:dropout],
+    bias: config[:bias]
+  )
+
+  # Create model
+  model = NanoGPT::GPT.new(model_config)
+
+  # Move model to device (MPS, CUDA, or CPU)
+  device = config[:device]
+  if device != "cpu"
+    model.to(device)
+    puts "Model moved to #{device}"
+  end
+
+  # Create data loader
+  data_loader = NanoGPT::DataLoader.new(
+    data_dir: data_dir,
+    block_size: config[:block_size],
+    batch_size: config[:batch_size],
+    device: config[:device]
+  )
+
+  puts "Train data: #{data_loader.train_size} tokens"
+  puts "Val data: #{data_loader.val_size} tokens"
+
+  # Create trainer
+  trainer = NanoGPT::Trainer.new(
+    model: model,
+    data_loader: data_loader,
+    config: config.to_h
+  )
+
+  # Train!
+  trainer.train
+
+  puts "\nTraining complete! Checkpoint saved to #{config[:out_dir]}/ckpt.pt"
+end
+
+main
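The `warmup_iters`, `lr_decay_iters`, and `min_lr` keys in the configs below feed the cosine schedule in data/lib/nano_gpt/lr_scheduler.rb, which is also not shown in this diff. Assuming it mirrors upstream nanoGPT, the schedule works out to the following sketch (defaults taken from config/train_gpt2.json; the gem's actual implementation may differ in detail):

```ruby
# Cosine learning-rate decay with linear warmup, as in upstream nanoGPT.
def get_lr(it, learning_rate: 6e-4, warmup_iters: 2000, lr_decay_iters: 600_000, min_lr: 6e-5)
  return learning_rate * it / warmup_iters if it < warmup_iters # 1) linear warmup
  return min_lr if it > lr_decay_iters                          # 2) floor after decay ends
  decay_ratio = (it - warmup_iters).to_f / (lr_decay_iters - warmup_iters)
  coeff = 0.5 * (1.0 + Math.cos(Math::PI * decay_ratio))        # 3) cosine from 1 down to 0
  min_lr + coeff * (learning_rate - min_lr)
end
```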
data/config/train_gpt2.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "out_dir": "out-gpt2",
+  "dataset": "openwebtext",
+  "batch_size": 12,
+  "block_size": 1024,
+  "n_layer": 12,
+  "n_head": 12,
+  "n_embd": 768,
+  "dropout": 0.0,
+  "bias": true,
+  "learning_rate": 6e-4,
+  "max_iters": 600000,
+  "warmup_iters": 2000,
+  "lr_decay_iters": 600000,
+  "min_lr": 6e-5,
+  "eval_interval": 1000,
+  "eval_iters": 200,
+  "device": "auto"
+}
data/config/train_shakespeare.json
ADDED
@@ -0,0 +1,14 @@
+{
+  "out_dir": "out-shakespeare",
+  "dataset": "shakespeare",
+  "batch_size": 64,
+  "block_size": 256,
+  "n_layer": 6,
+  "n_head": 6,
+  "n_embd": 384,
+  "dropout": 0.2,
+  "learning_rate": 1e-3,
+  "max_iters": 5000,
+  "eval_interval": 250,
+  "device": "auto"
+}
data/config/train_shakespeare_char.json
ADDED
@@ -0,0 +1,14 @@
+{
+  "out_dir": "out-shakespeare-char",
+  "dataset": "shakespeare_char",
+  "batch_size": 64,
+  "block_size": 256,
+  "n_layer": 6,
+  "n_head": 6,
+  "n_embd": 384,
+  "dropout": 0.2,
+  "learning_rate": 1e-3,
+  "max_iters": 5000,
+  "eval_interval": 250,
+  "device": "auto"
+}
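Each of these JSON files can be combined with command-line flags, as the usage comments in bin/train show. A typical run against the character-level config:

```bash
# Train with a config file, overriding one key on the command line
nanogpt train --config=config/train_shakespeare_char.json --max_iters=2000

# Sample from the checkpoint written to out-shakespeare-char
nanogpt sample --dataset=shakespeare_char --out_dir=out-shakespeare-char
```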