nanogpt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: b10e62747a63be10b519fdd8eb1eaabfc63214b01ab553d4230af865a0a8f327
+   data.tar.gz: 69db126e3d02c897045e543981223d1d29d72f7625bd381e3ba4badee88b6d20
+ SHA512:
+   metadata.gz: 97f5846373224b889f22b80302a05271c7ab0276037e519c19c58cc0954aa982a4d3ac9992766aa73f6d1eec1dfe2b96b8a750414dd02818529e37789671cd87
+   data.tar.gz: accd2112564f004cc763dbaac418f35ce24426d4d86e53935a32bdf6411659463ed6cb8e9415146136ee78fa79748e39a1e506b5c44848a7b6e503dfee86e310
data/Gemfile ADDED
@@ -0,0 +1,7 @@
+ # frozen_string_literal: true
+
+ source "https://rubygems.org"
+
+ gemspec
+
+ gem "parquet", "~> 0.5", require: false # For OpenWebText data prep
data/Gemfile.lock ADDED
@@ -0,0 +1,42 @@
+ PATH
+   remote: .
+   specs:
+     nanogpt (0.1.0)
+       numo-narray (~> 0.9)
+       tiktoken_ruby (~> 0.0)
+       torch-rb (~> 0.14)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     diff-lcs (1.6.2)
+     numo-narray (0.9.2.1)
+     parquet (0.7.3-arm64-darwin)
+     rice (4.7.1)
+     rspec (3.13.2)
+       rspec-core (~> 3.13.0)
+       rspec-expectations (~> 3.13.0)
+       rspec-mocks (~> 3.13.0)
+     rspec-core (3.13.6)
+       rspec-support (~> 3.13.0)
+     rspec-expectations (3.13.5)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.13.0)
+     rspec-mocks (3.13.7)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.13.0)
+     rspec-support (3.13.6)
+     tiktoken_ruby (0.0.13-arm64-darwin)
+     torch-rb (0.22.2)
+       rice (>= 4.7)
+
+ PLATFORMS
+   arm64-darwin-24
+
+ DEPENDENCIES
+   nanogpt!
+   parquet (~> 0.5)
+   rspec (~> 3.12)
+
+ BUNDLED WITH
+    2.7.2
data/README.md ADDED
@@ -0,0 +1,102 @@
+ # nanoGPT
+
+ A Ruby port of Karpathy's [nanoGPT](https://github.com/karpathy/nanoGPT). Train GPT-2 style language models from scratch using [torch.rb](https://github.com/ankane/torch.rb).
+
+ Built for Ruby developers who want to understand how LLMs work by building one.
+
+ ## Quick Start
+
+ ```bash
+ gem install nanogpt
+
+ # Prepare the Shakespeare dataset with the character-level tokenizer
+ nanogpt prepare shakespeare_char
+
+ # Train (use MPS on Apple Silicon for a ~17x speedup)
+ nanogpt train --dataset=shakespeare_char --device=mps
+
+ # Generate text
+ nanogpt sample --dataset=shakespeare_char
+ ```
+
+ Or from source:
+
+ ```bash
+ git clone https://github.com/khasinski/nanogpt-rb
+ cd nanogpt-rb
+ bundle install
+
+ # Prepare data
+ bundle exec ruby data/shakespeare_char/prepare.rb
+
+ # Train
+ bundle exec exe/nanogpt train --dataset=shakespeare_char --device=mps
+
+ # Sample
+ bundle exec exe/nanogpt sample --dataset=shakespeare_char
+ ```
+
+ ## Performance (M1 Max)
+
+ Training the default 10.65M-parameter model on Shakespeare:
+
+ | Device | Time/iter | Notes |
+ |--------|-----------|-------|
+ | MPS    | ~500ms    | Recommended for Apple Silicon |
+ | CPU    | ~8,500ms  | ~17x slower |
+
+ After ~2000 iterations (~20 min on MPS), the model generates coherent Shakespeare-like text.
+
+ ## Commands
+
+ ```bash
+ nanogpt train [options]    # Train a model
+ nanogpt sample [options]   # Generate text from a trained model
+ nanogpt bench [options]    # Run performance benchmarks
+ ```
+
+ ### Training Options
+
+ ```bash
+ --dataset=NAME       # Dataset to use (default: shakespeare_char)
+ --device=DEVICE      # cpu or mps; cuda might work too 🤞 (default: auto)
+ --max_iters=N        # Training iterations (default: 5000)
+ --batch_size=N       # Batch size (default: 64)
+ --block_size=N       # Context length (default: 256)
+ --n_layer=N          # Transformer layers (default: 6)
+ --n_head=N           # Attention heads (default: 6)
+ --n_embd=N           # Embedding dimension (default: 384)
+ --learning_rate=F    # Learning rate (default: 1e-3)
+ --config=FILE        # Load settings from a JSON file
+ ```
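+
+ The `--config` flag loads the same settings from a JSON file. For example, the character-level Shakespeare config bundled with the gem looks like this:
+
+ ```json
+ {
+   "out_dir": "out-shakespeare-char",
+   "dataset": "shakespeare_char",
+   "batch_size": 64,
+   "block_size": 256,
+   "n_layer": 6,
+   "n_head": 6,
+   "n_embd": 384,
+   "dropout": 0.2,
+   "learning_rate": 1e-3,
+   "max_iters": 5000,
+   "eval_interval": 250,
+   "device": "auto"
+ }
+ ```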
+
+ ### Sampling Options
+
+ ```bash
+ --dataset=NAME        # Dataset (used to pick the tokenizer)
+ --out_dir=DIR         # Checkpoint directory
+ --num_samples=N       # Number of samples to generate
+ --max_new_tokens=N    # Tokens per sample (default: 500)
+ --temperature=F       # Sampling temperature (default: 0.8)
+ --top_k=N             # Top-k sampling (default: 200)
+ ```
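+
+ Temperature rescales the logits before softmax (lower values make sampling more deterministic), and top-k keeps only the k most likely tokens before sampling. For intuition, here is the idea in plain Ruby on an array of logits; the gem's `model.generate` applies the same two steps to Torch tensors:
+
+ ```ruby
+ # Illustration only, not the gem's implementation.
+ def sample_token(logits, temperature: 0.8, top_k: nil)
+   scaled = logits.map { |l| l / temperature }
+
+   if top_k
+     cutoff = scaled.sort.last(top_k).min                    # smallest logit we keep
+     scaled = scaled.map { |l| l >= cutoff ? l : -Float::INFINITY }
+   end
+
+   max = scaled.max                                          # numerically stable softmax
+   exps = scaled.map { |l| Math.exp(l - max) }
+   probs = exps.map { |e| e / exps.sum }
+
+   r = rand                                                  # sample from the distribution
+   cumulative = 0.0
+   probs.each_with_index do |p, i|
+     cumulative += p
+     return i if r <= cumulative
+   end
+   probs.size - 1
+ end
+
+ sample_token([2.0, 1.0, 0.1], temperature: 0.8, top_k: 2)   # => 0 or 1, never 2
+ ```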
+
+ ## Features
+
+ - Full GPT-2 architecture (attention, MLP, layer norm, embeddings)
+ - MPS (Metal) and CUDA GPU acceleration via torch.rb
+ - Flash attention when dropout=0 (5x faster attention)
+ - Cosine learning rate schedule with warmup
+ - Gradient accumulation for larger effective batch sizes
+ - Checkpointing and resumption
+ - Character-level and GPT-2 BPE tokenizers (the character-level idea is sketched below)
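+
+ The character-level tokenizer is the simpler of the two: every distinct character in the training text gets an integer id. A minimal standalone sketch of the idea (not the gem's actual `NanoGPT::Tokenizer`, which `Tokenizer.for_dataset` picks automatically for each prepared dataset):
+
+ ```ruby
+ class CharTokenizer
+   attr_reader :vocab_size
+
+   def initialize(text)
+     chars = text.chars.uniq.sort
+     @stoi = chars.each_with_index.to_h   # character => integer id
+     @itos = @stoi.invert                 # integer id => character
+     @vocab_size = chars.size
+   end
+
+   def encode(str)
+     str.chars.map { |c| @stoi.fetch(c) }
+   end
+
+   def decode(ids)
+     ids.map { |i| @itos.fetch(i) }.join
+   end
+ end
+
+ tok = CharTokenizer.new("First Citizen: hear me speak.")
+ tok.decode(tok.encode("hear me"))   # => "hear me"
+ ```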
+
+ ## Requirements
+
+ - Ruby >= 3.1
+ - LibTorch (installed automatically with torch-rb)
+ - For MPS: macOS 12.3+ with Apple Silicon
+
+ ## License
+
+ MIT
data/bin/bench ADDED
@@ -0,0 +1,210 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ $stdout.sync = true
+
+ # Benchmark script for measuring training performance
+ # A shorter version of train.rb focused on benchmarking
+ #
+ # Usage:
+ #   bundle exec ruby bin/bench
+ #   bundle exec ruby bin/bench --batch_size=8 --block_size=512
+ #   bundle exec ruby bin/bench --real_data=false  # use random data
+
+ require_relative "../lib/nano_gpt"
+
+ # Default configuration (GPT-2 style model)
+ CONFIG = {
+   batch_size: 12,
+   block_size: 1024,
+   n_layer: 12,
+   n_head: 12,
+   n_embd: 768,
+   dropout: 0.0,
+   bias: false,
+   real_data: true,
+   dataset: "openwebtext",
+   seed: 1337,
+   device: "auto"
+ }.freeze
+
+ def parse_args(args, config)
+   result = config.dup
+
+   args.each do |arg|
+     next unless arg.start_with?("--") && arg.include?("=")
+
+     key, val = arg[2..].split("=", 2)
+     key = key.to_sym
+
+     unless result.key?(key)
+       puts "Warning: Unknown config key: #{key}"
+       next
+     end
+
+     result[key] = case result[key]
+                   when Integer then val.to_i
+                   when Float then val.to_f
+                   when TrueClass, FalseClass then val.downcase == "true"
+                   else val
+                   end
+   end
+
+   result
+ end
+
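+ # Sample batch_size random windows of block_size tokens from the token stream;
+ # the targets y are the same windows shifted right by one token.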
+ def get_batch_from_data(data, batch_size, block_size, device)
+   max_start = data.size - block_size - 1
+   indices = Array.new(batch_size) { rand(0..max_start) }
+
+   x_arrays = []
+   y_arrays = []
+
+   indices.each do |i|
+     x_arrays << data[i, block_size]
+     y_arrays << data[i + 1, block_size]
+   end
+
+   x = Torch.tensor(x_arrays, dtype: :long)
+   y = Torch.tensor(y_arrays, dtype: :long)
+
+   if device != "cpu"
+     x = x.to(device)
+     y = y.to(device)
+   end
+
+   [x, y]
+ end
+
+ def main
+   config = parse_args(ARGV, CONFIG)
+
+   puts "=" * 60
+   puts "NanoGPT Benchmark"
+   puts "=" * 60
+   puts ""
+   puts "Configuration:"
+   puts "  batch_size: #{config[:batch_size]}"
+   puts "  block_size: #{config[:block_size]}"
+   puts "  n_layer: #{config[:n_layer]}"
+   puts "  n_head: #{config[:n_head]}"
+   puts "  n_embd: #{config[:n_embd]}"
+   puts "  real_data: #{config[:real_data]}"
+   puts ""
+
+   # Resolve device
+   if config[:device] == "auto"
+     config[:device] = NanoGPT::Device.auto
+   end
+   device = config[:device]
+   puts "Device: #{device}"
+
+   Torch.manual_seed(config[:seed])
+
+   # Data loading
+   if config[:real_data]
+     data_dir = File.join("data", config[:dataset])
+     train_bin = File.join(data_dir, "train.bin")
+
+     unless File.exist?(train_bin)
+       puts ""
+       puts "Warning: #{train_bin} not found, using random data instead."
+       puts "To use real data, run: bundle exec ruby data/#{config[:dataset]}/prepare.rb"
+       puts ""
+       config[:real_data] = false
+     end
+   end
+
+   if config[:real_data]
+     # Load training data
+     bytes = File.binread(File.join("data", config[:dataset], "train.bin"))
+     train_data = bytes.unpack("S<*") # uint16 little-endian
+     puts "Loaded #{train_data.size} tokens from #{config[:dataset]}"
+
+     get_batch = lambda do
+       get_batch_from_data(train_data, config[:batch_size], config[:block_size], device)
+     end
+   else
+     # Use random data
+     vocab_size = 50304 # GPT-2 vocab size rounded up for efficiency
+     puts "Using random data (vocab_size=#{vocab_size})"
+
+     get_batch = lambda do
+       x = Torch.randint(vocab_size, [config[:batch_size], config[:block_size]], dtype: :long)
+       y = Torch.randint(vocab_size, [config[:batch_size], config[:block_size]], dtype: :long)
+       x = x.to(device) if device != "cpu"
+       y = y.to(device) if device != "cpu"
+       [x, y]
+     end
+   end
+
+   # Model init
+   puts ""
+   puts "Initializing model..."
+   model_config = NanoGPT::GPTConfig.new(
+     block_size: config[:block_size],
+     vocab_size: 50304, # GPT-2 vocab rounded up
+     n_layer: config[:n_layer],
+     n_head: config[:n_head],
+     n_embd: config[:n_embd],
+     dropout: config[:dropout],
+     bias: config[:bias]
+   )
+
+   model = NanoGPT::GPT.new(model_config)
+   model.to(device) if device != "cpu"
+
+   # Optimizer
+   optimizer = model.configure_optimizers(
+     weight_decay: 1e-2,
+     learning_rate: 1e-4,
+     betas: [0.9, 0.95],
+     device_type: NanoGPT::Device.type(device)
+   )
+
+   puts ""
+   puts "Starting benchmark..."
+   puts "-" * 60
+
+   # Benchmark: burn-in phase then measurement phase
+   [{ name: "burn-in", steps: 10 }, { name: "benchmark", steps: 20 }].each do |phase|
+     puts ""
+     puts "Phase: #{phase[:name]} (#{phase[:steps]} steps)"
+
+     x, y = get_batch.call
+     t0 = Time.now
+
+     phase[:steps].times do |k|
+       # Forward pass
+       _logits, loss = model.forward(x, targets: y)
+
+       # Get next batch
+       x, y = get_batch.call
+
+       # Backward pass
+       optimizer.zero_grad
+       loss.backward
+       optimizer.step
+
+       loss_val = loss.item
+       puts "  #{k}/#{phase[:steps]} loss: #{format('%.4f', loss_val)}"
+     end
+
+     t1 = Time.now
+     dt = t1 - t0
+
+     if phase[:name] == "benchmark"
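+       # MFU (model FLOPs utilization): achieved FLOPs/s as a fraction of a
+       # reference peak (upstream nanoGPT measures against the A100 bfloat16 peak).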
+       mfu = model.estimate_mfu(config[:batch_size] * phase[:steps], dt)
+       time_per_iter = dt / phase[:steps] * 1000
+
+       puts ""
+       puts "=" * 60
+       puts "Results:"
+       puts "  Time per iteration: #{format('%.2f', time_per_iter)}ms"
+       puts "  MFU: #{format('%.2f', mfu * 100)}%"
+       puts "=" * 60
+     end
+   end
+ end
+
+ main
data/bin/sample ADDED
@@ -0,0 +1,76 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ # Generate text from a trained model
+ # Usage:
+ #   bundle exec ruby bin/sample [options]
+ #   bundle exec ruby bin/sample --config=config/sample.json
+ #   bundle exec ruby bin/sample --out_dir=out-shakespeare --num_samples=3
+
+ require_relative "../lib/nano_gpt"
+
+ def main
+   config = NanoGPT::SampleConfig.load(ARGV)
+
+   # Resolve device (auto-detect if "auto")
+   if config[:device] == "auto"
+     config[:device] = NanoGPT::Device.auto
+     puts "Auto-detected device: #{config[:device]}"
+   end
+   device = config[:device]
+
+   Torch.manual_seed(config[:seed])
+
+   # Load checkpoint
+   ckpt_path = File.join(config[:out_dir], "ckpt.pt")
+   unless File.exist?(ckpt_path)
+     puts "Error: Checkpoint not found at #{ckpt_path}"
+     puts "Train a model first with: bundle exec ruby bin/train"
+     exit 1
+   end
+
+   puts "Loading checkpoint from #{ckpt_path}..."
+   checkpoint = Torch.load(ckpt_path)
+
+   # Recreate model from checkpoint
+   model_args = checkpoint["model_args"].transform_keys(&:to_sym)
+   model_config = NanoGPT::GPTConfig.new(**model_args)
+   model = NanoGPT::GPT.new(model_config)
+   model.load_state_dict(checkpoint["model"])
+   model.to(device) if device != "cpu"
+   model.eval
+
+   # Load tokenizer (auto-detects character-level vs GPT-2 BPE)
+   dataset_dir = File.join("data", config[:dataset])
+   tokenizer = NanoGPT::Tokenizer.for_dataset(dataset_dir)
+   puts "number of parameters: #{model.num_params / 1e6}M"
+
+   # Handle start text
+   start_text = config[:start]
+   if start_text.start_with?("FILE:")
+     start_text = File.read(start_text[5..])
+   end
+
+   # Encode starting prompt
+   start_ids = tokenizer.encode(start_text)
+   x = Torch.tensor([start_ids], dtype: :long, device: device)
+
+   puts "Generating #{config[:num_samples]} samples..."
+   puts "=" * 50
+
+   # Generate samples
+   config[:num_samples].times do
+     y = model.generate(
+       x,
+       config[:max_new_tokens],
+       temperature: config[:temperature],
+       top_k: config[:top_k]
+     )
+
+     output = tokenizer.decode(y[0].to_a)
+     puts output
+     puts "-" * 50
+   end
+ end
+
+ main
data/bin/train ADDED
@@ -0,0 +1,82 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ $stdout.sync = true
+
+ # Train a GPT model
+ # Usage:
+ #   bundle exec ruby bin/train [options]
+ #   bundle exec ruby bin/train --config=config/train_shakespeare.json
+ #   bundle exec ruby bin/train --config=config/train_gpt2.json --max_iters=1000
+
+ require_relative "../lib/nano_gpt"
+
+ def main
+   config = NanoGPT::TrainConfig.load(ARGV)
+
+   # Resolve device (auto-detect if "auto")
+   if config[:device] == "auto"
+     config[:device] = NanoGPT::Device.auto
+     puts "Auto-detected device: #{config[:device]}"
+   end
+
+   # Load tokenizer to get vocab_size (auto-detects char-level vs GPT-2 BPE)
+   data_dir = File.join("data", config[:dataset])
+   train_bin = File.join(data_dir, "train.bin")
+
+   unless File.exist?(train_bin)
+     puts "Error: #{train_bin} not found. Run the data preparation script first:"
+     puts "  bundle exec ruby data/#{config[:dataset]}/prepare.rb"
+     exit 1
+   end
+
+   tokenizer = NanoGPT::Tokenizer.for_dataset(data_dir)
+   tokenizer_type = tokenizer.is_a?(NanoGPT::GPT2Tokenizer) ? "GPT-2 BPE" : "character-level"
+   puts "Loaded #{tokenizer_type} tokenizer with vocab_size=#{tokenizer.vocab_size}"
+
+   # Create model config
+   model_config = NanoGPT::GPTConfig.new(
+     block_size: config[:block_size],
+     vocab_size: tokenizer.vocab_size,
+     n_layer: config[:n_layer],
+     n_head: config[:n_head],
+     n_embd: config[:n_embd],
+     dropout: config[:dropout],
+     bias: config[:bias]
+   )
+
+   # Create model
+   model = NanoGPT::GPT.new(model_config)
+
+   # Move model to device (MPS, CUDA, or CPU)
+   device = config[:device]
+   if device != "cpu"
+     model.to(device)
+     puts "Model moved to #{device}"
+   end
+
+   # Create data loader
+   data_loader = NanoGPT::DataLoader.new(
+     data_dir: data_dir,
+     block_size: config[:block_size],
+     batch_size: config[:batch_size],
+     device: config[:device]
+   )
+
+   puts "Train data: #{data_loader.train_size} tokens"
+   puts "Val data: #{data_loader.val_size} tokens"
+
+   # Create trainer
+   trainer = NanoGPT::Trainer.new(
+     model: model,
+     data_loader: data_loader,
+     config: config.to_h
+   )
+
+   # Train!
+   trainer.train
+
+   puts "\nTraining complete! Checkpoint saved to #{config[:out_dir]}/ckpt.pt"
+ end
+
+ main
data/config/train_gpt2.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "out_dir": "out-gpt2",
+   "dataset": "openwebtext",
+   "batch_size": 12,
+   "block_size": 1024,
+   "n_layer": 12,
+   "n_head": 12,
+   "n_embd": 768,
+   "dropout": 0.0,
+   "bias": true,
+   "learning_rate": 6e-4,
+   "max_iters": 600000,
+   "warmup_iters": 2000,
+   "lr_decay_iters": 600000,
+   "min_lr": 6e-5,
+   "eval_interval": 1000,
+   "eval_iters": 200,
+   "device": "auto"
+ }
data/config/train_shakespeare.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "out_dir": "out-shakespeare",
+   "dataset": "shakespeare",
+   "batch_size": 64,
+   "block_size": 256,
+   "n_layer": 6,
+   "n_head": 6,
+   "n_embd": 384,
+   "dropout": 0.2,
+   "learning_rate": 1e-3,
+   "max_iters": 5000,
+   "eval_interval": 250,
+   "device": "auto"
+ }
data/config/train_shakespeare_char.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "out_dir": "out-shakespeare-char",
+   "dataset": "shakespeare_char",
+   "batch_size": 64,
+   "block_size": 256,
+   "n_layer": 6,
+   "n_head": 6,
+   "n_embd": 384,
+   "dropout": 0.2,
+   "learning_rate": 1e-3,
+   "max_iters": 5000,
+   "eval_interval": 250,
+   "device": "auto"
+ }