nanogpt 0.1.0
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +42 -0
- data/README.md +102 -0
- data/bin/bench +210 -0
- data/bin/sample +76 -0
- data/bin/train +82 -0
- data/config/train_gpt2.json +19 -0
- data/config/train_shakespeare.json +14 -0
- data/config/train_shakespeare_char.json +14 -0
- data/data/openwebtext/prepare.rb +287 -0
- data/data/shakespeare/prepare.rb +61 -0
- data/data/shakespeare_char/input.txt +40000 -0
- data/data/shakespeare_char/prepare.rb +73 -0
- data/exe/nanogpt +338 -0
- data/lib/nano_gpt/config.rb +42 -0
- data/lib/nano_gpt/data_loader.rb +74 -0
- data/lib/nano_gpt/device.rb +56 -0
- data/lib/nano_gpt/layers/block.rb +25 -0
- data/lib/nano_gpt/layers/causal_self_attention.rb +73 -0
- data/lib/nano_gpt/layers/layer_norm.rb +21 -0
- data/lib/nano_gpt/layers/mlp.rb +23 -0
- data/lib/nano_gpt/lr_scheduler.rb +42 -0
- data/lib/nano_gpt/model.rb +218 -0
- data/lib/nano_gpt/tokenizer.rb +106 -0
- data/lib/nano_gpt/train_config.rb +259 -0
- data/lib/nano_gpt/trainer.rb +221 -0
- data/lib/nano_gpt/version.rb +5 -0
- data/lib/nano_gpt.rb +18 -0
- data/nanogpt.gemspec +37 -0
- metadata +133 -0
data/data/shakespeare_char/prepare.rb
ADDED
@@ -0,0 +1,73 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Prepare the Shakespeare dataset for character-level language modeling.
+# Downloads the tiny shakespeare dataset and creates train.bin, val.bin, and meta.json
+
+require "net/http"
+require "openssl"
+require "numo/narray"
+require "json"
+
+DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
+SCRIPT_DIR = File.dirname(__FILE__)
+
+def download_file(url)
+  uri = URI(url)
+  http = Net::HTTP.new(uri.host, uri.port)
+  http.use_ssl = true
+  http.verify_mode = OpenSSL::SSL::VERIFY_NONE # GitHub CDN has CRL issues
+  response = http.get(uri.request_uri)
+  response.body
+end
+
+# Download data if not exists
+input_path = File.join(SCRIPT_DIR, "input.txt")
+unless File.exist?(input_path)
+  puts "Downloading tiny shakespeare..."
+  data = download_file(DATA_URL)
+  File.write(input_path, data)
+end
+
+data = File.read(input_path)
+puts "Length of dataset in characters: #{data.length}"
+
+# Build vocabulary from all unique characters
+chars = data.chars.uniq.sort
+vocab_size = chars.size
+puts "All unique characters: #{chars.join.inspect}"
+puts "Vocab size: #{vocab_size}"
+
+# Create mappings
+stoi = chars.each_with_index.to_h
+itos = chars.each_with_index.map { |c, i| [i, c] }.to_h
+
+# Encode function
+encode = ->(s) { s.chars.map { |c| stoi[c] } }
+
+# Train/val split (90/10)
+n = data.length
+train_data = data[0...(n * 0.9).to_i]
+val_data = data[(n * 0.9).to_i..]
+
+# Encode to integers
+train_ids = encode.call(train_data)
+val_ids = encode.call(val_data)
+puts "Train has #{train_ids.length} tokens"
+puts "Val has #{val_ids.length} tokens"
+
+# Export to binary files (uint16)
+train_arr = Numo::UInt16.cast(train_ids)
+val_arr = Numo::UInt16.cast(val_ids)
+File.binwrite(File.join(SCRIPT_DIR, "train.bin"), train_arr.to_binary)
+File.binwrite(File.join(SCRIPT_DIR, "val.bin"), val_arr.to_binary)
+
+# Save meta information as JSON
+meta = {
+  "vocab_size" => vocab_size,
+  "itos" => itos.transform_keys(&:to_s),
+  "stoi" => stoi
+}
+File.write(File.join(SCRIPT_DIR, "meta.json"), JSON.pretty_generate(meta))
+
+puts "Done! Created train.bin, val.bin, and meta.json"
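As a sanity check, here is a short sketch (not part of the gem) that reads the artifacts back: train.bin and val.bin are raw little-endian uint16 token streams, and meta.json carries the vocab size plus both mappings. It assumes it is run from data/data/shakespeare_char/ after prepare.rb finishes.

    # Hypothetical read-back of prepare.rb's output (illustrative only)
    require "json"

    meta = JSON.parse(File.read("meta.json"))
    itos = meta["itos"] # JSON object keys are strings: itos["0"], itos["1"], ...

    tokens = File.binread("train.bin").unpack("S<*") # uint16, little-endian
    puts "vocab_size: #{meta['vocab_size']}, train tokens: #{tokens.size}"
    puts tokens.first(80).map { |t| itos[t.to_s] }.join # decoded text preview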
data/exe/nanogpt
ADDED
@@ -0,0 +1,338 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+$stdout.sync = true
+
+require "nano_gpt"
+
+class NanoGPTCLI
+  COMMANDS = %w[train sample bench version help].freeze
+
+  def initialize(args)
+    @command = args.shift
+    @args = args
+  end
+
+  def run
+    case @command
+    when "train"
+      train
+    when "sample"
+      sample
+    when "bench"
+      bench
+    when "version", "-v", "--version"
+      version
+    when "help", "-h", "--help", nil
+      help
+    else
+      puts "Unknown command: #{@command}"
+      puts ""
+      help
+      exit 1
+    end
+  end
+
+  private
+
+  def train
+    config = NanoGPT::TrainConfig.load(@args)
+
+    if config[:device] == "auto"
+      config[:device] = NanoGPT::Device.auto
+      puts "Auto-detected device: #{config[:device]}"
+    end
+
+    data_dir = File.join("data", config[:dataset])
+    train_bin = File.join(data_dir, "train.bin")
+
+    unless File.exist?(train_bin)
+      puts "Error: #{train_bin} not found. Run the data preparation script first:"
+      puts "  bundle exec ruby data/#{config[:dataset]}/prepare.rb"
+      exit 1
+    end
+
+    tokenizer = NanoGPT::Tokenizer.for_dataset(data_dir)
+    tokenizer_type = tokenizer.is_a?(NanoGPT::GPT2Tokenizer) ? "GPT-2 BPE" : "character-level"
+    puts "Loaded #{tokenizer_type} tokenizer with vocab_size=#{tokenizer.vocab_size}"
+
+    model_config = NanoGPT::GPTConfig.new(
+      block_size: config[:block_size],
+      vocab_size: tokenizer.vocab_size,
+      n_layer: config[:n_layer],
+      n_head: config[:n_head],
+      n_embd: config[:n_embd],
+      dropout: config[:dropout],
+      bias: config[:bias]
+    )
+
+    model = NanoGPT::GPT.new(model_config)
+
+    device = config[:device]
+    if device != "cpu"
+      model.to(device)
+      puts "Model moved to #{device}"
+    end
+
+    data_loader = NanoGPT::DataLoader.new(
+      data_dir: data_dir,
+      block_size: config[:block_size],
+      batch_size: config[:batch_size],
+      device: config[:device]
+    )
+
+    puts "Train data: #{data_loader.train_size} tokens"
+    puts "Val data: #{data_loader.val_size} tokens"
+
+    trainer = NanoGPT::Trainer.new(
+      model: model,
+      data_loader: data_loader,
+      config: config.to_h
+    )
+
+    trainer.train
+
+    puts "\nTraining complete! Checkpoint saved to #{config[:out_dir]}/ckpt.pt"
+  end
+
+  def sample
+    config = NanoGPT::SampleConfig.load(@args)
+
+    if config[:device] == "auto"
+      config[:device] = NanoGPT::Device.auto
+      puts "Auto-detected device: #{config[:device]}"
+    end
+    device = config[:device]
+
+    Torch.manual_seed(config[:seed])
+
+    ckpt_path = File.join(config[:out_dir], "ckpt.pt")
+    unless File.exist?(ckpt_path)
+      puts "Error: Checkpoint not found at #{ckpt_path}"
+      puts "Train a model first with: nanogpt train"
+      exit 1
+    end
+
+    puts "Loading checkpoint from #{ckpt_path}..."
+    checkpoint = Torch.load(ckpt_path)
+
+    model_args = checkpoint["model_args"].transform_keys(&:to_sym)
+    model_config = NanoGPT::GPTConfig.new(**model_args)
+    model = NanoGPT::GPT.new(model_config)
+    model.load_state_dict(checkpoint["model"])
+    model.to(device) if device != "cpu"
+    model.eval
+
+    dataset_dir = File.join("data", config[:dataset])
+    tokenizer = NanoGPT::Tokenizer.for_dataset(dataset_dir)
+    puts "number of parameters: #{model.num_params / 1e6}M"
+
+    start_text = config[:start]
+    if start_text.start_with?("FILE:")
+      start_text = File.read(start_text[5..])
+    end
+
+    start_ids = tokenizer.encode(start_text)
+    x = Torch.tensor([start_ids], dtype: :long, device: device)
+
+    puts "Generating #{config[:num_samples]} samples..."
+    puts "=" * 50
+
+    config[:num_samples].times do |k|
+      y = model.generate(
+        x,
+        config[:max_new_tokens],
+        temperature: config[:temperature],
+        top_k: config[:top_k]
+      )
+
+      output = tokenizer.decode(y[0].to_a)
+      puts output
+      puts "-" * 50
+    end
+  end

+  def bench
+    bench_config = {
+      batch_size: 12,
+      block_size: 1024,
+      n_layer: 12,
+      n_head: 12,
+      n_embd: 768,
+      dropout: 0.0,
+      bias: false,
+      real_data: true,
+      dataset: "openwebtext",
+      seed: 1337,
+      device: "auto"
+    }
+
+    # Parse args
+    @args.each do |arg|
+      next unless arg.start_with?("--") && arg.include?("=")
+
+      key, val = arg[2..].split("=", 2)
+      key = key.to_sym
+
+      next unless bench_config.key?(key)
+
+      bench_config[key] = case bench_config[key]
+                          when Integer then val.to_i
+                          when Float then val.to_f
+                          when TrueClass, FalseClass then val.downcase == "true"
+                          else val
+                          end
+    end
+
+    puts "=" * 60
+    puts "NanoGPT Benchmark"
+    puts "=" * 60
+    puts ""
+    puts "Configuration:"
+    puts "  batch_size: #{bench_config[:batch_size]}"
+    puts "  block_size: #{bench_config[:block_size]}"
+    puts "  n_layer: #{bench_config[:n_layer]}"
+    puts "  n_head: #{bench_config[:n_head]}"
+    puts "  n_embd: #{bench_config[:n_embd]}"
+    puts "  real_data: #{bench_config[:real_data]}"
+    puts ""
+
+    if bench_config[:device] == "auto"
+      bench_config[:device] = NanoGPT::Device.auto
+    end
+    device = bench_config[:device]
+    puts "Device: #{device}"
+
+    Torch.manual_seed(bench_config[:seed])
+
+    if bench_config[:real_data]
+      data_dir = File.join("data", bench_config[:dataset])
+      train_bin = File.join(data_dir, "train.bin")
+
+      unless File.exist?(train_bin)
+        puts ""
+        puts "Warning: #{train_bin} not found, using random data instead."
+        puts "To use real data, run: bundle exec ruby data/#{bench_config[:dataset]}/prepare.rb"
+        puts ""
+        bench_config[:real_data] = false
+      end
+    end
+
+    if bench_config[:real_data]
+      bytes = File.binread(File.join("data", bench_config[:dataset], "train.bin"))
+      train_data = bytes.unpack("S<*")
+      puts "Loaded #{train_data.size} tokens from #{bench_config[:dataset]}"
+
+      get_batch = lambda do
+        max_start = train_data.size - bench_config[:block_size] - 1
+        indices = Array.new(bench_config[:batch_size]) { rand(0..max_start) }
+        x_arrays = indices.map { |i| train_data[i, bench_config[:block_size]] }
+        y_arrays = indices.map { |i| train_data[i + 1, bench_config[:block_size]] }
+        x = Torch.tensor(x_arrays, dtype: :long)
+        y = Torch.tensor(y_arrays, dtype: :long)
+        x = x.to(device) if device != "cpu"
+        y = y.to(device) if device != "cpu"
+        [x, y]
+      end
+    else
+      vocab_size = 50304
+      puts "Using random data (vocab_size=#{vocab_size})"
+
+      get_batch = lambda do
+        x = Torch.randint(vocab_size, [bench_config[:batch_size], bench_config[:block_size]], dtype: :long)
+        y = Torch.randint(vocab_size, [bench_config[:batch_size], bench_config[:block_size]], dtype: :long)
+        x = x.to(device) if device != "cpu"
+        y = y.to(device) if device != "cpu"
+        [x, y]
+      end
+    end
+
+    puts ""
+    puts "Initializing model..."
+    model_config = NanoGPT::GPTConfig.new(
+      block_size: bench_config[:block_size],
+      vocab_size: 50304,
+      n_layer: bench_config[:n_layer],
+      n_head: bench_config[:n_head],
+      n_embd: bench_config[:n_embd],
+      dropout: bench_config[:dropout],
+      bias: bench_config[:bias]
+    )
+
+    model = NanoGPT::GPT.new(model_config)
+    model.to(device) if device != "cpu"
+
+    optimizer = model.configure_optimizers(
+      weight_decay: 1e-2,
+      learning_rate: 1e-4,
+      betas: [0.9, 0.95],
+      device_type: NanoGPT::Device.type(device)
+    )
+
+    puts ""
+    puts "Starting benchmark..."
+    puts "-" * 60
+
+    [{ name: "burn-in", steps: 10 }, { name: "benchmark", steps: 20 }].each do |phase|
+      puts ""
+      puts "Phase: #{phase[:name]} (#{phase[:steps]} steps)"
+
+      x, y = get_batch.call
+      t0 = Time.now
+
+      phase[:steps].times do |k|
+        _logits, loss = model.forward(x, targets: y)
+        x, y = get_batch.call
+        optimizer.zero_grad
+        loss.backward
+        optimizer.step
+        loss_val = loss.item
+        puts "  #{k}/#{phase[:steps]} loss: #{format('%.4f', loss_val)}"
+      end
+
+      t1 = Time.now
+      dt = t1 - t0
+
+      if phase[:name] == "benchmark"
+        mfu = model.estimate_mfu(bench_config[:batch_size] * phase[:steps], dt)
+        time_per_iter = dt / phase[:steps] * 1000
+
+        puts ""
+        puts "=" * 60
+        puts "Results:"
+        puts "  Time per iteration: #{format('%.2f', time_per_iter)}ms"
+        puts "  MFU: #{format('%.2f', mfu * 100)}%"
+        puts "=" * 60
+      end
+    end
+  end
+
+  def version
+    puts "nanogpt #{NanoGPT::VERSION}"
+  end
+
+  def help
+    puts <<~HELP
+      nanogpt - A Ruby port of Karpathy's nanoGPT
+
+      Usage: nanogpt <command> [options]
+
+      Commands:
+        train      Train a GPT model
+        sample     Generate text from a trained model
+        bench      Run performance benchmarks
+        version    Show version
+        help       Show this help message
+
+      Examples:
+        nanogpt train --config=config/train_shakespeare_char.json
+        nanogpt train --dataset=shakespeare_char --max_iters=1000
+        nanogpt sample --out_dir=out-shakespeare-char --num_samples=3
+        nanogpt bench --batch_size=8 --block_size=512
+
+      For more information, visit: https://github.com/khasinski/nanogpt-rb
+    HELP
+  end
+end
+
+NanoGPTCLI.new(ARGV).run
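The bench command accepts `--key=value` overrides and coerces each value to the type of its default: Integer and Float defaults via to_i/to_f, booleans by comparing the value against "true", and everything else kept as a string. A standalone sketch of that rule, using a made-up defaults hash:

    # Sketch of bench's override coercion (this defaults hash is hypothetical)
    defaults = { batch_size: 12, dropout: 0.0, real_data: true, dataset: "openwebtext" }

    ["--batch_size=8", "--real_data=false", "--dataset=shakespeare_char"].each do |arg|
      next unless arg.start_with?("--") && arg.include?("=")

      key, val = arg[2..].split("=", 2)
      key = key.to_sym
      next unless defaults.key?(key)

      defaults[key] = case defaults[key]
                      when Integer then val.to_i
                      when Float then val.to_f
                      when TrueClass, FalseClass then val.downcase == "true"
                      else val
                      end
    end
    p defaults # batch_size becomes 8 (Integer), real_data false, dataset a String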
data/lib/nano_gpt/config.rb
ADDED
@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+
+module NanoGPT
+  # Configuration for GPT model architecture
+  class GPTConfig
+    attr_accessor :block_size, :vocab_size, :n_layer, :n_head, :n_embd, :dropout, :bias
+
+    def initialize(
+      block_size: 1024,
+      vocab_size: 50304,
+      n_layer: 12,
+      n_head: 12,
+      n_embd: 768,
+      dropout: 0.0,
+      bias: true
+    )
+      @block_size = block_size
+      @vocab_size = vocab_size
+      @n_layer = n_layer
+      @n_head = n_head
+      @n_embd = n_embd
+      @dropout = dropout
+      @bias = bias
+    end
+
+    def to_h
+      {
+        block_size: @block_size,
+        vocab_size: @vocab_size,
+        n_layer: @n_layer,
+        n_head: @n_head,
+        n_embd: @n_embd,
+        dropout: @dropout,
+        bias: @bias
+      }
+    end
+
+    def head_size
+      @n_embd / @n_head
+    end
+  end
+end
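A quick sketch of GPTConfig in use; the defaults match GPT-2 small (12 layers, 12 heads, 768 dims), and the smaller values below are illustrative:

    require "nano_gpt"

    default_config = NanoGPT::GPTConfig.new
    tiny = NanoGPT::GPTConfig.new(block_size: 256, vocab_size: 65,
                                  n_layer: 6, n_head: 6, n_embd: 384, dropout: 0.2)
    puts tiny.head_size                   # => 64 (n_embd / n_head)
    puts default_config.to_h[:vocab_size] # => 50304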
data/lib/nano_gpt/data_loader.rb
ADDED
@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+
+module NanoGPT
+  # Loads batches from binary token files
+  # Memory-efficient: reads from file each batch (like Python's memmap recreation)
+  class DataLoader
+    attr_reader :block_size, :batch_size
+
+    BYTES_PER_TOKEN = 2 # uint16
+
+    def initialize(data_dir:, block_size:, batch_size:, device: "cpu")
+      @data_dir = data_dir
+      @block_size = block_size
+      @batch_size = batch_size
+      @device = device
+
+      # Store file paths and sizes (NOT the data itself)
+      @train_path = File.join(data_dir, "train.bin")
+      @val_path = File.join(data_dir, "val.bin")
+
+      @train_size = File.size(@train_path) / BYTES_PER_TOKEN
+      @val_size = File.size(@val_path) / BYTES_PER_TOKEN
+    end
+
+    def train_size
+      @train_size
+    end
+
+    def val_size
+      @val_size
+    end
+
+    # Get a batch of data
+    # Memory-efficient: recreates data view per batch to avoid memory leak
+    # (matches Python's memmap recreation pattern)
+    def get_batch(split)
+      path = split == :train ? @train_path : @val_path
+      data_size = split == :train ? @train_size : @val_size
+
+      # Random starting indices
+      max_start = data_size - @block_size - 1
+      indices = Array.new(@batch_size) { rand(0..max_start) }
+
+      # Read only the bytes we need from file (memory-efficient)
+      # This mimics Python's memmap recreation per batch
+      x_arrays = []
+      y_arrays = []
+
+      File.open(path, "rb") do |f|
+        indices.each do |i|
+          # Read x: tokens[i:i+block_size]
+          f.seek(i * BYTES_PER_TOKEN)
+          x_bytes = f.read((@block_size + 1) * BYTES_PER_TOKEN)
+          tokens = x_bytes.unpack("S<*") # uint16 little-endian
+
+          x_arrays << tokens[0...@block_size]
+          y_arrays << tokens[1..@block_size]
+        end
+      end
+
+      # Create tensors directly from arrays (avoiding Numo intermediate)
+      x = Torch.tensor(x_arrays, dtype: :long)
+      y = Torch.tensor(y_arrays, dtype: :long)
+
+      # Move to device (CPU, CUDA, or MPS)
+      if @device != "cpu"
+        x = x.to(@device)
+        y = y.to(@device)
+      end
+
+      [x, y]
+    end
+  end
+end
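A minimal usage sketch, assuming a prepared dataset at data/shakespeare_char (train.bin and val.bin already on disk):

    require "nano_gpt"

    loader = NanoGPT::DataLoader.new(data_dir: "data/shakespeare_char",
                                     block_size: 256, batch_size: 4)
    x, y = loader.get_batch(:train)
    puts x.shape.inspect # [4, 256]; y holds the same windows shifted by one token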
data/lib/nano_gpt/device.rb
ADDED
@@ -0,0 +1,56 @@
+# frozen_string_literal: true
+
+module NanoGPT
+  # Device detection and management
+  module Device
+    class << self
+      # Auto-detect the best available device
+      # Priority: CUDA > MPS > CPU
+      def auto
+        return "cuda" if cuda_available?
+        return "mps" if mps_available?
+
+        "cpu"
+      end
+
+      # Check if CUDA is available
+      def cuda_available?
+        Torch::CUDA.available?
+      rescue StandardError
+        false
+      end
+
+      # Check if MPS (Metal Performance Shaders) is available
+      # MPS is Apple Silicon GPU acceleration
+      def mps_available?
+        # Try to create a tensor on MPS device
+        Torch.tensor([1.0], device: "mps")
+        true
+      rescue StandardError
+        false
+      end
+
+      # Get device type string (for optimizer configuration, etc.)
+      def type(device)
+        case device.to_s
+        when /cuda/ then "cuda"
+        when /mps/ then "mps"
+        else "cpu"
+        end
+      end
+
+      # Check if device is GPU (CUDA or MPS)
+      def gpu?(device)
+        %w[cuda mps].include?(type(device))
+      end
+
+      # Print device info
+      def info
+        puts "Device detection:"
+        puts "  CUDA available: #{cuda_available?}"
+        puts "  MPS available: #{mps_available?}"
+        puts "  Auto-selected: #{auto}"
+      end
+    end
+  end
+end
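Typical use, sketched; the fallback chain means the same call works on CUDA machines, Apple Silicon, and plain CPUs:

    require "nano_gpt"

    device = NanoGPT::Device.auto     # "cuda", "mps", or "cpu"
    puts NanoGPT::Device.type(device) # regex match also maps e.g. "cuda:0" to "cuda"
    puts NanoGPT::Device.gpu?(device) # true for cuda/mps
    NanoGPT::Device.info              # prints the detection summary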
data/lib/nano_gpt/layers/block.rb
ADDED
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+module NanoGPT
+  module Layers
+    # Transformer block: LayerNorm -> Attention -> LayerNorm -> MLP
+    class Block < Torch::NN::Module
+      def initialize(config)
+        super()
+        @ln_1 = LayerNorm.new(config.n_embd, bias: config.bias)
+        @attn = CausalSelfAttention.new(config)
+        @ln_2 = LayerNorm.new(config.n_embd, bias: config.bias)
+        @mlp = MLP.new(config)
+      end
+
+      def forward(x)
+        x = x + @attn.call(@ln_1.call(x))
+        x = x + @mlp.call(@ln_2.call(x))
+        # Trigger GC to free intermediate tensors (critical for torch.rb memory management)
+        # Ruby's GC doesn't run frequently enough during forward pass, causing memory accumulation
+        GC.start(full_mark: false, immediate_sweep: true)
+        x
+      end
+    end
+  end
+end
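A sketch of one block's forward pass on random input; it assumes require "nano_gpt" loads torch.rb and the layer classes, and the sizes are illustrative:

    require "nano_gpt"

    config = NanoGPT::GPTConfig.new(n_layer: 1, n_head: 4, n_embd: 64,
                                    block_size: 32, vocab_size: 65, dropout: 0.0)
    block = NanoGPT::Layers::Block.new(config)
    x = Torch.randn(2, 32, 64)       # (batch, time, channels)
    puts block.call(x).shape.inspect # [2, 32, 64]: residual adds preserve shape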
data/lib/nano_gpt/layers/causal_self_attention.rb
ADDED
@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+
+module NanoGPT
+  module Layers
+    # Multi-head causal self-attention
+    class CausalSelfAttention < Torch::NN::Module
+      def initialize(config)
+        super()
+        raise ArgumentError, "n_embd must be divisible by n_head" unless (config.n_embd % config.n_head).zero?
+
+        @n_head = config.n_head
+        @n_embd = config.n_embd
+        @head_size = config.n_embd / config.n_head
+        @dropout_p = config.dropout
+
+        # Key, query, value projections for all heads, combined
+        @c_attn = Torch::NN::Linear.new(config.n_embd, 3 * config.n_embd, bias: config.bias)
+        # Output projection
+        @c_proj = Torch::NN::Linear.new(config.n_embd, config.n_embd, bias: config.bias)
+        # Regularization
+        @attn_dropout = Torch::NN::Dropout.new(p: config.dropout)
+        @resid_dropout = Torch::NN::Dropout.new(p: config.dropout)
+
+        # Use native scaled_dot_product_attention with is_causal=true when dropout is 0
+        # Native SDPA is ~5x faster but doesn't support dropout with is_causal mode
+        @flash = config.dropout == 0.0
+
+        # Causal mask for manual attention (only used when @flash is false)
+        unless @flash
+          mask = Torch.tril(Torch.ones(config.block_size, config.block_size))
+          register_buffer("mask", mask.view(1, 1, config.block_size, config.block_size))
+        end
+      end
+
+      def forward(x)
+        b, t, c = x.shape
+
+        # Calculate Q, K, V
+        qkv = @c_attn.call(x)
+        q, k, v = qkv.split(@n_embd, 2)
+
+        # Reshape: (B, T, C) -> (B, nh, T, hs)
+        q = q.view(b, t, @n_head, @head_size).transpose(1, 2)
+        k = k.view(b, t, @n_head, @head_size).transpose(1, 2)
+        v = v.view(b, t, @n_head, @head_size).transpose(1, 2)
+
+        y = if @flash
+              # Native scaled_dot_product_attention with is_causal=true
+              # Uses Flash Attention on CUDA, optimized kernel on MPS
+              Torch::NN.scaled_dot_product_attention(q, k, v, nil, 0.0, true)
+            else
+              # Manual attention implementation with causal mask
+              scale = 1.0 / Math.sqrt(@head_size)
+              att = q.matmul(k.transpose(-2, -1))
+              att.mul!(scale)
+
+              # Apply causal mask - slice mask to current sequence length
+              mask_slice = @mask.narrow(2, 0, t).narrow(3, 0, t)
+              att.masked_fill!(mask_slice.eq(0), -Float::INFINITY)
+              att = Torch::NN::Functional.softmax(att, dim: -1)
+              att = @attn_dropout.call(att)
+              att.matmul(v)
+            end
+
+        # Reassemble heads: (B, nh, T, hs) -> (B, T, C)
+        y = y.transpose(1, 2).contiguous.view(b, t, c)
+
+        # Output projection
+        @resid_dropout.call(@c_proj.call(y))
+      end
+    end
+  end
+end
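A sketch tracing the shapes through the layer for batch 2, sequence 32, n_embd 64, n_head 4 (so head_size 16); the sizes are illustrative:

    require "nano_gpt"

    config = NanoGPT::GPTConfig.new(n_head: 4, n_embd: 64, block_size: 32,
                                    vocab_size: 65, dropout: 0.0) # dropout 0 selects the flash path
    attn = NanoGPT::Layers::CausalSelfAttention.new(config)
    x = Torch.randn(2, 32, 64)
    puts attn.call(x).shape.inspect # [2, 32, 64]
    # Inside: qkv is (2, 32, 192), split into three (2, 32, 64) tensors,
    # each viewed as (2, 32, 4, 16) and transposed to (2, 4, 32, 16).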
data/lib/nano_gpt/layers/layer_norm.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+module NanoGPT
+  module Layers
+    # LayerNorm with optional bias (PyTorch doesn't support bias=false directly)
+    class LayerNorm < Torch::NN::Module
+      attr_reader :weight, :bias
+
+      def initialize(ndim, bias: true)
+        super()
+        @ndim = ndim
+        @weight = Torch::NN::Parameter.new(Torch.ones(ndim))
+        @bias = bias ? Torch::NN::Parameter.new(Torch.zeros(ndim)) : nil
+      end
+
+      def forward(input)
+        Torch::NN::Functional.layer_norm(input, [@ndim], weight: @weight, bias: @bias, eps: 1e-5)
+      end
+    end
+  end
+end
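A sketch contrasting the two bias modes (sizes illustrative); the nil bias flows straight into Torch::NN::Functional.layer_norm, which is the point of the custom class:

    require "nano_gpt"

    with_bias = NanoGPT::Layers::LayerNorm.new(8)
    no_bias   = NanoGPT::Layers::LayerNorm.new(8, bias: false)
    puts with_bias.bias.shape.inspect # [8]  (learnable, zero-initialized)
    puts no_bias.bias.inspect         # nil
    puts no_bias.call(Torch.randn(4, 8)).shape.inspect # [4, 8]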