nanogpt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ # Prepare the Shakespeare dataset for character-level language modeling.
+ # Downloads the tiny shakespeare dataset and creates train.bin, val.bin, and meta.json.
+
+ require "net/http"
+ require "openssl"
+ require "numo/narray"
+ require "json"
+
+ DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
+ SCRIPT_DIR = File.dirname(__FILE__)
+
+ def download_file(url)
+   uri = URI(url)
+   http = Net::HTTP.new(uri.host, uri.port)
+   http.use_ssl = true
+   http.verify_mode = OpenSSL::SSL::VERIFY_NONE # GitHub CDN has CRL issues
+   response = http.get(uri.request_uri)
+   response.body
+ end
+
+ # Download the data if it is not already present
+ input_path = File.join(SCRIPT_DIR, "input.txt")
+ unless File.exist?(input_path)
+   puts "Downloading tiny shakespeare..."
+   data = download_file(DATA_URL)
+   File.write(input_path, data)
+ end
+
+ data = File.read(input_path)
+ puts "Length of dataset in characters: #{data.length}"
+
+ # Build vocabulary from all unique characters
+ chars = data.chars.uniq.sort
+ vocab_size = chars.size
+ puts "All unique characters: #{chars.join.inspect}"
+ puts "Vocab size: #{vocab_size}"
+
+ # Create character <-> integer mappings
+ stoi = chars.each_with_index.to_h
+ itos = chars.each_with_index.map { |c, i| [i, c] }.to_h
+
+ # Encode a string as an array of integer token ids
+ encode = ->(s) { s.chars.map { |c| stoi[c] } }
+
+ # Train/val split (90/10)
+ n = data.length
+ train_data = data[0...(n * 0.9).to_i]
+ val_data = data[(n * 0.9).to_i..]
+
+ # Encode to integers
+ train_ids = encode.call(train_data)
+ val_ids = encode.call(val_data)
+ puts "Train has #{train_ids.length} tokens"
+ puts "Val has #{val_ids.length} tokens"
+
+ # Export to binary files (uint16)
+ train_arr = Numo::UInt16.cast(train_ids)
+ val_arr = Numo::UInt16.cast(val_ids)
+ File.binwrite(File.join(SCRIPT_DIR, "train.bin"), train_arr.to_binary)
+ File.binwrite(File.join(SCRIPT_DIR, "val.bin"), val_arr.to_binary)
+
+ # Save meta information as JSON
+ meta = {
+   "vocab_size" => vocab_size,
+   "itos" => itos.transform_keys(&:to_s),
+   "stoi" => stoi
+ }
+ File.write(File.join(SCRIPT_DIR, "meta.json"), JSON.pretty_generate(meta))
+
+ puts "Done! Created train.bin, val.bin, and meta.json"
data/exe/nanogpt ADDED
@@ -0,0 +1,338 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ $stdout.sync = true
+
+ require "nano_gpt"
+
+ class NanoGPTCLI
+   COMMANDS = %w[train sample bench version help].freeze
+
+   def initialize(args)
+     @command = args.shift
+     @args = args
+   end
+
+   def run
+     case @command
+     when "train"
+       train
+     when "sample"
+       sample
+     when "bench"
+       bench
+     when "version", "-v", "--version"
+       version
+     when "help", "-h", "--help", nil
+       help
+     else
+       puts "Unknown command: #{@command}"
+       puts ""
+       help
+       exit 1
+     end
+   end
+
+   private
+
+   def train
+     config = NanoGPT::TrainConfig.load(@args)
+
+     if config[:device] == "auto"
+       config[:device] = NanoGPT::Device.auto
+       puts "Auto-detected device: #{config[:device]}"
+     end
+
+     data_dir = File.join("data", config[:dataset])
+     train_bin = File.join(data_dir, "train.bin")
+
+     unless File.exist?(train_bin)
+       puts "Error: #{train_bin} not found. Run the data preparation script first:"
+       puts "  bundle exec ruby data/#{config[:dataset]}/prepare.rb"
+       exit 1
+     end
+
+     tokenizer = NanoGPT::Tokenizer.for_dataset(data_dir)
+     tokenizer_type = tokenizer.is_a?(NanoGPT::GPT2Tokenizer) ? "GPT-2 BPE" : "character-level"
+     puts "Loaded #{tokenizer_type} tokenizer with vocab_size=#{tokenizer.vocab_size}"
+
+     model_config = NanoGPT::GPTConfig.new(
+       block_size: config[:block_size],
+       vocab_size: tokenizer.vocab_size,
+       n_layer: config[:n_layer],
+       n_head: config[:n_head],
+       n_embd: config[:n_embd],
+       dropout: config[:dropout],
+       bias: config[:bias]
+     )
+
+     model = NanoGPT::GPT.new(model_config)
+
+     device = config[:device]
+     if device != "cpu"
+       model.to(device)
+       puts "Model moved to #{device}"
+     end
+
+     data_loader = NanoGPT::DataLoader.new(
+       data_dir: data_dir,
+       block_size: config[:block_size],
+       batch_size: config[:batch_size],
+       device: config[:device]
+     )
+
+     puts "Train data: #{data_loader.train_size} tokens"
+     puts "Val data: #{data_loader.val_size} tokens"
+
+     trainer = NanoGPT::Trainer.new(
+       model: model,
+       data_loader: data_loader,
+       config: config.to_h
+     )
+
+     trainer.train
+
+     puts "\nTraining complete! Checkpoint saved to #{config[:out_dir]}/ckpt.pt"
+   end
+
+   def sample
+     config = NanoGPT::SampleConfig.load(@args)
+
+     if config[:device] == "auto"
+       config[:device] = NanoGPT::Device.auto
+       puts "Auto-detected device: #{config[:device]}"
+     end
+     device = config[:device]
+
+     Torch.manual_seed(config[:seed])
+
+     ckpt_path = File.join(config[:out_dir], "ckpt.pt")
+     unless File.exist?(ckpt_path)
+       puts "Error: Checkpoint not found at #{ckpt_path}"
+       puts "Train a model first with: nanogpt train"
+       exit 1
+     end
+
+     puts "Loading checkpoint from #{ckpt_path}..."
+     checkpoint = Torch.load(ckpt_path)
+
+     model_args = checkpoint["model_args"].transform_keys(&:to_sym)
+     model_config = NanoGPT::GPTConfig.new(**model_args)
+     model = NanoGPT::GPT.new(model_config)
+     model.load_state_dict(checkpoint["model"])
+     model.to(device) if device != "cpu"
+     model.eval
+
+     dataset_dir = File.join("data", config[:dataset])
+     tokenizer = NanoGPT::Tokenizer.for_dataset(dataset_dir)
+     puts "number of parameters: #{model.num_params / 1e6}M"
+
+     start_text = config[:start]
+     if start_text.start_with?("FILE:")
+       start_text = File.read(start_text[5..])
+     end
+
+     start_ids = tokenizer.encode(start_text)
+     x = Torch.tensor([start_ids], dtype: :long, device: device)
+
+     puts "Generating #{config[:num_samples]} samples..."
+     puts "=" * 50
+
+     config[:num_samples].times do
+       y = model.generate(
+         x,
+         config[:max_new_tokens],
+         temperature: config[:temperature],
+         top_k: config[:top_k]
+       )
+
+       output = tokenizer.decode(y[0].to_a)
+       puts output
+       puts "-" * 50
+     end
+   end
+
+   def bench
+     bench_config = {
+       batch_size: 12,
+       block_size: 1024,
+       n_layer: 12,
+       n_head: 12,
+       n_embd: 768,
+       dropout: 0.0,
+       bias: false,
+       real_data: true,
+       dataset: "openwebtext",
+       seed: 1337,
+       device: "auto"
+     }
+
+     # Parse --key=value overrides
+     @args.each do |arg|
+       next unless arg.start_with?("--") && arg.include?("=")
+
+       key, val = arg[2..].split("=", 2)
+       key = key.to_sym
+
+       next unless bench_config.key?(key)
+
+       bench_config[key] = case bench_config[key]
+                           when Integer then val.to_i
+                           when Float then val.to_f
+                           when TrueClass, FalseClass then val.downcase == "true"
+                           else val
+                           end
+     end
+
+     puts "=" * 60
+     puts "NanoGPT Benchmark"
+     puts "=" * 60
+     puts ""
+     puts "Configuration:"
+     puts "  batch_size: #{bench_config[:batch_size]}"
+     puts "  block_size: #{bench_config[:block_size]}"
+     puts "  n_layer: #{bench_config[:n_layer]}"
+     puts "  n_head: #{bench_config[:n_head]}"
+     puts "  n_embd: #{bench_config[:n_embd]}"
+     puts "  real_data: #{bench_config[:real_data]}"
+     puts ""
+
+     if bench_config[:device] == "auto"
+       bench_config[:device] = NanoGPT::Device.auto
+     end
+     device = bench_config[:device]
+     puts "Device: #{device}"
+
+     Torch.manual_seed(bench_config[:seed])
+
+     if bench_config[:real_data]
+       data_dir = File.join("data", bench_config[:dataset])
+       train_bin = File.join(data_dir, "train.bin")
+
+       unless File.exist?(train_bin)
+         puts ""
+         puts "Warning: #{train_bin} not found, using random data instead."
+         puts "To use real data, run: bundle exec ruby data/#{bench_config[:dataset]}/prepare.rb"
+         puts ""
+         bench_config[:real_data] = false
+       end
+     end
+
+     if bench_config[:real_data]
+       bytes = File.binread(File.join("data", bench_config[:dataset], "train.bin"))
+       train_data = bytes.unpack("S<*")
+       puts "Loaded #{train_data.size} tokens from #{bench_config[:dataset]}"
+
+       get_batch = lambda do
+         max_start = train_data.size - bench_config[:block_size] - 1
+         indices = Array.new(bench_config[:batch_size]) { rand(0..max_start) }
+         x_arrays = indices.map { |i| train_data[i, bench_config[:block_size]] }
+         y_arrays = indices.map { |i| train_data[i + 1, bench_config[:block_size]] }
+         x = Torch.tensor(x_arrays, dtype: :long)
+         y = Torch.tensor(y_arrays, dtype: :long)
+         x = x.to(device) if device != "cpu"
+         y = y.to(device) if device != "cpu"
+         [x, y]
+       end
+     else
+       vocab_size = 50304
+       puts "Using random data (vocab_size=#{vocab_size})"
+
+       get_batch = lambda do
+         x = Torch.randint(vocab_size, [bench_config[:batch_size], bench_config[:block_size]], dtype: :long)
+         y = Torch.randint(vocab_size, [bench_config[:batch_size], bench_config[:block_size]], dtype: :long)
+         x = x.to(device) if device != "cpu"
+         y = y.to(device) if device != "cpu"
+         [x, y]
+       end
+     end
+
+     puts ""
+     puts "Initializing model..."
+     model_config = NanoGPT::GPTConfig.new(
+       block_size: bench_config[:block_size],
+       vocab_size: 50304,
+       n_layer: bench_config[:n_layer],
+       n_head: bench_config[:n_head],
+       n_embd: bench_config[:n_embd],
+       dropout: bench_config[:dropout],
+       bias: bench_config[:bias]
+     )
+
+     model = NanoGPT::GPT.new(model_config)
+     model.to(device) if device != "cpu"
+
+     optimizer = model.configure_optimizers(
+       weight_decay: 1e-2,
+       learning_rate: 1e-4,
+       betas: [0.9, 0.95],
+       device_type: NanoGPT::Device.type(device)
+     )
+
+     puts ""
+     puts "Starting benchmark..."
+     puts "-" * 60
+
+     [{ name: "burn-in", steps: 10 }, { name: "benchmark", steps: 20 }].each do |phase|
+       puts ""
+       puts "Phase: #{phase[:name]} (#{phase[:steps]} steps)"
+
+       x, y = get_batch.call
+       t0 = Time.now
+
+       phase[:steps].times do |k|
+         _logits, loss = model.forward(x, targets: y)
+         x, y = get_batch.call
+         optimizer.zero_grad
+         loss.backward
+         optimizer.step
+         loss_val = loss.item
+         puts "  #{k}/#{phase[:steps]} loss: #{format('%.4f', loss_val)}"
+       end
+
+       t1 = Time.now
+       dt = t1 - t0
+
+       if phase[:name] == "benchmark"
+         mfu = model.estimate_mfu(bench_config[:batch_size] * phase[:steps], dt)
+         time_per_iter = dt / phase[:steps] * 1000
+
+         puts ""
+         puts "=" * 60
+         puts "Results:"
+         puts "  Time per iteration: #{format('%.2f', time_per_iter)}ms"
+         puts "  MFU: #{format('%.2f', mfu * 100)}%"
+         puts "=" * 60
+       end
+     end
+   end
+
+   def version
+     puts "nanogpt #{NanoGPT::VERSION}"
+   end
+
+   def help
+     puts <<~HELP
+       nanogpt - A Ruby port of Karpathy's nanoGPT
+
+       Usage: nanogpt <command> [options]
+
+       Commands:
+         train      Train a GPT model
+         sample     Generate text from a trained model
+         bench      Run performance benchmarks
+         version    Show version
+         help       Show this help message
+
+       Examples:
+         nanogpt train --config=config/train_shakespeare_char.json
+         nanogpt train --dataset=shakespeare_char --max_iters=1000
+         nanogpt sample --out_dir=out-shakespeare-char --num_samples=3
+         nanogpt bench --batch_size=8 --block_size=512
+
+       For more information, visit: https://github.com/khasinski/nanogpt-rb
+     HELP
+   end
+ end
+
+ NanoGPTCLI.new(ARGV).run
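The bench command's `--key=value` overrides are coerced to the type of the default value in `bench_config`, so `--batch_size=8` becomes the Integer 8 and `--real_data=false` becomes the boolean false. A standalone sketch of that coercion rule (names here are illustrative, not part of the gem):

    # Sketch: coerce a CLI override string to the type of its default value,
    # the same rule the bench command applies.
    defaults = { batch_size: 12, dropout: 0.0, real_data: true, dataset: "openwebtext" }

    def coerce(default, val)
      case default
      when Integer then val.to_i
      when Float then val.to_f
      when TrueClass, FalseClass then val.downcase == "true"
      else val
      end
    end

    coerce(defaults[:batch_size], "8")     # => 8
    coerce(defaults[:real_data], "false")  # => false
    coerce(defaults[:dataset], "shakespeare_char")  # => "shakespeare_char"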
@@ -0,0 +1,42 @@
+ # frozen_string_literal: true
+
+ module NanoGPT
+   # Configuration for GPT model architecture
+   class GPTConfig
+     attr_accessor :block_size, :vocab_size, :n_layer, :n_head, :n_embd, :dropout, :bias
+
+     def initialize(
+       block_size: 1024,
+       vocab_size: 50304,
+       n_layer: 12,
+       n_head: 12,
+       n_embd: 768,
+       dropout: 0.0,
+       bias: true
+     )
+       @block_size = block_size
+       @vocab_size = vocab_size
+       @n_layer = n_layer
+       @n_head = n_head
+       @n_embd = n_embd
+       @dropout = dropout
+       @bias = bias
+     end
+
+     def to_h
+       {
+         block_size: @block_size,
+         vocab_size: @vocab_size,
+         n_layer: @n_layer,
+         n_head: @n_head,
+         n_embd: @n_embd,
+         dropout: @dropout,
+         bias: @bias
+       }
+     end
+
+     def head_size
+       @n_embd / @n_head
+     end
+   end
+ end
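For reference, a small character-level configuration built with this class (values are illustrative; it only assumes the gem is loaded):

    # Sketch: a small character-level config; head_size is n_embd / n_head.
    require "nano_gpt"

    config = NanoGPT::GPTConfig.new(
      block_size: 256, vocab_size: 65,
      n_layer: 6, n_head: 6, n_embd: 384,
      dropout: 0.2, bias: false
    )
    config.head_size # => 64
    config.to_h      # => { block_size: 256, vocab_size: 65, n_layer: 6, ... }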
@@ -0,0 +1,74 @@
+ # frozen_string_literal: true
+
+ module NanoGPT
+   # Loads batches from binary token files.
+   # Memory-efficient: reads from the file on every batch (like Python's memmap recreation).
+   class DataLoader
+     attr_reader :block_size, :batch_size, :train_size, :val_size
+
+     BYTES_PER_TOKEN = 2 # uint16
+
+     def initialize(data_dir:, block_size:, batch_size:, device: "cpu")
+       @data_dir = data_dir
+       @block_size = block_size
+       @batch_size = batch_size
+       @device = device
+
+       # Store file paths and sizes (NOT the data itself)
+       @train_path = File.join(data_dir, "train.bin")
+       @val_path = File.join(data_dir, "val.bin")
+
+       @train_size = File.size(@train_path) / BYTES_PER_TOKEN
+       @val_size = File.size(@val_path) / BYTES_PER_TOKEN
+     end
+
+     # Get a batch of data.
+     # Memory-efficient: re-reads only the needed bytes per batch to avoid a memory leak
+     # (matches Python's memmap recreation pattern).
+     def get_batch(split)
+       path = split == :train ? @train_path : @val_path
+       data_size = split == :train ? @train_size : @val_size
+
+       # Random starting indices
+       max_start = data_size - @block_size - 1
+       indices = Array.new(@batch_size) { rand(0..max_start) }
+
+       x_arrays = []
+       y_arrays = []
+
+       File.open(path, "rb") do |f|
+         indices.each do |i|
+           # Read block_size + 1 tokens starting at i: x is tokens[i, block_size],
+           # y is the same window shifted right by one token
+           f.seek(i * BYTES_PER_TOKEN)
+           bytes = f.read((@block_size + 1) * BYTES_PER_TOKEN)
+           tokens = bytes.unpack("S<*") # uint16, little-endian
+
+           x_arrays << tokens[0...@block_size]
+           y_arrays << tokens[1..@block_size]
+         end
+       end
+
+       # Create tensors directly from arrays (avoiding a Numo intermediate)
+       x = Torch.tensor(x_arrays, dtype: :long)
+       y = Torch.tensor(y_arrays, dtype: :long)
+
+       # Move to device (CPU, CUDA, or MPS)
+       if @device != "cpu"
+         x = x.to(@device)
+         y = y.to(@device)
+       end
+
+       [x, y]
+     end
+   end
+ end
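Typical use, as in the train command (a sketch that assumes data/shakespeare_char has already been prepared):

    # Sketch: draw one training batch from the prepared Shakespeare data.
    require "nano_gpt"

    loader = NanoGPT::DataLoader.new(
      data_dir: "data/shakespeare_char",
      block_size: 256,
      batch_size: 64,
      device: "cpu"
    )
    x, y = loader.get_batch(:train)
    x.shape # => [64, 256]; y holds the same windows shifted right by one token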
@@ -0,0 +1,56 @@
+ # frozen_string_literal: true
+
+ module NanoGPT
+   # Device detection and management
+   module Device
+     class << self
+       # Auto-detect the best available device.
+       # Priority: CUDA > MPS > CPU
+       def auto
+         return "cuda" if cuda_available?
+         return "mps" if mps_available?
+
+         "cpu"
+       end
+
+       # Check if CUDA is available
+       def cuda_available?
+         Torch::CUDA.available?
+       rescue StandardError
+         false
+       end
+
+       # Check if MPS (Metal Performance Shaders, Apple Silicon GPU acceleration) is available
+       def mps_available?
+         # Try to create a tensor on the MPS device
+         Torch.tensor([1.0], device: "mps")
+         true
+       rescue StandardError
+         false
+       end
+
+       # Get the device type string (for optimizer configuration, etc.)
+       def type(device)
+         case device.to_s
+         when /cuda/ then "cuda"
+         when /mps/ then "mps"
+         else "cpu"
+         end
+       end
+
+       # Check if the device is a GPU (CUDA or MPS)
+       def gpu?(device)
+         %w[cuda mps].include?(type(device))
+       end
+
+       # Print device info
+       def info
+         puts "Device detection:"
+         puts "  CUDA available: #{cuda_available?}"
+         puts "  MPS available: #{mps_available?}"
+         puts "  Auto-selected: #{auto}"
+       end
+     end
+   end
+ end
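A quick sketch of how the CLI uses this module (assumes only that the gem is loaded):

    # Sketch: detect and classify devices.
    require "nano_gpt"

    NanoGPT::Device.info           # prints CUDA/MPS availability and the auto choice
    device = NanoGPT::Device.auto  # "cuda", "mps", or "cpu"
    NanoGPT::Device.type("cuda:0") # => "cuda"
    NanoGPT::Device.gpu?(device)   # => true only when CUDA or MPS was selected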
@@ -0,0 +1,25 @@
+ # frozen_string_literal: true
+
+ module NanoGPT
+   module Layers
+     # Transformer block: LayerNorm -> Attention -> LayerNorm -> MLP
+     class Block < Torch::NN::Module
+       def initialize(config)
+         super()
+         @ln_1 = LayerNorm.new(config.n_embd, bias: config.bias)
+         @attn = CausalSelfAttention.new(config)
+         @ln_2 = LayerNorm.new(config.n_embd, bias: config.bias)
+         @mlp = MLP.new(config)
+       end
+
+       def forward(x)
+         x = x + @attn.call(@ln_1.call(x))
+         x = x + @mlp.call(@ln_2.call(x))
+         # Trigger GC to free intermediate tensors (critical for torch.rb memory management).
+         # Ruby's GC doesn't run frequently enough during the forward pass, causing memory accumulation.
+         GC.start(full_mark: false, immediate_sweep: true)
+         x
+       end
+     end
+   end
+ end
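The pre-norm residual structure makes a block shape-preserving: input and output are both (batch, time, n_embd). A small sketch using the gem's own classes (illustrative values; dropout 0.0 so the attention layer takes its flash path):

    # Sketch: one transformer block maps (B, T, n_embd) -> (B, T, n_embd).
    require "nano_gpt"

    config = NanoGPT::GPTConfig.new(block_size: 32, vocab_size: 65,
                                    n_layer: 1, n_head: 4, n_embd: 64, dropout: 0.0)
    block = NanoGPT::Layers::Block.new(config)
    x = Torch.randn(2, 32, 64)
    block.call(x).shape # => [2, 32, 64]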
@@ -0,0 +1,73 @@
+ # frozen_string_literal: true
+
+ module NanoGPT
+   module Layers
+     # Multi-head causal self-attention
+     class CausalSelfAttention < Torch::NN::Module
+       def initialize(config)
+         super()
+         raise ArgumentError, "n_embd must be divisible by n_head" unless (config.n_embd % config.n_head).zero?
+
+         @n_head = config.n_head
+         @n_embd = config.n_embd
+         @head_size = config.n_embd / config.n_head
+         @dropout_p = config.dropout
+
+         # Key, query, value projections for all heads, combined
+         @c_attn = Torch::NN::Linear.new(config.n_embd, 3 * config.n_embd, bias: config.bias)
+         # Output projection
+         @c_proj = Torch::NN::Linear.new(config.n_embd, config.n_embd, bias: config.bias)
+         # Regularization
+         @attn_dropout = Torch::NN::Dropout.new(p: config.dropout)
+         @resid_dropout = Torch::NN::Dropout.new(p: config.dropout)
+
+         # Use native scaled_dot_product_attention with is_causal=true when dropout is 0.
+         # Native SDPA is ~5x faster but doesn't support dropout with is_causal mode.
+         @flash = config.dropout == 0.0
+
+         # Causal mask for manual attention (only used when @flash is false)
+         unless @flash
+           mask = Torch.tril(Torch.ones(config.block_size, config.block_size))
+           register_buffer("mask", mask.view(1, 1, config.block_size, config.block_size))
+         end
+       end
+
+       def forward(x)
+         b, t, c = x.shape
+
+         # Calculate Q, K, V
+         qkv = @c_attn.call(x)
+         q, k, v = qkv.split(@n_embd, 2)
+
+         # Reshape: (B, T, C) -> (B, nh, T, hs)
+         q = q.view(b, t, @n_head, @head_size).transpose(1, 2)
+         k = k.view(b, t, @n_head, @head_size).transpose(1, 2)
+         v = v.view(b, t, @n_head, @head_size).transpose(1, 2)
+
+         y = if @flash
+               # Native scaled_dot_product_attention with is_causal=true.
+               # Uses Flash Attention on CUDA, an optimized kernel on MPS.
+               Torch::NN.scaled_dot_product_attention(q, k, v, nil, 0.0, true)
+             else
+               # Manual attention implementation with causal mask
+               scale = 1.0 / Math.sqrt(@head_size)
+               att = q.matmul(k.transpose(-2, -1))
+               att.mul!(scale)
+
+               # Apply the causal mask, sliced to the current sequence length
+               mask_slice = @mask.narrow(2, 0, t).narrow(3, 0, t)
+               att.masked_fill!(mask_slice.eq(0), -Float::INFINITY)
+               att = Torch::NN::Functional.softmax(att, dim: -1)
+               att = @attn_dropout.call(att)
+               att.matmul(v)
+             end
+
+         # Reassemble heads: (B, nh, T, hs) -> (B, T, C)
+         y = y.transpose(1, 2).contiguous.view(b, t, c)
+
+         # Output projection
+         @resid_dropout.call(@c_proj.call(y))
+       end
+     end
+   end
+   end
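The manual path masks with a lower-triangular matrix so that each position can only attend to itself and earlier positions. A tiny illustration of that mask (not part of the gem):

    # Sketch: the causal mask used by the manual (non-flash) path, for a sequence length of 4.
    require "torch"

    mask = Torch.tril(Torch.ones(4, 4))
    # => 1 0 0 0
    #    1 1 0 0
    #    1 1 1 0
    #    1 1 1 1
    # Entries where the mask is 0 are filled with -Infinity before the softmax,
    # so position t only attends to positions 0..t.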
@@ -0,0 +1,21 @@
+ # frozen_string_literal: true
+
+ module NanoGPT
+   module Layers
+     # LayerNorm with an optional bias (PyTorch doesn't support bias=false directly)
+     class LayerNorm < Torch::NN::Module
+       attr_reader :weight, :bias
+
+       def initialize(ndim, bias: true)
+         super()
+         @ndim = ndim
+         @weight = Torch::NN::Parameter.new(Torch.ones(ndim))
+         @bias = bias ? Torch::NN::Parameter.new(Torch.zeros(ndim)) : nil
+       end
+
+       def forward(input)
+         Torch::NN::Functional.layer_norm(input, [@ndim], weight: @weight, bias: @bias, eps: 1e-5)
+       end
+     end
+   end
+ end
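A short usage sketch for the bias-free case this class exists for (assumes only that the gem is loaded):

    # Sketch: LayerNorm without a bias parameter.
    require "nano_gpt"

    ln = NanoGPT::Layers::LayerNorm.new(768, bias: false)
    x  = Torch.randn(2, 16, 768)
    ln.call(x).shape # => [2, 16, 768]
    ln.bias          # => nil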