nanogpt 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +30 -1
- data/docs/ARCHITECTURE.md +429 -0
- data/exe/nanogpt +210 -233
- data/lib/nano_gpt/bpe_textfile_preparer.rb +105 -0
- data/lib/nano_gpt/data_loader.rb +5 -20
- data/lib/nano_gpt/layers/block.rb +6 -1
- data/lib/nano_gpt/layers/causal_self_attention.rb +11 -1
- data/lib/nano_gpt/model.rb +1 -7
- data/lib/nano_gpt/textfile_preparer.rb +189 -0
- data/lib/nano_gpt/train_config.rb +80 -146
- data/lib/nano_gpt/trainer.rb +21 -48
- data/lib/nano_gpt/version.rb +1 -1
- data/lib/nano_gpt/web/metrics_store.rb +136 -0
- data/lib/nano_gpt/web/server.rb +294 -0
- data/lib/nano_gpt/web/sse_notifier.rb +37 -0
- data/lib/nano_gpt/web/training_state.rb +56 -0
- data/lib/nano_gpt/web/training_worker.rb +153 -0
- data/lib/nano_gpt/web/views/layout.erb +78 -0
- data/lib/nano_gpt/web/views/run_detail.erb +432 -0
- data/lib/nano_gpt/web/views/runs.erb +434 -0
- data/lib/nano_gpt/web/web_trainer.rb +210 -0
- data/lib/nano_gpt/web.rb +9 -0
- data/lib/nano_gpt.rb +1 -0
- data/nanogpt.gemspec +4 -0
- metadata +71 -2
data/exe/nanogpt
CHANGED
|
@@ -6,7 +6,7 @@ $stdout.sync = true
|
|
|
6
6
|
require "nano_gpt"
|
|
7
7
|
|
|
8
8
|
class NanoGPTCLI
|
|
9
|
-
COMMANDS = %w[prepare train sample bench version help].freeze
|
|
9
|
+
COMMANDS = %w[prepare train sample bench web check version help].freeze
|
|
10
10
|
|
|
11
11
|
def initialize(args)
|
|
12
12
|
@command = args.shift
|
|
@@ -23,6 +23,10 @@ class NanoGPTCLI
|
|
|
23
23
|
sample
|
|
24
24
|
when "bench"
|
|
25
25
|
bench
|
|
26
|
+
when "web"
|
|
27
|
+
web
|
|
28
|
+
when "check"
|
|
29
|
+
check
|
|
26
30
|
when "version", "-v", "--version"
|
|
27
31
|
version
|
|
28
32
|
when "help", "-h", "--help", nil
|
|
@@ -40,7 +44,6 @@ class NanoGPTCLI
|
|
|
40
44
|
def prepare
|
|
41
45
|
dataset = @args.first
|
|
42
46
|
|
|
43
|
-
# Find available datasets
|
|
44
47
|
data_dir = File.join(File.dirname(__FILE__), "..", "data")
|
|
45
48
|
available = Dir.glob(File.join(data_dir, "*", "prepare.rb")).map do |path|
|
|
46
49
|
File.basename(File.dirname(path))
|
|
@@ -77,7 +80,6 @@ class NanoGPTCLI
|
|
|
77
80
|
exit 1
|
|
78
81
|
end
|
|
79
82
|
|
|
80
|
-
# Set output directory to current working directory
|
|
81
83
|
output_dir = File.join(Dir.pwd, "data", dataset)
|
|
82
84
|
ENV["NANOGPT_DATA_DIR"] = output_dir
|
|
83
85
|
|
|
@@ -87,10 +89,6 @@ class NanoGPTCLI
|
|
|
87
89
|
end
|
|
88
90
|
|
|
89
91
|
def prepare_textfile
|
|
90
|
-
require "numo/narray"
|
|
91
|
-
require "json"
|
|
92
|
-
require "fileutils"
|
|
93
|
-
|
|
94
92
|
input_path = nil
|
|
95
93
|
output_name = nil
|
|
96
94
|
val_ratio = 0.1
|
|
@@ -116,144 +114,15 @@ class NanoGPTCLI
|
|
|
116
114
|
exit 1
|
|
117
115
|
end
|
|
118
116
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
file_size = File.size(input_path)
|
|
129
|
-
puts "Preparing text file: #{input_path}"
|
|
130
|
-
puts "File size: #{(file_size / 1_000_000.0).round(2)} MB"
|
|
131
|
-
puts "Output directory: #{output_dir}"
|
|
132
|
-
puts "Validation ratio: #{val_ratio}"
|
|
133
|
-
puts ""
|
|
134
|
-
|
|
135
|
-
# Phase 1: Build vocabulary by reading entire file
|
|
136
|
-
# For very large files, we read line by line to avoid memory issues
|
|
137
|
-
puts "Phase 1: Building vocabulary..."
|
|
138
|
-
char_set = Set.new
|
|
139
|
-
char_count = 0
|
|
140
|
-
|
|
141
|
-
# Detect encoding: check if file is valid UTF-8, otherwise assume Windows-1252
|
|
142
|
-
sample = File.binread(input_path, 100_000)
|
|
143
|
-
encoding = sample.force_encoding("UTF-8").valid_encoding? ? "UTF-8" : "Windows-1252:UTF-8"
|
|
144
|
-
puts " Detected encoding: #{encoding.split(':').first}"
|
|
145
|
-
|
|
146
|
-
File.foreach(input_path, encoding: encoding) do |line|
|
|
147
|
-
line.each_char { |c| char_set.add(c) }
|
|
148
|
-
char_count += line.length
|
|
149
|
-
print "\r Scanned #{char_count} characters, #{char_set.size} unique..." if (char_count % 100_000) < 1000
|
|
150
|
-
end
|
|
151
|
-
puts "\r Scanned #{char_count} characters, #{char_set.size} unique..."
|
|
152
|
-
|
|
153
|
-
chars = char_set.to_a.sort
|
|
154
|
-
vocab_size = chars.size
|
|
155
|
-
puts "Vocabulary size: #{vocab_size}"
|
|
156
|
-
|
|
157
|
-
stoi = chars.each_with_index.to_h
|
|
158
|
-
itos = chars.each_with_index.map { |c, i| [i, c] }.to_h
|
|
159
|
-
|
|
160
|
-
# Phase 2: Calculate split point
|
|
161
|
-
total_chars = char_count
|
|
162
|
-
val_chars = (total_chars * val_ratio).to_i
|
|
163
|
-
train_chars = total_chars - val_chars
|
|
164
|
-
puts ""
|
|
165
|
-
puts "Train: #{train_chars} characters"
|
|
166
|
-
puts "Val: #{val_chars} characters"
|
|
167
|
-
|
|
168
|
-
# Phase 3: Encode and write train.bin (streaming line by line)
|
|
169
|
-
puts ""
|
|
170
|
-
puts "Phase 2: Encoding and writing train.bin..."
|
|
171
|
-
train_path = File.join(output_dir, "train.bin")
|
|
172
|
-
chars_written = 0
|
|
173
|
-
buffer = []
|
|
174
|
-
buffer_size = 100_000
|
|
175
|
-
|
|
176
|
-
File.open(train_path, "wb") do |output|
|
|
177
|
-
File.foreach(input_path, encoding: encoding) do |line|
|
|
178
|
-
line.each_char do |c|
|
|
179
|
-
break if chars_written >= train_chars
|
|
180
|
-
|
|
181
|
-
buffer << stoi[c]
|
|
182
|
-
chars_written += 1
|
|
183
|
-
|
|
184
|
-
if buffer.size >= buffer_size
|
|
185
|
-
arr = Numo::UInt16.cast(buffer)
|
|
186
|
-
output.write(arr.to_binary)
|
|
187
|
-
buffer.clear
|
|
188
|
-
print "\r Written #{chars_written}/#{train_chars} characters..."
|
|
189
|
-
end
|
|
190
|
-
end
|
|
191
|
-
break if chars_written >= train_chars
|
|
192
|
-
end
|
|
193
|
-
|
|
194
|
-
unless buffer.empty?
|
|
195
|
-
arr = Numo::UInt16.cast(buffer)
|
|
196
|
-
output.write(arr.to_binary)
|
|
197
|
-
buffer.clear
|
|
198
|
-
end
|
|
199
|
-
end
|
|
200
|
-
puts ""
|
|
201
|
-
|
|
202
|
-
# Phase 4: Encode and write val.bin (streaming line by line)
|
|
203
|
-
puts "Phase 3: Encoding and writing val.bin..."
|
|
204
|
-
val_path = File.join(output_dir, "val.bin")
|
|
205
|
-
chars_written = 0
|
|
206
|
-
skipped = 0
|
|
207
|
-
buffer = []
|
|
208
|
-
|
|
209
|
-
File.open(val_path, "wb") do |output|
|
|
210
|
-
File.foreach(input_path, encoding: encoding) do |line|
|
|
211
|
-
line.each_char do |c|
|
|
212
|
-
if skipped < train_chars
|
|
213
|
-
skipped += 1
|
|
214
|
-
next
|
|
215
|
-
end
|
|
216
|
-
|
|
217
|
-
buffer << stoi[c]
|
|
218
|
-
chars_written += 1
|
|
219
|
-
|
|
220
|
-
if buffer.size >= buffer_size
|
|
221
|
-
arr = Numo::UInt16.cast(buffer)
|
|
222
|
-
output.write(arr.to_binary)
|
|
223
|
-
buffer.clear
|
|
224
|
-
print "\r Written #{chars_written}/#{val_chars} characters..."
|
|
225
|
-
end
|
|
226
|
-
end
|
|
227
|
-
end
|
|
228
|
-
|
|
229
|
-
unless buffer.empty?
|
|
230
|
-
arr = Numo::UInt16.cast(buffer)
|
|
231
|
-
output.write(arr.to_binary)
|
|
232
|
-
buffer.clear
|
|
233
|
-
end
|
|
234
|
-
end
|
|
235
|
-
puts ""
|
|
236
|
-
|
|
237
|
-
# Phase 5: Save meta.json
|
|
238
|
-
puts "Phase 4: Saving meta.json..."
|
|
239
|
-
meta = {
|
|
240
|
-
"vocab_size" => vocab_size,
|
|
241
|
-
"itos" => itos.transform_keys(&:to_s),
|
|
242
|
-
"stoi" => stoi
|
|
243
|
-
}
|
|
244
|
-
File.write(File.join(output_dir, "meta.json"), JSON.pretty_generate(meta))
|
|
245
|
-
|
|
246
|
-
train_size_mb = File.size(train_path) / 1_000_000.0
|
|
247
|
-
val_size_mb = File.size(val_path) / 1_000_000.0
|
|
248
|
-
|
|
249
|
-
puts ""
|
|
250
|
-
puts "Done!"
|
|
251
|
-
puts " train.bin: #{train_chars} tokens (#{train_size_mb.round(2)} MB)"
|
|
252
|
-
puts " val.bin: #{val_chars} tokens (#{val_size_mb.round(2)} MB)"
|
|
253
|
-
puts " meta.json: vocab_size=#{vocab_size}"
|
|
254
|
-
puts ""
|
|
255
|
-
puts "To train:"
|
|
256
|
-
puts " nanogpt train --dataset=#{output_name}"
|
|
117
|
+
preparer = NanoGPT::TextfilePreparer.new(
|
|
118
|
+
input_path: input_path,
|
|
119
|
+
output_name: output_name,
|
|
120
|
+
val_ratio: val_ratio
|
|
121
|
+
)
|
|
122
|
+
preparer.prepare
|
|
123
|
+
rescue RuntimeError => e
|
|
124
|
+
puts "Error: #{e.message}"
|
|
125
|
+
exit 1
|
|
257
126
|
end
|
|
258
127
|
|
|
259
128
|
def train
|
|
@@ -359,7 +228,7 @@ class NanoGPTCLI
|
|
|
359
228
|
puts "Generating #{config[:num_samples]} samples..."
|
|
360
229
|
puts "=" * 50
|
|
361
230
|
|
|
362
|
-
config[:num_samples].times do |
|
|
231
|
+
config[:num_samples].times do |_k|
|
|
363
232
|
y = model.generate(
|
|
364
233
|
x,
|
|
365
234
|
config[:max_new_tokens],
|
|
@@ -374,110 +243,58 @@ class NanoGPTCLI
|
|
|
374
243
|
end
|
|
375
244
|
|
|
376
245
|
def bench
|
|
377
|
-
|
|
378
|
-
batch_size: 12,
|
|
379
|
-
block_size: 1024,
|
|
380
|
-
n_layer: 12,
|
|
381
|
-
n_head: 12,
|
|
382
|
-
n_embd: 768,
|
|
383
|
-
dropout: 0.0,
|
|
384
|
-
bias: false,
|
|
385
|
-
real_data: true,
|
|
386
|
-
dataset: "openwebtext",
|
|
387
|
-
seed: 1337,
|
|
388
|
-
device: "auto"
|
|
389
|
-
}
|
|
390
|
-
|
|
391
|
-
# Parse args
|
|
392
|
-
@args.each do |arg|
|
|
393
|
-
next unless arg.start_with?("--") && arg.include?("=")
|
|
394
|
-
|
|
395
|
-
key, val = arg[2..].split("=", 2)
|
|
396
|
-
key = key.to_sym
|
|
397
|
-
|
|
398
|
-
next unless bench_config.key?(key)
|
|
399
|
-
|
|
400
|
-
bench_config[key] = case bench_config[key]
|
|
401
|
-
when Integer then val.to_i
|
|
402
|
-
when Float then val.to_f
|
|
403
|
-
when TrueClass, FalseClass then val.downcase == "true"
|
|
404
|
-
else val
|
|
405
|
-
end
|
|
406
|
-
end
|
|
246
|
+
config = NanoGPT::BenchConfig.load(@args)
|
|
407
247
|
|
|
408
248
|
puts "=" * 60
|
|
409
249
|
puts "NanoGPT Benchmark"
|
|
410
250
|
puts "=" * 60
|
|
411
251
|
puts ""
|
|
412
252
|
puts "Configuration:"
|
|
413
|
-
puts " batch_size: #{
|
|
414
|
-
puts " block_size: #{
|
|
415
|
-
puts " n_layer: #{
|
|
416
|
-
puts " n_head: #{
|
|
417
|
-
puts " n_embd: #{
|
|
418
|
-
puts " real_data: #{
|
|
253
|
+
puts " batch_size: #{config[:batch_size]}"
|
|
254
|
+
puts " block_size: #{config[:block_size]}"
|
|
255
|
+
puts " n_layer: #{config[:n_layer]}"
|
|
256
|
+
puts " n_head: #{config[:n_head]}"
|
|
257
|
+
puts " n_embd: #{config[:n_embd]}"
|
|
258
|
+
puts " real_data: #{config[:real_data]}"
|
|
419
259
|
puts ""
|
|
420
260
|
|
|
421
|
-
if
|
|
422
|
-
|
|
261
|
+
if config[:device] == "auto"
|
|
262
|
+
config[:device] = NanoGPT::Device.auto
|
|
423
263
|
end
|
|
424
|
-
device =
|
|
264
|
+
device = config[:device]
|
|
425
265
|
puts "Device: #{device}"
|
|
426
266
|
|
|
427
|
-
Torch.manual_seed(
|
|
267
|
+
Torch.manual_seed(config[:seed])
|
|
428
268
|
|
|
429
|
-
if
|
|
430
|
-
data_dir = File.join("data",
|
|
269
|
+
if config[:real_data]
|
|
270
|
+
data_dir = File.join("data", config[:dataset])
|
|
431
271
|
train_bin = File.join(data_dir, "train.bin")
|
|
432
272
|
|
|
433
273
|
unless File.exist?(train_bin)
|
|
434
274
|
puts ""
|
|
435
275
|
puts "Warning: #{train_bin} not found, using random data instead."
|
|
436
|
-
puts "To use real data, run: bundle exec ruby data/#{
|
|
276
|
+
puts "To use real data, run: bundle exec ruby data/#{config[:dataset]}/prepare.rb"
|
|
437
277
|
puts ""
|
|
438
|
-
|
|
278
|
+
config[:real_data] = false
|
|
439
279
|
end
|
|
440
280
|
end
|
|
441
281
|
|
|
442
|
-
if
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
get_batch = lambda do
|
|
448
|
-
max_start = train_data.size - bench_config[:block_size] - 1
|
|
449
|
-
indices = Array.new(bench_config[:batch_size]) { rand(0..max_start) }
|
|
450
|
-
x_arrays = indices.map { |i| train_data[i, bench_config[:block_size]] }
|
|
451
|
-
y_arrays = indices.map { |i| train_data[i + 1, bench_config[:block_size]] }
|
|
452
|
-
x = Torch.tensor(x_arrays, dtype: :long)
|
|
453
|
-
y = Torch.tensor(y_arrays, dtype: :long)
|
|
454
|
-
x = x.to(device) if device != "cpu"
|
|
455
|
-
y = y.to(device) if device != "cpu"
|
|
456
|
-
[x, y]
|
|
457
|
-
end
|
|
458
|
-
else
|
|
459
|
-
vocab_size = 50304
|
|
460
|
-
puts "Using random data (vocab_size=#{vocab_size})"
|
|
461
|
-
|
|
462
|
-
get_batch = lambda do
|
|
463
|
-
x = Torch.randint(vocab_size, [bench_config[:batch_size], bench_config[:block_size]], dtype: :long)
|
|
464
|
-
y = Torch.randint(vocab_size, [bench_config[:batch_size], bench_config[:block_size]], dtype: :long)
|
|
465
|
-
x = x.to(device) if device != "cpu"
|
|
466
|
-
y = y.to(device) if device != "cpu"
|
|
467
|
-
[x, y]
|
|
468
|
-
end
|
|
469
|
-
end
|
|
282
|
+
get_batch = if config[:real_data]
|
|
283
|
+
create_real_data_batch_fn(config, device)
|
|
284
|
+
else
|
|
285
|
+
create_random_data_batch_fn(config, device)
|
|
286
|
+
end
|
|
470
287
|
|
|
471
288
|
puts ""
|
|
472
289
|
puts "Initializing model..."
|
|
473
290
|
model_config = NanoGPT::GPTConfig.new(
|
|
474
|
-
block_size:
|
|
291
|
+
block_size: config[:block_size],
|
|
475
292
|
vocab_size: 50304,
|
|
476
|
-
n_layer:
|
|
477
|
-
n_head:
|
|
478
|
-
n_embd:
|
|
479
|
-
dropout:
|
|
480
|
-
bias:
|
|
293
|
+
n_layer: config[:n_layer],
|
|
294
|
+
n_head: config[:n_head],
|
|
295
|
+
n_embd: config[:n_embd],
|
|
296
|
+
dropout: config[:dropout],
|
|
297
|
+
bias: config[:bias]
|
|
481
298
|
)
|
|
482
299
|
|
|
483
300
|
model = NanoGPT::GPT.new(model_config)
|
|
@@ -494,6 +311,41 @@ class NanoGPTCLI
|
|
|
494
311
|
puts "Starting benchmark..."
|
|
495
312
|
puts "-" * 60
|
|
496
313
|
|
|
314
|
+
run_benchmark_phases(model, optimizer, get_batch, config)
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
# Builds a zero-argument lambda that samples random training batches from
# data/<dataset>/train.bin (path is relative to the current working directory).
#
# The file is read once, up front, and decoded as little-endian unsigned
# 16-bit integers. Every call of the returned lambda draws batch_size random
# start offsets, slices block_size tokens for x and the same window shifted by
# one token for y, converts both to :long tensors and moves them to `device`
# unless the device is "cpu".
#
# config - object indexed with :dataset, :block_size and :batch_size
# device - device name string; "cpu" skips the transfer
#
# Returns a lambda yielding [x, y].
# Raises RuntimeError when train.bin holds fewer than block_size + 1 tokens,
# because no full (x, y) window could ever be sampled from it.
def create_real_data_batch_fn(config, device)
  train_bin = File.join("data", config[:dataset], "train.bin")
  train_data = File.binread(train_bin).unpack("S<*")
  puts "Loaded #{train_data.size} tokens from #{config[:dataset]}"

  # Fail fast: with too little data max_start below would be negative,
  # rand(0..max_start) would return nil and slicing would blow up later with
  # an unhelpful TypeError inside the benchmark loop.
  if train_data.size <= config[:block_size]
    raise "#{train_bin} has only #{train_data.size} tokens; " \
          "need at least #{config[:block_size] + 1} for block_size=#{config[:block_size]}"
  end

  lambda do
    block_size = config[:block_size]
    max_start = train_data.size - block_size - 1
    indices = Array.new(config[:batch_size]) { rand(0..max_start) }
    x_arrays = indices.map { |i| train_data[i, block_size] }
    # Targets are the inputs shifted right by one token.
    y_arrays = indices.map { |i| train_data[i + 1, block_size] }
    x = Torch.tensor(x_arrays, dtype: :long)
    y = Torch.tensor(y_arrays, dtype: :long)
    x = x.to(device) if device != "cpu"
    y = y.to(device) if device != "cpu"
    [x, y]
  end
end
|
|
334
|
+
|
|
335
|
+
# Returns a zero-argument lambda producing synthetic benchmark batches.
#
# Each call draws two independent :long tensors (inputs first, then targets)
# of shape [batch_size, block_size] with values below a fixed vocabulary size
# of 50304, and moves both to `device` unless the device is "cpu".
#
# config - object indexed with :batch_size and :block_size
# device - device name string; "cpu" skips the transfer
def create_random_data_batch_fn(config, device)
  vocab_size = 50304
  puts "Using random data (vocab_size=#{vocab_size})"

  lambda do
    shape = [config[:batch_size], config[:block_size]]
    # Draw order (x before y) is kept so seeded runs stay reproducible.
    pair = Array.new(2) { Torch.randint(vocab_size, shape, dtype: :long) }
    device == "cpu" ? pair : pair.map { |tensor| tensor.to(device) }
  end
end
|
|
347
|
+
|
|
348
|
+
def run_benchmark_phases(model, optimizer, get_batch, config)
|
|
497
349
|
[{ name: "burn-in", steps: 10 }, { name: "benchmark", steps: 20 }].each do |phase|
|
|
498
350
|
puts ""
|
|
499
351
|
puts "Phase: #{phase[:name]} (#{phase[:steps]} steps)"
|
|
@@ -514,18 +366,140 @@ class NanoGPTCLI
|
|
|
514
366
|
t1 = Time.now
|
|
515
367
|
dt = t1 - t0
|
|
516
368
|
|
|
517
|
-
|
|
518
|
-
mfu = model.estimate_mfu(bench_config[:batch_size] * phase[:steps], dt)
|
|
519
|
-
time_per_iter = dt / phase[:steps] * 1000
|
|
369
|
+
next unless phase[:name] == "benchmark"
|
|
520
370
|
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
371
|
+
mfu = model.estimate_mfu(config[:batch_size] * phase[:steps], dt)
|
|
372
|
+
time_per_iter = dt / phase[:steps] * 1000
|
|
373
|
+
|
|
374
|
+
puts ""
|
|
375
|
+
puts "=" * 60
|
|
376
|
+
puts "Results:"
|
|
377
|
+
puts " Time per iteration: #{format('%.2f', time_per_iter)}ms"
|
|
378
|
+
puts " MFU: #{format('%.2f', mfu * 100)}%"
|
|
379
|
+
puts "=" * 60
|
|
380
|
+
end
|
|
381
|
+
end
|
|
382
|
+
|
|
383
|
+
# Starts the web dashboard.
#
# Recognised argument: --port=N (default 4567, last occurrence wins). The
# port is validated before the web stack is loaded, so a typo such as
# --port=abc is reported immediately instead of silently becoming port 0.
#
# Wires a TrainingState, MetricsStore and SSENotifier into a TrainingWorker
# and the Server class, launches the server in a background thread and then
# runs the worker on the calling thread. Exits with status 1 on a bad port.
def web
  port = web_port_from_args
  if port.nil?
    puts "Error: --port must be an integer between 1 and 65535"
    exit 1
  end

  # Deliberately lazy: only the web command needs the web components.
  require "nano_gpt/web"

  training_state = NanoGPT::Web::TrainingState.new
  metrics_store = NanoGPT::Web::MetricsStore.new
  sse_notifier = NanoGPT::Web::SSENotifier.new

  worker = NanoGPT::Web::TrainingWorker.new(
    training_state: training_state,
    metrics_store: metrics_store,
    sse_notifier: sse_notifier
  )

  NanoGPT::Web::Server.training_state = training_state
  NanoGPT::Web::Server.metrics_store = metrics_store
  NanoGPT::Web::Server.sse_notifier = sse_notifier
  NanoGPT::Web::Server.training_worker = worker

  puts "Starting nanoGPT web dashboard on http://localhost:#{port}"
  puts "Device: #{NanoGPT::Device.auto}"

  # Web server runs in a background thread; the main thread is reserved
  # for Torch operations (processed via the training worker queue).
  # NOTE(review): the server binds to 0.0.0.0 although the banner says
  # localhost -- confirm that exposing it on all interfaces is intended.
  Thread.new { NanoGPT::Web::Server.run!(port: port, bind: "0.0.0.0") }
  sleep 1
  worker.run
end

# Extracts the dashboard port from @args.
#
# Returns 4567 when no --port= flag is given, the parsed port when the last
# --port= flag holds a base-10 integer in 1..65535, and nil otherwise.
def web_port_from_args
  flag = @args.select { |arg| arg.start_with?("--port=") }.last
  return 4567 unless flag

  port = Integer(flag.split("=", 2).last, 10, exception: false)
  port if port && port.between?(1, 65_535)
end
|
|
417
|
+
|
|
418
|
+
# Prints an environment report: Ruby version, torch-rb availability, device
# detection (CPU / MPS / CUDA), and a few smoke tests (tensor add on CPU,
# tensor add on the selected device when it is not "cpu", and a matmul on the
# selected device).
#
# The closing line reflects the outcome: "All checks passed. Ready to train!"
# is printed only when every smoke test succeeded; otherwise the same
# "Check complete (with errors)." line used for a missing torch-rb is shown.
def check
  puts "nanoGPT environment check"
  puts "=" * 40
  puts ""

  # Ruby version
  puts "Ruby: #{RUBY_VERSION} (#{RUBY_PLATFORM})"
  puts ""

  # Torch availability -- nothing else can be checked without it.
  print "torch-rb: "
  begin
    require "torch"
    puts "#{Torch::VERSION} OK"
  rescue LoadError => e
    puts "FAILED -- #{e.message}"
    puts " Install with: gem install torch-rb"
    puts ""
    puts "Check complete (with errors)."
    return
  end
  puts ""

  # Device detection (best effort: any error counts as "not available")
  puts "Devices:"
  puts " CPU: always available"

  mps_available = begin
    Torch::Backends::MPS.available?
  rescue
    false
  end
  puts " MPS: #{mps_available ? 'available' : 'not available'}"

  cuda_available = begin
    Torch::CUDA.available?
  rescue
    false
  end
  puts " CUDA: #{cuda_available ? 'available' : 'not available'}"

  device = NanoGPT::Device.auto
  puts ""
  puts " Selected device: #{device}"
  puts ""

  # Basic tensor operation
  cpu_ok = report_check("Tensor ops (CPU)") do
    a = Torch.tensor([1.0, 2.0, 3.0])
    b = Torch.tensor([4.0, 5.0, 6.0])
    c = a + b
    raise "unexpected result" unless c.to_a == [5.0, 7.0, 9.0]
  end

  # Test on selected device
  device_ok = true
  if device != "cpu"
    device_ok = report_check("Tensor ops (#{device})") do
      a = Torch.tensor([1.0, 2.0, 3.0], device: device)
      b = Torch.tensor([4.0, 5.0, 6.0], device: device)
      c = (a + b).cpu
      raise "unexpected result" unless c.to_a == [5.0, 7.0, 9.0]
    end
  end

  # Matmul test (more representative of model workload)
  matmul_ok = report_check("Matrix multiply (#{device})") do
    m = Torch.randn(64, 384, device: device)
    w = Torch.randn(384, 384, device: device)
    Torch.matmul(m, w)
  end

  puts ""
  if cpu_ok && device_ok && matmul_ok
    puts "All checks passed. Ready to train!"
  else
    puts "Check complete (with errors)."
  end
end

# Runs one smoke test for #check: prints "<label>: ", yields, then prints
# "OK" or "FAILED -- <message>". Returns true on success, false when the
# block raised a StandardError.
def report_check(label)
  print "#{label}: "
  yield
  puts "OK"
  true
rescue => e
  puts "FAILED -- #{e.message}"
  false
end
|
|
530
504
|
|
|
531
505
|
def version
|
|
@@ -543,6 +517,8 @@ class NanoGPTCLI
|
|
|
543
517
|
train Train a GPT model
|
|
544
518
|
sample Generate text from a trained model
|
|
545
519
|
bench Run performance benchmarks
|
|
520
|
+
web Start the web dashboard
|
|
521
|
+
check Verify environment (torch, CUDA/MPS, tensor ops)
|
|
546
522
|
version Show version
|
|
547
523
|
help Show this help message
|
|
548
524
|
|
|
@@ -551,6 +527,7 @@ class NanoGPTCLI
|
|
|
551
527
|
nanogpt train --dataset=shakespeare_char --device=mps
|
|
552
528
|
nanogpt sample --dataset=shakespeare_char --num_samples=3
|
|
553
529
|
nanogpt bench --batch_size=8 --block_size=512
|
|
530
|
+
nanogpt web --port=4567
|
|
554
531
|
|
|
555
532
|
For more information, visit: https://github.com/khasinski/nanogpt-rb
|
|
556
533
|
HELP
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "numo/narray"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
|
|
6
|
+
module NanoGPT
|
|
7
|
+
# Prepares custom text files for training with GPT-2 BPE tokenization
|
|
8
|
+
# Mirrors TextfilePreparer but uses tiktoken instead of character-level encoding.
|
|
9
|
+
# Does NOT write meta.json -- absence triggers GPT-2 auto-detection in Tokenizer.for_dataset.
|
|
10
|
+
class BPETextfilePreparer
  BUFFER_SIZE = 100_000
  # Number of leading bytes inspected when guessing the input encoding.
  ENCODING_SAMPLE_BYTES = 100_000

  attr_reader :input_path, :output_dir, :val_ratio

  # input_path  - path of the text file to tokenize
  # output_name - dataset directory name; derived from the file name when nil
  # val_ratio   - fraction of tokens (taken from the end) written to val.bin
  def initialize(input_path:, output_name: nil, val_ratio: 0.1)
    @input_path = input_path
    @val_ratio = val_ratio
    @output_name = output_name || derive_output_name(input_path)
    @output_dir = File.join(Dir.pwd, "data", @output_name)
  end

  # Tokenizes the input file and writes train.bin / val.bin (unsigned 16-bit
  # token ids) into <cwd>/data/<output_name>, printing progress along the way.
  #
  # Returns the dataset name.
  # Raises RuntimeError when the input file is missing or val_ratio is not
  # within 0..1.
  def prepare
    validate_input!
    FileUtils.mkdir_p(@output_dir)

    print_header
    encoding = detect_encoding
    tokens = tokenize_file(encoding)
    train_tokens, val_tokens = split_tokens(tokens)

    write_bin(File.join(@output_dir, "train.bin"), train_tokens, "train")
    write_bin(File.join(@output_dir, "val.bin"), val_tokens, "val")

    print_summary(train_tokens.size, val_tokens.size)
    @output_name
  end

  private

  # File name without extension, with every character outside
  # [a-zA-Z0-9_-] replaced by an underscore.
  def derive_output_name(path)
    File.basename(path, ".*").gsub(/[^a-zA-Z0-9_-]/, "_")
  end

  def validate_input!
    raise "File not found: #{@input_path}" unless File.exist?(@input_path)
    # A ratio outside 0..1 would make the train count negative (or the val
    # count larger than the data) and silently produce a nonsensical split.
    raise "val_ratio must be between 0 and 1 (got #{@val_ratio})" unless (0.0..1.0).cover?(@val_ratio)
  end

  def print_header
    file_size = File.size(@input_path)
    puts "Preparing text file (BPE): #{@input_path}"
    puts "File size: #{(file_size / 1_000_000.0).round(2)} MB"
    puts "Output directory: #{@output_dir}"
    puts "Validation ratio: #{@val_ratio}"
    puts ""
  end

  # Guesses the encoding from the first ENCODING_SAMPLE_BYTES bytes.
  #
  # Returns "UTF-8" when the sample is valid UTF-8, otherwise
  # "Windows-1252:UTF-8" (read as Windows-1252, transcode to UTF-8).
  def detect_encoding
    # binread with a length can return nil for an empty file.
    sample = File.binread(@input_path, ENCODING_SAMPLE_BYTES) || String.new
    encoding = utf8_sample?(sample) ? "UTF-8" : "Windows-1252:UTF-8"
    puts " Detected encoding: #{encoding.split(':').first}"
    encoding
  end

  # True when +sample+ is valid UTF-8. A sample that filled the whole read
  # window may have been cut in the middle of a multi-byte character, so in
  # that case up to three trailing bytes (the longest possible incomplete
  # sequence) are ignored before giving up.
  def utf8_sample?(sample)
    return true if sample.force_encoding("UTF-8").valid_encoding?
    return false if sample.bytesize < ENCODING_SAMPLE_BYTES

    (1..3).any? do |cut|
      sample.byteslice(0, sample.bytesize - cut).force_encoding("UTF-8").valid_encoding?
    end
  end

  # Reads the whole file with the detected encoding and encodes it with the
  # tiktoken :r50k_base encoding. Returns the token id array.
  def tokenize_file(encoding)
    puts "Phase 1: Tokenizing with GPT-2 BPE..."
    # Loaded lazily so the gem is only needed when BPE preparation runs.
    require "tiktoken_ruby"
    enc = Tiktoken.get_encoding(:r50k_base)

    text = File.read(@input_path, encoding: encoding)
    tokens = enc.encode(text)
    puts " #{tokens.size} tokens from #{text.length} characters"
    tokens
  end

  # Splits tokens into [train, val]; the last val_ratio share becomes val.
  def split_tokens(tokens)
    val_count = (tokens.size * @val_ratio).to_i
    train_count = tokens.size - val_count
    puts ""
    puts "Train: #{train_count} tokens"
    puts "Val: #{val_count} tokens"
    [tokens[0...train_count], tokens[train_count..]]
  end

  # Writes tokens to +path+ as a flat Numo::UInt16 binary dump.
  def write_bin(path, tokens, label)
    puts ""
    puts "Phase 2: Writing #{label}.bin..."
    arr = Numo::UInt16.cast(tokens)
    File.binwrite(path, arr.to_binary)
  end

  def print_summary(train_count, val_count)
    train_size_mb = File.size(File.join(@output_dir, "train.bin")) / 1_000_000.0
    val_size_mb = File.size(File.join(@output_dir, "val.bin")) / 1_000_000.0

    puts ""
    puts "Done!"
    puts " train.bin: #{train_count} tokens (#{train_size_mb.round(2)} MB)"
    puts " val.bin: #{val_count} tokens (#{val_size_mb.round(2)} MB)"
    puts " No meta.json (GPT-2 tokenizer auto-detected)"
    puts ""
    puts "To train:"
    puts " nanogpt train --dataset=#{@output_name}"
  end
end
|
|
105
|
+
end
|