nanogpt 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/exe/nanogpt CHANGED
@@ -6,7 +6,7 @@ $stdout.sync = true
6
6
  require "nano_gpt"
7
7
 
8
8
  class NanoGPTCLI
9
- COMMANDS = %w[prepare train sample bench version help].freeze
9
+ COMMANDS = %w[prepare train sample bench web check version help].freeze
10
10
 
11
11
  def initialize(args)
12
12
  @command = args.shift
@@ -23,6 +23,10 @@ class NanoGPTCLI
23
23
  sample
24
24
  when "bench"
25
25
  bench
26
+ when "web"
27
+ web
28
+ when "check"
29
+ check
26
30
  when "version", "-v", "--version"
27
31
  version
28
32
  when "help", "-h", "--help", nil
@@ -40,7 +44,6 @@ class NanoGPTCLI
40
44
  def prepare
41
45
  dataset = @args.first
42
46
 
43
- # Find available datasets
44
47
  data_dir = File.join(File.dirname(__FILE__), "..", "data")
45
48
  available = Dir.glob(File.join(data_dir, "*", "prepare.rb")).map do |path|
46
49
  File.basename(File.dirname(path))
@@ -77,7 +80,6 @@ class NanoGPTCLI
77
80
  exit 1
78
81
  end
79
82
 
80
- # Set output directory to current working directory
81
83
  output_dir = File.join(Dir.pwd, "data", dataset)
82
84
  ENV["NANOGPT_DATA_DIR"] = output_dir
83
85
 
@@ -87,10 +89,6 @@ class NanoGPTCLI
87
89
  end
88
90
 
89
91
  def prepare_textfile
90
- require "numo/narray"
91
- require "json"
92
- require "fileutils"
93
-
94
92
  input_path = nil
95
93
  output_name = nil
96
94
  val_ratio = 0.1
@@ -116,144 +114,15 @@ class NanoGPTCLI
116
114
  exit 1
117
115
  end
118
116
 
119
- unless File.exist?(input_path)
120
- puts "Error: File not found: #{input_path}"
121
- exit 1
122
- end
123
-
124
- output_name ||= File.basename(input_path, ".*").gsub(/[^a-zA-Z0-9_-]/, "_")
125
- output_dir = File.join(Dir.pwd, "data", output_name)
126
- FileUtils.mkdir_p(output_dir)
127
-
128
- file_size = File.size(input_path)
129
- puts "Preparing text file: #{input_path}"
130
- puts "File size: #{(file_size / 1_000_000.0).round(2)} MB"
131
- puts "Output directory: #{output_dir}"
132
- puts "Validation ratio: #{val_ratio}"
133
- puts ""
134
-
135
- # Phase 1: Build vocabulary by reading entire file
136
- # For very large files, we read line by line to avoid memory issues
137
- puts "Phase 1: Building vocabulary..."
138
- char_set = Set.new
139
- char_count = 0
140
-
141
- # Detect encoding: check if file is valid UTF-8, otherwise assume Windows-1252
142
- sample = File.binread(input_path, 100_000)
143
- encoding = sample.force_encoding("UTF-8").valid_encoding? ? "UTF-8" : "Windows-1252:UTF-8"
144
- puts " Detected encoding: #{encoding.split(':').first}"
145
-
146
- File.foreach(input_path, encoding: encoding) do |line|
147
- line.each_char { |c| char_set.add(c) }
148
- char_count += line.length
149
- print "\r Scanned #{char_count} characters, #{char_set.size} unique..." if (char_count % 100_000) < 1000
150
- end
151
- puts "\r Scanned #{char_count} characters, #{char_set.size} unique..."
152
-
153
- chars = char_set.to_a.sort
154
- vocab_size = chars.size
155
- puts "Vocabulary size: #{vocab_size}"
156
-
157
- stoi = chars.each_with_index.to_h
158
- itos = chars.each_with_index.map { |c, i| [i, c] }.to_h
159
-
160
- # Phase 2: Calculate split point
161
- total_chars = char_count
162
- val_chars = (total_chars * val_ratio).to_i
163
- train_chars = total_chars - val_chars
164
- puts ""
165
- puts "Train: #{train_chars} characters"
166
- puts "Val: #{val_chars} characters"
167
-
168
- # Phase 3: Encode and write train.bin (streaming line by line)
169
- puts ""
170
- puts "Phase 2: Encoding and writing train.bin..."
171
- train_path = File.join(output_dir, "train.bin")
172
- chars_written = 0
173
- buffer = []
174
- buffer_size = 100_000
175
-
176
- File.open(train_path, "wb") do |output|
177
- File.foreach(input_path, encoding: encoding) do |line|
178
- line.each_char do |c|
179
- break if chars_written >= train_chars
180
-
181
- buffer << stoi[c]
182
- chars_written += 1
183
-
184
- if buffer.size >= buffer_size
185
- arr = Numo::UInt16.cast(buffer)
186
- output.write(arr.to_binary)
187
- buffer.clear
188
- print "\r Written #{chars_written}/#{train_chars} characters..."
189
- end
190
- end
191
- break if chars_written >= train_chars
192
- end
193
-
194
- unless buffer.empty?
195
- arr = Numo::UInt16.cast(buffer)
196
- output.write(arr.to_binary)
197
- buffer.clear
198
- end
199
- end
200
- puts ""
201
-
202
- # Phase 4: Encode and write val.bin (streaming line by line)
203
- puts "Phase 3: Encoding and writing val.bin..."
204
- val_path = File.join(output_dir, "val.bin")
205
- chars_written = 0
206
- skipped = 0
207
- buffer = []
208
-
209
- File.open(val_path, "wb") do |output|
210
- File.foreach(input_path, encoding: encoding) do |line|
211
- line.each_char do |c|
212
- if skipped < train_chars
213
- skipped += 1
214
- next
215
- end
216
-
217
- buffer << stoi[c]
218
- chars_written += 1
219
-
220
- if buffer.size >= buffer_size
221
- arr = Numo::UInt16.cast(buffer)
222
- output.write(arr.to_binary)
223
- buffer.clear
224
- print "\r Written #{chars_written}/#{val_chars} characters..."
225
- end
226
- end
227
- end
228
-
229
- unless buffer.empty?
230
- arr = Numo::UInt16.cast(buffer)
231
- output.write(arr.to_binary)
232
- buffer.clear
233
- end
234
- end
235
- puts ""
236
-
237
- # Phase 5: Save meta.json
238
- puts "Phase 4: Saving meta.json..."
239
- meta = {
240
- "vocab_size" => vocab_size,
241
- "itos" => itos.transform_keys(&:to_s),
242
- "stoi" => stoi
243
- }
244
- File.write(File.join(output_dir, "meta.json"), JSON.pretty_generate(meta))
245
-
246
- train_size_mb = File.size(train_path) / 1_000_000.0
247
- val_size_mb = File.size(val_path) / 1_000_000.0
248
-
249
- puts ""
250
- puts "Done!"
251
- puts " train.bin: #{train_chars} tokens (#{train_size_mb.round(2)} MB)"
252
- puts " val.bin: #{val_chars} tokens (#{val_size_mb.round(2)} MB)"
253
- puts " meta.json: vocab_size=#{vocab_size}"
254
- puts ""
255
- puts "To train:"
256
- puts " nanogpt train --dataset=#{output_name}"
117
+ preparer = NanoGPT::TextfilePreparer.new(
118
+ input_path: input_path,
119
+ output_name: output_name,
120
+ val_ratio: val_ratio
121
+ )
122
+ preparer.prepare
123
+ rescue RuntimeError => e
124
+ puts "Error: #{e.message}"
125
+ exit 1
257
126
  end
258
127
 
259
128
  def train
@@ -359,7 +228,7 @@ class NanoGPTCLI
359
228
  puts "Generating #{config[:num_samples]} samples..."
360
229
  puts "=" * 50
361
230
 
362
- config[:num_samples].times do |k|
231
+ config[:num_samples].times do |_k|
363
232
  y = model.generate(
364
233
  x,
365
234
  config[:max_new_tokens],
@@ -374,110 +243,58 @@ class NanoGPTCLI
374
243
  end
375
244
 
376
245
  def bench
377
- bench_config = {
378
- batch_size: 12,
379
- block_size: 1024,
380
- n_layer: 12,
381
- n_head: 12,
382
- n_embd: 768,
383
- dropout: 0.0,
384
- bias: false,
385
- real_data: true,
386
- dataset: "openwebtext",
387
- seed: 1337,
388
- device: "auto"
389
- }
390
-
391
- # Parse args
392
- @args.each do |arg|
393
- next unless arg.start_with?("--") && arg.include?("=")
394
-
395
- key, val = arg[2..].split("=", 2)
396
- key = key.to_sym
397
-
398
- next unless bench_config.key?(key)
399
-
400
- bench_config[key] = case bench_config[key]
401
- when Integer then val.to_i
402
- when Float then val.to_f
403
- when TrueClass, FalseClass then val.downcase == "true"
404
- else val
405
- end
406
- end
246
+ config = NanoGPT::BenchConfig.load(@args)
407
247
 
408
248
  puts "=" * 60
409
249
  puts "NanoGPT Benchmark"
410
250
  puts "=" * 60
411
251
  puts ""
412
252
  puts "Configuration:"
413
- puts " batch_size: #{bench_config[:batch_size]}"
414
- puts " block_size: #{bench_config[:block_size]}"
415
- puts " n_layer: #{bench_config[:n_layer]}"
416
- puts " n_head: #{bench_config[:n_head]}"
417
- puts " n_embd: #{bench_config[:n_embd]}"
418
- puts " real_data: #{bench_config[:real_data]}"
253
+ puts " batch_size: #{config[:batch_size]}"
254
+ puts " block_size: #{config[:block_size]}"
255
+ puts " n_layer: #{config[:n_layer]}"
256
+ puts " n_head: #{config[:n_head]}"
257
+ puts " n_embd: #{config[:n_embd]}"
258
+ puts " real_data: #{config[:real_data]}"
419
259
  puts ""
420
260
 
421
- if bench_config[:device] == "auto"
422
- bench_config[:device] = NanoGPT::Device.auto
261
+ if config[:device] == "auto"
262
+ config[:device] = NanoGPT::Device.auto
423
263
  end
424
- device = bench_config[:device]
264
+ device = config[:device]
425
265
  puts "Device: #{device}"
426
266
 
427
- Torch.manual_seed(bench_config[:seed])
267
+ Torch.manual_seed(config[:seed])
428
268
 
429
- if bench_config[:real_data]
430
- data_dir = File.join("data", bench_config[:dataset])
269
+ if config[:real_data]
270
+ data_dir = File.join("data", config[:dataset])
431
271
  train_bin = File.join(data_dir, "train.bin")
432
272
 
433
273
  unless File.exist?(train_bin)
434
274
  puts ""
435
275
  puts "Warning: #{train_bin} not found, using random data instead."
436
- puts "To use real data, run: bundle exec ruby data/#{bench_config[:dataset]}/prepare.rb"
276
+ puts "To use real data, run: bundle exec ruby data/#{config[:dataset]}/prepare.rb"
437
277
  puts ""
438
- bench_config[:real_data] = false
278
+ config[:real_data] = false
439
279
  end
440
280
  end
441
281
 
442
- if bench_config[:real_data]
443
- bytes = File.binread(File.join("data", bench_config[:dataset], "train.bin"))
444
- train_data = bytes.unpack("S<*")
445
- puts "Loaded #{train_data.size} tokens from #{bench_config[:dataset]}"
446
-
447
- get_batch = lambda do
448
- max_start = train_data.size - bench_config[:block_size] - 1
449
- indices = Array.new(bench_config[:batch_size]) { rand(0..max_start) }
450
- x_arrays = indices.map { |i| train_data[i, bench_config[:block_size]] }
451
- y_arrays = indices.map { |i| train_data[i + 1, bench_config[:block_size]] }
452
- x = Torch.tensor(x_arrays, dtype: :long)
453
- y = Torch.tensor(y_arrays, dtype: :long)
454
- x = x.to(device) if device != "cpu"
455
- y = y.to(device) if device != "cpu"
456
- [x, y]
457
- end
458
- else
459
- vocab_size = 50304
460
- puts "Using random data (vocab_size=#{vocab_size})"
461
-
462
- get_batch = lambda do
463
- x = Torch.randint(vocab_size, [bench_config[:batch_size], bench_config[:block_size]], dtype: :long)
464
- y = Torch.randint(vocab_size, [bench_config[:batch_size], bench_config[:block_size]], dtype: :long)
465
- x = x.to(device) if device != "cpu"
466
- y = y.to(device) if device != "cpu"
467
- [x, y]
468
- end
469
- end
282
+ get_batch = if config[:real_data]
283
+ create_real_data_batch_fn(config, device)
284
+ else
285
+ create_random_data_batch_fn(config, device)
286
+ end
470
287
 
471
288
  puts ""
472
289
  puts "Initializing model..."
473
290
  model_config = NanoGPT::GPTConfig.new(
474
- block_size: bench_config[:block_size],
291
+ block_size: config[:block_size],
475
292
  vocab_size: 50304,
476
- n_layer: bench_config[:n_layer],
477
- n_head: bench_config[:n_head],
478
- n_embd: bench_config[:n_embd],
479
- dropout: bench_config[:dropout],
480
- bias: bench_config[:bias]
293
+ n_layer: config[:n_layer],
294
+ n_head: config[:n_head],
295
+ n_embd: config[:n_embd],
296
+ dropout: config[:dropout],
297
+ bias: config[:bias]
481
298
  )
482
299
 
483
300
  model = NanoGPT::GPT.new(model_config)
@@ -494,6 +311,41 @@ class NanoGPTCLI
494
311
  puts "Starting benchmark..."
495
312
  puts "-" * 60
496
313
 
314
+ run_benchmark_phases(model, optimizer, get_batch, config)
315
+ end
316
+
317
# Builds a batch sampler backed by the dataset's pre-tokenized train.bin.
#
# Reads data/<config[:dataset]>/train.bin (relative to the current working
# directory), decodes it as little-endian unsigned 16-bit token ids and
# returns a lambda that, on every call, draws config[:batch_size] random
# windows of config[:block_size] tokens.
#
# @param config [#[]] benchmark settings; reads :dataset, :block_size and :batch_size
# @param device [String] target device; tensors are moved only when it is not "cpu"
# @return [Proc] zero-argument lambda returning [x, y], where each row of y is
#   the corresponding row of x shifted forward by one token
# @raise [RuntimeError] if the dataset holds fewer than block_size + 1 tokens
def create_real_data_batch_fn(config, device)
  train_data = File.binread(File.join("data", config[:dataset], "train.bin")).unpack("S<*")
  puts "Loaded #{train_data.size} tokens from #{config[:dataset]}"

  block_size = config[:block_size]
  batch_size = config[:batch_size]

  # Highest start index that still leaves room for the shifted target window.
  # Computed once: it does not change between batches.
  max_start = train_data.size - block_size - 1
  if max_start.negative?
    # Without this guard rand(0..max_start) returns nil and the first batch
    # fails with an unhelpful TypeError.
    raise "Dataset #{config[:dataset]} is too small: #{train_data.size} tokens, " \
          "need at least #{block_size + 1} for block_size=#{block_size}"
  end

  lambda do
    starts = Array.new(batch_size) { rand(0..max_start) }
    x = Torch.tensor(starts.map { |i| train_data[i, block_size] }, dtype: :long)
    y = Torch.tensor(starts.map { |i| train_data[i + 1, block_size] }, dtype: :long)
    x = x.to(device) if device != "cpu"
    y = y.to(device) if device != "cpu"
    [x, y]
  end
end
334
+
335
# Builds a batch sampler that produces uniformly random token ids.
#
# @param config [#[]] benchmark settings; reads :batch_size and :block_size
#   each time a batch is drawn
# @param device [String] target device; tensors are moved only when it is not "cpu"
# @param vocab_size [Integer] exclusive upper bound passed to Torch.randint;
#   defaults to 50304, the value that used to be hard-coded here
# @return [Proc] zero-argument lambda returning [x, y], two independently
#   drawn tensors of shape [batch_size, block_size]
def create_random_data_batch_fn(config, device, vocab_size: 50_304)
  puts "Using random data (vocab_size=#{vocab_size})"

  # One random [batch_size, block_size] tensor, placed on the requested device.
  random_tokens = lambda do
    tokens = Torch.randint(vocab_size, [config[:batch_size], config[:block_size]], dtype: :long)
    device == "cpu" ? tokens : tokens.to(device)
  end

  -> { [random_tokens.call, random_tokens.call] }
end
347
+
348
+ def run_benchmark_phases(model, optimizer, get_batch, config)
497
349
  [{ name: "burn-in", steps: 10 }, { name: "benchmark", steps: 20 }].each do |phase|
498
350
  puts ""
499
351
  puts "Phase: #{phase[:name]} (#{phase[:steps]} steps)"
@@ -514,18 +366,140 @@ class NanoGPTCLI
514
366
  t1 = Time.now
515
367
  dt = t1 - t0
516
368
 
517
- if phase[:name] == "benchmark"
518
- mfu = model.estimate_mfu(bench_config[:batch_size] * phase[:steps], dt)
519
- time_per_iter = dt / phase[:steps] * 1000
369
+ next unless phase[:name] == "benchmark"
520
370
 
521
- puts ""
522
- puts "=" * 60
523
- puts "Results:"
524
- puts " Time per iteration: #{format('%.2f', time_per_iter)}ms"
525
- puts " MFU: #{format('%.2f', mfu * 100)}%"
526
- puts "=" * 60
371
+ mfu = model.estimate_mfu(config[:batch_size] * phase[:steps], dt)
372
+ time_per_iter = dt / phase[:steps] * 1000
373
+
374
+ puts ""
375
+ puts "=" * 60
376
+ puts "Results:"
377
+ puts " Time per iteration: #{format('%.2f', time_per_iter)}ms"
378
+ puts " MFU: #{format('%.2f', mfu * 100)}%"
379
+ puts "=" * 60
380
+ end
381
+ end
382
+
383
# Launches the nanoGPT web dashboard.
#
# Recognizes an optional --port=N argument (default 4567); when it is given
# several times the last one wins. Wires a shared TrainingState, MetricsStore
# and SSENotifier into a TrainingWorker and into NanoGPT::Web::Server, starts
# the server in a background thread and then runs the worker on the calling
# thread.
#
# Prints an error and exits with status 1 when the port is not an integer in
# 1..65535, instead of silently turning malformed input into port 0.
def web
  port = 4567
  @args.each do |arg|
    next unless arg.start_with?("--port=")

    raw_port = arg.split("=", 2).last
    port = Integer(raw_port, 10, exception: false)
    next if port && (1..65_535).cover?(port)

    puts "Error: invalid port: #{raw_port}"
    exit 1
  end

  # Required here rather than at file level, so it is only loaded when this
  # command actually runs (and only after the arguments have been validated).
  require "nano_gpt/web"

  training_state = NanoGPT::Web::TrainingState.new
  metrics_store = NanoGPT::Web::MetricsStore.new
  sse_notifier = NanoGPT::Web::SSENotifier.new

  worker = NanoGPT::Web::TrainingWorker.new(
    training_state: training_state,
    metrics_store: metrics_store,
    sse_notifier: sse_notifier
  )

  NanoGPT::Web::Server.training_state = training_state
  NanoGPT::Web::Server.metrics_store = metrics_store
  NanoGPT::Web::Server.sse_notifier = sse_notifier
  NanoGPT::Web::Server.training_worker = worker

  puts "Starting nanoGPT web dashboard on http://localhost:#{port}"
  puts "Device: #{NanoGPT::Device.auto}"

  # Web server runs in a background thread; the main thread is reserved
  # for Torch operations (processed via the training worker queue).
  # NOTE(review): the server binds to 0.0.0.0 (all interfaces) even though the
  # banner above says localhost -- confirm this exposure is intended.
  Thread.new { NanoGPT::Web::Server.run!(port: port, bind: "0.0.0.0") }
  sleep 1 # crude wait for the server thread to come up before the worker starts
  worker.run
end
417
+
418
# Prints an environment report: Ruby version, torch-rb availability, detected
# devices, and a few smoke tests (element-wise add on CPU, the same on the
# selected device when it is not "cpu", and a matrix multiply on the selected
# device).
#
# Returns early with "Check complete (with errors)." when torch cannot be
# required. Each smoke test prints OK or FAILED; the closing line only claims
# success when none of them failed.
def check
  puts "nanoGPT environment check"
  puts "=" * 40
  puts ""

  puts "Ruby: #{RUBY_VERSION} (#{RUBY_PLATFORM})"
  puts ""

  print "torch-rb: "
  begin
    require "torch"
    puts "#{Torch::VERSION} OK"
  rescue LoadError => e
    puts "FAILED -- #{e.message}"
    puts " Install with: gem install torch-rb"
    puts ""
    puts "Check complete (with errors)."
    return
  end
  puts ""

  puts "Devices:"
  puts " CPU: always available"

  # Any error while probing a backend is treated as "not available".
  mps_available = begin
    Torch::Backends::MPS.available?
  rescue
    false
  end
  puts " MPS: #{mps_available ? 'available' : 'not available'}"

  cuda_available = begin
    Torch::CUDA.available?
  rescue
    false
  end
  puts " CUDA: #{cuda_available ? 'available' : 'not available'}"

  device = NanoGPT::Device.auto
  puts ""
  puts " Selected device: #{device}"
  puts ""

  failures = 0
  # Runs one smoke test: prints the label, then OK, or FAILED plus the error
  # message. Failures are counted so the final summary can be truthful.
  run_check = lambda do |label, &probe|
    print "#{label}: "
    probe.call
    puts "OK"
  rescue StandardError => e
    failures += 1
    puts "FAILED -- #{e.message}"
  end

  run_check.call("Tensor ops (CPU)") do
    a = Torch.tensor([1.0, 2.0, 3.0])
    b = Torch.tensor([4.0, 5.0, 6.0])
    raise "unexpected result" unless (a + b).to_a == [5.0, 7.0, 9.0]
  end

  if device != "cpu"
    run_check.call("Tensor ops (#{device})") do
      a = Torch.tensor([1.0, 2.0, 3.0], device: device)
      b = Torch.tensor([4.0, 5.0, 6.0], device: device)
      raise "unexpected result" unless (a + b).cpu.to_a == [5.0, 7.0, 9.0]
    end
  end

  # Matmul test (more representative of model workload)
  run_check.call("Matrix multiply (#{device})") do
    m = Torch.randn(64, 384, device: device)
    w = Torch.randn(384, 384, device: device)
    Torch.matmul(m, w)
  end

  puts ""
  if failures.zero?
    puts "All checks passed. Ready to train!"
  else
    puts "Check complete (with errors)."
  end
end
530
504
 
531
505
  def version
@@ -543,6 +517,8 @@ class NanoGPTCLI
543
517
  train Train a GPT model
544
518
  sample Generate text from a trained model
545
519
  bench Run performance benchmarks
520
+ web Start the web dashboard
521
+ check Verify environment (torch, CUDA/MPS, tensor ops)
546
522
  version Show version
547
523
  help Show this help message
548
524
 
@@ -551,6 +527,7 @@ class NanoGPTCLI
551
527
  nanogpt train --dataset=shakespeare_char --device=mps
552
528
  nanogpt sample --dataset=shakespeare_char --num_samples=3
553
529
  nanogpt bench --batch_size=8 --block_size=512
530
+ nanogpt web --port=4567
554
531
 
555
532
  For more information, visit: https://github.com/khasinski/nanogpt-rb
556
533
  HELP
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "numo/narray"
4
+ require "fileutils"
5
+
6
module NanoGPT
  # Prepares custom text files for training with GPT-2 BPE tokenization.
  # Mirrors TextfilePreparer but uses tiktoken instead of character-level encoding.
  # Does NOT write meta.json -- absence triggers GPT-2 auto-detection in Tokenizer.for_dataset.
  #
  # Output goes to <current working directory>/data/<output_name>/ as
  # train.bin and val.bin (unsigned 16-bit token ids written via Numo::UInt16).
  class BPETextfilePreparer
    # NOTE(review): not referenced anywhere in this class; kept because it is a public constant.
    BUFFER_SIZE = 100_000

    # Number of leading bytes inspected when guessing the input encoding.
    ENCODING_SAMPLE_BYTES = 100_000

    attr_reader :input_path, :output_dir, :val_ratio

    # @param input_path [String] path of the text file to tokenize
    # @param output_name [String, nil] dataset directory name; derived from the
    #   input file name when nil
    # @param val_ratio [Numeric] fraction of tokens (taken from the end) written to val.bin
    def initialize(input_path:, output_name: nil, val_ratio: 0.1)
      @input_path = input_path
      @val_ratio = val_ratio
      @output_name = output_name || derive_output_name(input_path)
      @output_dir = File.join(Dir.pwd, "data", @output_name)
    end

    # Tokenizes the input file and writes train.bin / val.bin.
    #
    # @return [String] the dataset name (directory under data/)
    # @raise [RuntimeError] if the input is not an existing regular file or
    #   val_ratio lies outside 0..1
    def prepare
      validate_input!
      FileUtils.mkdir_p(@output_dir)

      print_header
      encoding = detect_encoding
      tokens = tokenize_file(encoding)
      train_tokens, val_tokens = split_tokens(tokens)

      write_bin(File.join(@output_dir, "train.bin"), train_tokens, "train")
      write_bin(File.join(@output_dir, "val.bin"), val_tokens, "val")

      print_summary(train_tokens.size, val_tokens.size)
      @output_name
    end

    private

    # File name without its extension, with every character outside
    # [a-zA-Z0-9_-] replaced by an underscore.
    def derive_output_name(path)
      File.basename(path, ".*").gsub(/[^a-zA-Z0-9_-]/, "_")
    end

    # Raises RuntimeError (the class the CLI rescues) on unusable input.
    def validate_input!
      # File.file? also rejects directories, which File.exist? would accept.
      raise "File not found: #{@input_path}" unless File.file?(@input_path)

      # Ratios outside 0..1 would produce overlapping or nil train/val slices.
      return if (0.0..1.0).cover?(@val_ratio)

      raise "Validation ratio must be between 0 and 1: #{@val_ratio}"
    end

    def print_header
      file_size = File.size(@input_path)
      puts "Preparing text file (BPE): #{@input_path}"
      puts "File size: #{(file_size / 1_000_000.0).round(2)} MB"
      puts "Output directory: #{@output_dir}"
      puts "Validation ratio: #{@val_ratio}"
      puts ""
    end

    # Guesses the encoding from the first ENCODING_SAMPLE_BYTES bytes: "UTF-8"
    # when the sample is valid UTF-8, otherwise Windows-1252 transcoded to UTF-8.
    #
    # @return [String] an encoding spec usable as File.read's encoding: option
    def detect_encoding
      # binread with a length returns nil for an empty file.
      sample = File.binread(@input_path, ENCODING_SAMPLE_BYTES) || String.new
      truncated = File.size(@input_path) > sample.bytesize
      encoding = utf8_sample?(sample, truncated: truncated) ? "UTF-8" : "Windows-1252:UTF-8"
      puts " Detected encoding: #{encoding.split(':').first}"
      encoding
    end

    # True when +bytes+ is valid UTF-8. If the sample was cut short of the whole
    # file, an incomplete trailing multi-byte character (at most 3 missing-tail
    # bytes) is tolerated, because the cut may have landed inside a character.
    def utf8_sample?(bytes, truncated:)
      text = bytes.dup.force_encoding(Encoding::UTF_8)
      return true if text.valid_encoding?
      return false unless truncated

      (1..3).any? do |cut|
        cut < text.bytesize && text.byteslice(0, text.bytesize - cut).valid_encoding?
      end
    end

    # Reads the whole file into memory and encodes it with tiktoken's r50k_base.
    #
    # @return [Array<Integer>] token ids
    def tokenize_file(encoding)
      puts "Phase 1: Tokenizing with GPT-2 BPE..."
      # Required inside the method, so tiktoken is only loaded when tokenizing.
      require "tiktoken_ruby"
      enc = Tiktoken.get_encoding(:r50k_base)

      text = File.read(@input_path, encoding: encoding)
      tokens = enc.encode(text)
      puts " #{tokens.size} tokens from #{text.length} characters"
      tokens
    end

    # Splits tokens into [train, val]; the last (size * val_ratio).to_i tokens
    # become the validation set.
    def split_tokens(tokens)
      val_count = (tokens.size * @val_ratio).to_i
      train_count = tokens.size - val_count
      puts ""
      puts "Train: #{train_count} tokens"
      puts "Val: #{val_count} tokens"
      [tokens[0...train_count], tokens[train_count..]]
    end

    # Writes the tokens to +path+ as the binary dump of a Numo::UInt16 array.
    def write_bin(path, tokens, label)
      puts ""
      puts "Phase 2: Writing #{label}.bin..."
      arr = Numo::UInt16.cast(tokens)
      File.binwrite(path, arr.to_binary)
    end

    def print_summary(train_count, val_count)
      train_size_mb = File.size(File.join(@output_dir, "train.bin")) / 1_000_000.0
      val_size_mb = File.size(File.join(@output_dir, "val.bin")) / 1_000_000.0

      puts ""
      puts "Done!"
      puts " train.bin: #{train_count} tokens (#{train_size_mb.round(2)} MB)"
      puts " val.bin: #{val_count} tokens (#{val_size_mb.round(2)} MB)"
      puts " No meta.json (GPT-2 tokenizer auto-detected)"
      puts ""
      puts "To train:"
      puts " nanogpt train --dataset=#{@output_name}"
    end
  end
end