RubyGems - nanogpt - Versions diffs - 0.2.0 → 0.3.0 - Mend

nanogpt 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

checksums.yaml +4 -4
data/Gemfile.lock +30 -1
data/docs/ARCHITECTURE.md +429 -0
data/exe/nanogpt +210 -233
data/lib/nano_gpt/bpe_textfile_preparer.rb +105 -0
data/lib/nano_gpt/data_loader.rb +5 -20
data/lib/nano_gpt/layers/block.rb +6 -1
data/lib/nano_gpt/layers/causal_self_attention.rb +11 -1
data/lib/nano_gpt/model.rb +1 -7
data/lib/nano_gpt/textfile_preparer.rb +189 -0
data/lib/nano_gpt/train_config.rb +80 -146
data/lib/nano_gpt/trainer.rb +21 -48
data/lib/nano_gpt/version.rb +1 -1
data/lib/nano_gpt/web/metrics_store.rb +136 -0
data/lib/nano_gpt/web/server.rb +294 -0
data/lib/nano_gpt/web/sse_notifier.rb +37 -0
data/lib/nano_gpt/web/training_state.rb +56 -0
data/lib/nano_gpt/web/training_worker.rb +153 -0
data/lib/nano_gpt/web/views/layout.erb +78 -0
data/lib/nano_gpt/web/views/run_detail.erb +432 -0
data/lib/nano_gpt/web/views/runs.erb +434 -0
data/lib/nano_gpt/web/web_trainer.rb +210 -0
data/lib/nano_gpt/web.rb +9 -0
data/lib/nano_gpt.rb +1 -0
data/nanogpt.gemspec +4 -0
metadata +71 -2

data/lib/nano_gpt/data_loader.rb CHANGED Viewed

@@ -4,9 +4,10 @@ module NanoGPT
   # Loads batches from binary token files
   # Memory-efficient: reads from file each batch (like Python's memmap recreation)
   class DataLoader
-    attr_reader :block_size, :batch_size
+    # Each token is stored as uint16 (2 bytes) in little-endian format
+    BYTES_PER_TOKEN = 2
-    BYTES_PER_TOKEN = 2  # uint16
+    attr_reader :block_size, :batch_size, :train_size, :val_size
     def initialize(data_dir:, block_size:, batch_size:, device: "cpu")
       @data_dir = data_dir
@@ -14,7 +15,6 @@ module NanoGPT
       @batch_size = batch_size
       @device = device
-      # Store file paths and sizes (NOT the data itself)
       @train_path = File.join(data_dir, "train.bin")
       @val_path = File.join(data_dir, "val.bin")
@@ -22,47 +22,32 @@ module NanoGPT
       @val_size = File.size(@val_path) / BYTES_PER_TOKEN
     end
-    def train_size
-      @train_size
-    end
-    def val_size
-      @val_size
-    end
     # Get a batch of data
-    # Memory-efficient: recreates data view per batch to avoid memory leak
-    # (matches Python's memmap recreation pattern)
+    # Memory-efficient: reads only the bytes needed for each batch to avoid a memory leak
     def get_batch(split)
       path = split == :train ? @train_path : @val_path
       data_size = split == :train ? @train_size : @val_size
-      # Random starting indices
       max_start = data_size - @block_size - 1
       indices = Array.new(@batch_size) { rand(0..max_start) }
-      # Read only the bytes we need from file (memory-efficient)
-      # This mimics Python's memmap recreation per batch
       x_arrays = []
       y_arrays = []
       File.open(path, "rb") do |f|
         indices.each do |i|
-          # Read x: tokens[i:i+block_size]
           f.seek(i * BYTES_PER_TOKEN)
           x_bytes = f.read((@block_size + 1) * BYTES_PER_TOKEN)
-          tokens = x_bytes.unpack("S<*")  # uint16 little-endian
+          tokens = x_bytes.unpack("S<*") # uint16 little-endian
           x_arrays << tokens[0...@block_size]
           y_arrays << tokens[1..@block_size]
         end
       end
-      # Create tensors directly from arrays (avoiding Numo intermediate)
       x = Torch.tensor(x_arrays, dtype: :long)
       y = Torch.tensor(y_arrays, dtype: :long)
-      # Move to device (CPU, CUDA, or MPS)
       if @device != "cpu"
         x = x.to(@device)
         y = y.to(@device)

data/lib/nano_gpt/layers/block.rb CHANGED Viewed

@@ -4,6 +4,8 @@ module NanoGPT
   module Layers
     # Transformer block: LayerNorm -> Attention -> LayerNorm -> MLP
     class Block < Torch::NN::Module
+      attr_reader :attn
       def initialize(config)
         super()
         @ln_1 = LayerNorm.new(config.n_embd, bias: config.bias)
@@ -16,10 +18,13 @@ module NanoGPT
         x = x + @attn.call(@ln_1.call(x))
         x = x + @mlp.call(@ln_2.call(x))
         # Trigger GC to free intermediate tensors (critical for torch.rb memory management)
-        # Ruby's GC doesn't run frequently enough during forward pass, causing memory accumulation
         GC.start(full_mark: false, immediate_sweep: true)
         x
       end
+      def crop_mask(block_size)
+        @attn.crop_mask(block_size)
+      end
     end
   end
 end

data/lib/nano_gpt/layers/causal_self_attention.rb CHANGED Viewed

@@ -27,7 +27,10 @@ module NanoGPT
         # Causal mask for manual attention (only used when @flash is false)
         unless @flash
-          mask = Torch.tril(Torch.ones(config.block_size, config.block_size))
+          # Torch.tril segfaults with torch-rb 0.22.2 + PyTorch 2.10, so build the equivalent lower-triangular mask by comparing row and column indices
+          rows = Torch.arange(config.block_size).unsqueeze(1)
+          cols = Torch.arange(config.block_size).unsqueeze(0)
+          mask = rows.ge(cols).to(dtype: :float32)
           register_buffer("mask", mask.view(1, 1, config.block_size, config.block_size))
         end
       end
@@ -68,6 +71,13 @@ module NanoGPT
         # Output projection
         @resid_dropout.call(@c_proj.call(y))
       end
+      # Crop the causal mask to a smaller block size (for inference optimization)
+      def crop_mask(block_size)
+        return unless defined?(@mask) && @mask
+        @mask = @mask[nil, nil, 0...block_size, 0...block_size]
+      end
     end
   end
 end

data/lib/nano_gpt/model.rb CHANGED Viewed

@@ -159,13 +159,7 @@ module NanoGPT
       @wpe = new_wpe
       # Update attention masks in all blocks
-      @h.each do |block|
-        attn = block.instance_variable_get(:@attn)
-        next unless attn.instance_variable_defined?(:@mask)
-        mask = attn.instance_variable_get(:@mask)
-        attn.instance_variable_set(:@mask, mask[nil, nil, 0...block_size, 0...block_size])
-      end
+      @h.each { |block| block.crop_mask(block_size) }
     end
     def configure_optimizers(weight_decay:, learning_rate:, betas:, device_type:)

data/lib/nano_gpt/textfile_preparer.rb ADDED Viewed

@@ -0,0 +1,189 @@
+# frozen_string_literal: true
+require "numo/narray"
+require "json"
+require "fileutils"
+require "set"
+module NanoGPT
+  # Prepares a custom text file for training with character-level tokenization,
+  # writing train.bin, val.bin, and meta.json into the output directory.
+  # Handles large files by streaming the input line by line.
+  class TextfilePreparer
+    BUFFER_SIZE = 100_000
+    attr_reader :input_path, :output_dir, :val_ratio
+    def initialize(input_path:, output_name: nil, val_ratio: 0.1)
+      @input_path = input_path
+      @val_ratio = val_ratio
+      @output_name = output_name || derive_output_name(input_path)
+      @output_dir = File.join(Dir.pwd, "data", @output_name)
+    end
+    def prepare
+      validate_input!
+      FileUtils.mkdir_p(@output_dir)
+      print_header
+      encoding = detect_encoding
+      vocab, char_count = build_vocabulary(encoding)
+      stoi, itos = build_mappings(vocab)
+      train_chars, val_chars = calculate_split(char_count)
+      write_train_bin(encoding, stoi, train_chars)
+      write_val_bin(encoding, stoi, train_chars, val_chars)
+      write_meta_json(vocab.size, stoi, itos)
+      print_summary(train_chars, val_chars, vocab.size)
+      @output_name
+    end
+    private
+    def derive_output_name(path)
+      File.basename(path, ".*").gsub(/[^a-zA-Z0-9_-]/, "_")
+    end
+    def validate_input!
+      raise "File not found: #{@input_path}" unless File.exist?(@input_path)
+    end
+    def print_header
+      file_size = File.size(@input_path)
+      puts "Preparing text file: #{@input_path}"
+      puts "File size: #{(file_size / 1_000_000.0).round(2)} MB"
+      puts "Output directory: #{@output_dir}"
+      puts "Validation ratio: #{@val_ratio}"
+      puts ""
+    end
+    def detect_encoding
+      sample = File.binread(@input_path, 100_000)
+      encoding = sample.force_encoding("UTF-8").valid_encoding? ? "UTF-8" : "Windows-1252:UTF-8"
+      puts "  Detected encoding: #{encoding.split(':').first}"
+      encoding
+    end
+    def build_vocabulary(encoding)
+      puts "Phase 1: Building vocabulary..."
+      char_set = Set.new
+      char_count = 0
+      File.foreach(@input_path, encoding: encoding) do |line|
+        line.each_char { |c| char_set.add(c) }
+        char_count += line.length
+        print "\r  Scanned #{char_count} characters, #{char_set.size} unique..." if (char_count % 100_000) < 1000
+      end
+      puts "\r  Scanned #{char_count} characters, #{char_set.size} unique..."
+      puts "Vocabulary size: #{char_set.size}"
+      [char_set.to_a.sort, char_count]
+    end
+    def build_mappings(vocab)
+      stoi = vocab.each_with_index.to_h
+      itos = vocab.each_with_index.map { |c, i| [i, c] }.to_h
+      [stoi, itos]
+    end
+    def calculate_split(total_chars)
+      val_chars = (total_chars * @val_ratio).to_i
+      train_chars = total_chars - val_chars
+      puts ""
+      puts "Train: #{train_chars} characters"
+      puts "Val: #{val_chars} characters"
+      [train_chars, val_chars]
+    end
+    def write_train_bin(encoding, stoi, train_chars)
+      puts ""
+      puts "Phase 2: Encoding and writing train.bin..."
+      train_path = File.join(@output_dir, "train.bin")
+      write_tokens_to_bin(train_path, train_chars) do |on_char|
+        chars_written = 0
+        File.foreach(@input_path, encoding: encoding) do |line|
+          line.each_char do |c|
+            break if chars_written >= train_chars
+            on_char.call(stoi[c])
+            chars_written += 1
+          end
+          break if chars_written >= train_chars
+        end
+      end
+    end
+    def write_val_bin(encoding, stoi, train_chars, val_chars)
+      puts "Phase 3: Encoding and writing val.bin..."
+      val_path = File.join(@output_dir, "val.bin")
+      write_tokens_to_bin(val_path, val_chars) do |on_char|
+        skipped = 0
+        chars_written = 0
+        File.foreach(@input_path, encoding: encoding) do |line|
+          line.each_char do |c|
+            if skipped < train_chars
+              skipped += 1
+              next
+            end
+            on_char.call(stoi[c])
+            chars_written += 1
+          end
+        end
+      end
+    end
+    # Shared helper for writing tokens to binary files with buffering
+    def write_tokens_to_bin(path, total_chars)
+      buffer = []
+      chars_written = 0
+      File.open(path, "wb") do |output|
+        on_char = lambda do |token|
+          buffer << token
+          chars_written += 1
+          if buffer.size >= BUFFER_SIZE
+            flush_buffer(output, buffer)
+            print "\r  Written #{chars_written}/#{total_chars} characters..."
+          end
+        end
+        yield on_char
+        flush_buffer(output, buffer) unless buffer.empty?
+      end
+      puts ""
+    end
+    def flush_buffer(output, buffer)
+      arr = Numo::UInt16.cast(buffer)
+      output.write(arr.to_binary)
+      buffer.clear
+    end
+    def write_meta_json(vocab_size, stoi, itos)
+      puts "Phase 4: Saving meta.json..."
+      meta = {
+        "vocab_size" => vocab_size,
+        "itos" => itos.transform_keys(&:to_s),
+        "stoi" => stoi
+      }
+      File.write(File.join(@output_dir, "meta.json"), JSON.pretty_generate(meta))
+    end
+    def print_summary(train_chars, val_chars, vocab_size)
+      train_size_mb = File.size(File.join(@output_dir, "train.bin")) / 1_000_000.0
+      val_size_mb = File.size(File.join(@output_dir, "val.bin")) / 1_000_000.0
+      puts ""
+      puts "Done!"
+      puts "  train.bin: #{train_chars} tokens (#{train_size_mb.round(2)} MB)"
+      puts "  val.bin: #{val_chars} tokens (#{val_size_mb.round(2)} MB)"
+      puts "  meta.json: vocab_size=#{vocab_size}"
+      puts ""
+      puts "To train:"
+      puts "  nanogpt train --dataset=#{@output_name}"
+    end
+  end
+end

data/lib/nano_gpt/train_config.rb CHANGED Viewed

@@ -3,67 +3,22 @@
 require "json"
 module NanoGPT
-  # Configuration system for training and sampling
+  # Base configuration class with shared functionality
   # Supports JSON config files with command-line overrides
   #
   # Priority (highest to lowest):
   #   1. Command-line arguments (--key=value)
   #   2. JSON config file (--config=path.json)
   #   3. Default values
-  #
-  # Usage:
-  #   config = TrainConfig.load(ARGV)
-  #   config[:learning_rate]  # => 0.001
-  #
-  class TrainConfig
-    # Defaults match bin/train exactly
-    DEFAULTS = {
-      # I/O
-      out_dir: "out-shakespeare-char",
-      eval_interval: 250,
-      log_interval: 10,
-      eval_iters: 200,
-      eval_only: false,
-      always_save_checkpoint: false,
-      init_from: "scratch",  # 'scratch' or 'resume'
-      # Data
-      dataset: "shakespeare_char",
-      batch_size: 64,
-      block_size: 256,
-      gradient_accumulation_steps: 1,
-      # Model
-      n_layer: 6,
-      n_head: 6,
-      n_embd: 384,
-      dropout: 0.2,
-      bias: false,
-      # Optimizer
-      learning_rate: 1e-3,
-      weight_decay: 1e-1,
-      beta1: 0.9,
-      beta2: 0.99,
-      grad_clip: 1.0,
-      # LR scheduler
-      decay_lr: true,
-      warmup_iters: 100,
-      lr_decay_iters: 5000,
-      min_lr: 1e-4,
-      # Training
-      max_iters: 5000,
-      # System
-      device: "auto"
-    }.freeze
+  class BaseConfig
+    def self.defaults
+      raise NotImplementedError, "Subclasses must define DEFAULTS"
+    end
     attr_reader :values
     def initialize(values = {})
-      @values = DEFAULTS.merge(values)
+      @values = self.class.defaults.merge(values)
     end
     def [](key)
@@ -79,25 +34,17 @@ module NanoGPT
     end
     # Load config from command-line args
-    # Supports:
-    #   --config=path/to/config.json  (load JSON file)
-    #   --key=value                   (override specific values)
     def self.load(args)
       config = new
       # First pass: find and load JSON config file
-      config_file = nil
       args.each do |arg|
         if arg.start_with?("--config=")
-          config_file = arg.split("=", 2).last
+          config.load_json(arg.split("=", 2).last)
           break
         end
       end
-      if config_file
-        config.load_json(config_file)
-      end
       # Second pass: apply command-line overrides
       args.each do |arg|
         next unless arg.start_with?("--") && arg.include?("=")
@@ -120,9 +67,7 @@ module NanoGPT
     # Load values from JSON file
     def load_json(path)
-      unless File.exist?(path)
-        raise "Config file not found: #{path}"
-      end
+      raise "Config file not found: #{path}" unless File.exist?(path)
       json = JSON.parse(File.read(path))
       puts "Loaded config from #{path}"
@@ -145,8 +90,6 @@ module NanoGPT
       puts "Saved config to #{path}"
     end
-    private
     def self.parse_value(val, existing)
       case existing
       when Integer then val.to_i
@@ -155,11 +98,61 @@ module NanoGPT
       else val
       end
     end
+    private_class_method :parse_value
+  end
+  # Configuration for training
+  class TrainConfig < BaseConfig
+    DEFAULTS = {
+      # I/O
+      out_dir: "out-shakespeare-char",
+      eval_interval: 250,
+      log_interval: 10,
+      eval_iters: 200,
+      eval_only: false,
+      always_save_checkpoint: false,
+      init_from: "scratch",
+      # Data
+      dataset: "shakespeare_char",
+      batch_size: 64,
+      block_size: 256,
+      gradient_accumulation_steps: 1,
+      # Model
+      n_layer: 6,
+      n_head: 6,
+      n_embd: 384,
+      dropout: 0.2,
+      bias: false,
+      # Optimizer
+      learning_rate: 1e-3,
+      weight_decay: 1e-1,
+      beta1: 0.9,
+      beta2: 0.99,
+      grad_clip: 1.0,
+      # LR scheduler
+      decay_lr: true,
+      warmup_iters: 100,
+      lr_decay_iters: 5000,
+      min_lr: 1e-4,
+      # Training
+      max_iters: 5000,
+      # System
+      device: "auto"
+    }.freeze
+    def self.defaults
+      DEFAULTS
+    end
   end
   # Configuration for sampling/generation
-  class SampleConfig
-    # Defaults match bin/sample exactly
+  class SampleConfig < BaseConfig
     DEFAULTS = {
       out_dir: "out-shakespeare-char",
       dataset: "shakespeare_char",
@@ -172,88 +165,29 @@ module NanoGPT
       device: "auto"
     }.freeze
-    attr_reader :values
-    def initialize(values = {})
-      @values = DEFAULTS.merge(values)
-    end
-    def [](key)
-      @values[key.to_sym]
-    end
-    def []=(key, value)
-      @values[key.to_sym] = value
-    end
-    def to_h
-      @values.dup
-    end
-    def self.load(args)
-      config = new
-      # First pass: find and load JSON config file
-      config_file = nil
-      args.each do |arg|
-        if arg.start_with?("--config=")
-          config_file = arg.split("=", 2).last
-          break
-        end
-      end
-      if config_file
-        config.load_json(config_file)
-      end
-      # Second pass: apply command-line overrides
-      args.each do |arg|
-        next unless arg.start_with?("--") && arg.include?("=")
-        next if arg.start_with?("--config=")
-        key, val = arg[2..].split("=", 2)
-        key = key.to_sym
-        unless config.values.key?(key)
-          puts "Warning: Unknown config key: #{key}"
-          next
-        end
-        config[key] = parse_value(val, config[key])
-      end
-      config
-    end
-    def load_json(path)
-      unless File.exist?(path)
-        raise "Config file not found: #{path}"
-      end
-      json = JSON.parse(File.read(path))
-      puts "Loaded config from #{path}"
-      json.each do |key, val|
-        key = key.to_sym
-        unless @values.key?(key)
-          puts "Warning: Unknown config key in JSON: #{key}"
-          next
-        end
-        @values[key] = val
-      end
-      self
+    def self.defaults
+      DEFAULTS
     end
+  end
-    private
+  # Configuration for benchmarking
+  class BenchConfig < BaseConfig
+    DEFAULTS = {
+      batch_size: 12,
+      block_size: 1024,
+      n_layer: 12,
+      n_head: 12,
+      n_embd: 768,
+      dropout: 0.0,
+      bias: false,
+      real_data: true,
+      dataset: "openwebtext",
+      seed: 1337,
+      device: "auto"
+    }.freeze
-    def self.parse_value(val, existing)
-      case existing
-      when Integer then val.to_i
-      when Float then val.to_f
-      when TrueClass, FalseClass then val.downcase == "true"
-      else val
-      end
+    def self.defaults
+      DEFAULTS
     end
   end
 end