nanogpt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
+ # frozen_string_literal: true
+
+ module NanoGPT
+   module Layers
+     # Feed-forward network with GELU activation
+     class MLP < Torch::NN::Module
+       def initialize(config)
+         super()
+         @c_fc = Torch::NN::Linear.new(config.n_embd, 4 * config.n_embd, bias: config.bias)
+         @gelu = Torch::NN::GELU.new
+         @c_proj = Torch::NN::Linear.new(4 * config.n_embd, config.n_embd, bias: config.bias)
+         @dropout = Torch::NN::Dropout.new(p: config.dropout)
+       end
+
+       def forward(x)
+         x = @c_fc.call(x)
+         x = @gelu.call(x)
+         x = @c_proj.call(x)
+         @dropout.call(x)
+       end
+     end
+   end
+ end
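
Not part of the published diff: a minimal usage sketch of the MLP layer above, assuming torch-rb is installed and the class is loaded as defined. The `Config` struct is a hypothetical stand-in covering only the fields MLP reads; the gem's real config class is not shown in this hunk.

    require "torch"

    # Hypothetical stand-in for the gem's config object (n_embd, bias, dropout only).
    Config = Struct.new(:n_embd, :bias, :dropout, keyword_init: true)
    config = Config.new(n_embd: 384, bias: false, dropout: 0.2)

    mlp = NanoGPT::Layers::MLP.new(config)
    x = Torch.randn(8, 256, config.n_embd)  # (batch, seq_len, n_embd)
    y = mlp.call(x)                         # same shape back: (8, 256, 384)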
@@ -0,0 +1,42 @@
+ # frozen_string_literal: true
+
+ module NanoGPT
+   # Cosine learning rate scheduler with linear warmup
+   class LRScheduler
+     attr_reader :learning_rate, :min_lr, :warmup_iters, :lr_decay_iters
+
+     def initialize(learning_rate:, min_lr:, warmup_iters:, lr_decay_iters:)
+       @learning_rate = learning_rate
+       @min_lr = min_lr
+       @warmup_iters = warmup_iters
+       @lr_decay_iters = lr_decay_iters
+     end
+
+     # Get learning rate for given iteration
+     def get_lr(iter)
+       # 1) Linear warmup for warmup_iters steps
+       if iter < @warmup_iters
+         return @learning_rate * (iter + 1).to_f / (@warmup_iters + 1)
+       end
+
+       # 2) If iter > lr_decay_iters, return min learning rate
+       if iter > @lr_decay_iters
+         return @min_lr
+       end
+
+       # 3) In between, use cosine decay down to min learning rate
+       decay_ratio = (iter - @warmup_iters).to_f / (@lr_decay_iters - @warmup_iters)
+       coeff = 0.5 * (1.0 + Math.cos(Math::PI * decay_ratio))
+       @min_lr + coeff * (@learning_rate - @min_lr)
+     end
+
+     # Apply learning rate to optimizer
+     def step(optimizer, iter)
+       lr = get_lr(iter)
+       optimizer.param_groups.each do |group|
+         group[:lr] = lr
+       end
+       lr
+     end
+   end
+ end
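
Not part of the published diff: a short sketch of how the schedule above behaves with the gem's default hyperparameters (learning_rate 1e-3, min_lr 1e-4, warmup_iters 100, lr_decay_iters 5000), assuming the class is loaded as defined.

    sched = NanoGPT::LRScheduler.new(
      learning_rate: 1e-3, min_lr: 1e-4,
      warmup_iters: 100, lr_decay_iters: 5000
    )

    sched.get_lr(0)     # ~9.9e-6  (linear warmup: 1e-3 * 1/101)
    sched.get_lr(100)   # 1e-3     (warmup done, cosine coeff = 1.0)
    sched.get_lr(2550)  # ~5.5e-4  (halfway through the cosine decay)
    sched.get_lr(6000)  # 1e-4     (past lr_decay_iters, clamped to min_lr)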
@@ -0,0 +1,218 @@
+ # frozen_string_literal: true
+
+ module NanoGPT
+   # GPT Language Model
+   class GPT < Torch::NN::Module
+     attr_reader :config
+
+     def initialize(config)
+       super()
+       raise ArgumentError, "vocab_size must be set" unless config.vocab_size
+       raise ArgumentError, "block_size must be set" unless config.block_size
+
+       @config = config
+
+       # Token and position embeddings
+       @wte = Torch::NN::Embedding.new(config.vocab_size, config.n_embd)
+       @wpe = Torch::NN::Embedding.new(config.block_size, config.n_embd)
+       @drop = Torch::NN::Dropout.new(p: config.dropout)
+
+       # Transformer blocks
+       @h = Torch::NN::ModuleList.new(
+         config.n_layer.times.map { Layers::Block.new(config) }
+       )
+
+       # Final layer norm
+       @ln_f = Layers::LayerNorm.new(config.n_embd, bias: config.bias)
+
+       # Note: We use weight tying - lm_head shares weights with wte
+       # Instead of a separate Linear layer, we use wte.weight directly in forward
+
+       # Initialize weights
+       apply(method(:_init_weights))
+
+       # Special scaled init for residual projections (per GPT-2 paper)
+       named_parameters.each do |name, param|
+         if name.end_with?("c_proj.weight")
+           Torch::NN::Init.normal!(param, mean: 0.0, std: 0.02 / Math.sqrt(2 * config.n_layer))
+         end
+       end
+
+       puts format("number of parameters: %.2fM", num_params / 1e6)
+     end
+
+     def num_params(non_embedding: true)
+       n_params = parameters.sum(&:numel)
+       n_params -= @wpe.weight.numel if non_embedding
+       n_params
+     end
+
+     # Estimate model flops utilization (MFU)
+     # See PaLM paper Appendix B: https://arxiv.org/abs/2204.02311
+     def estimate_mfu(fwdbwd_per_iter, dt)
+       n = num_params
+       cfg = @config
+       l = cfg.n_layer
+       h = cfg.n_head
+       q = cfg.n_embd / cfg.n_head
+       t = cfg.block_size
+
+       # FLOPs per token and per forward-backward pass
+       flops_per_token = 6 * n + 12 * l * h * q * t
+       flops_per_fwdbwd = flops_per_token * t
+       flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
+
+       # Express throughput as ratio of A100 bfloat16 peak FLOPS (312 TFLOPS)
+       flops_achieved = flops_per_iter / dt
+       flops_promised = 312e12
+       flops_achieved / flops_promised
+     end
+
+     def forward(idx, targets: nil)
+       b, t = idx.shape
+       raise ArgumentError, "Sequence length #{t} exceeds block_size #{@config.block_size}" if t > @config.block_size
+
+       device = idx.device
+
+       # Position indices
+       pos = Torch.arange(0, t, dtype: :long, device: device)
+
+       # Embeddings
+       tok_emb = @wte.call(idx) # (B, T, n_embd)
+       pos_emb = @wpe.call(pos) # (T, n_embd)
+       x = @drop.call(tok_emb + pos_emb)
+
+       # Transformer blocks
+       @h.each { |block| x = block.call(x) }
+
+       # Final layer norm
+       x = @ln_f.call(x)
+
+       if targets
+         # Training: compute logits for all positions using tied weights
+         logits = Torch::NN::Functional.linear(x, @wte.weight, nil)
+         loss = Torch::NN::Functional.cross_entropy(
+           logits.view(-1, logits.size(-1)),
+           targets.view(-1),
+           ignore_index: -1
+         )
+       else
+         # Inference: only compute logits for last position (optimization)
+         # Use narrow to get last position: x[:, -1:, :]
+         x_last = x.narrow(1, x.size(1) - 1, 1)
+         logits = Torch::NN::Functional.linear(x_last, @wte.weight, nil)
+         loss = nil
+       end
+
+       [logits, loss]
+     end
+
+     def generate(idx, max_new_tokens, temperature: 1.0, top_k: nil)
+       Torch.no_grad do
+         max_new_tokens.times do
+           # Crop context if exceeds block_size
+           idx_cond = if idx.size(1) <= @config.block_size
+                        idx
+                      else
+                        idx.narrow(1, idx.size(1) - @config.block_size, @config.block_size)
+                      end
+
+           # Forward pass
+           logits, _loss = forward(idx_cond)
+
+           # Get logits for last position and scale by temperature
+           # logits shape is (B, 1, vocab_size), squeeze to (B, vocab_size)
+           logits = logits.squeeze(1) / temperature
+
+           # Optional top-k filtering
+           if top_k
+             k = [top_k, logits.size(-1)].min
+             v, _indices = logits.topk(k)
+             # Get the k-th largest value as threshold
+             threshold = v.narrow(1, k - 1, 1)
+             # Mask out values below threshold
+             logits = logits.masked_fill(logits.lt(threshold), -Float::INFINITY)
+           end
+
+           # Sample from probability distribution
+           probs = Torch::NN::Functional.softmax(logits, dim: -1)
+           idx_next = Torch.multinomial(probs, num_samples: 1)
+
+           # Append to sequence
+           idx = Torch.cat([idx, idx_next], dim: 1)
+         end
+       end
+
+       idx
+     end
+
+     def crop_block_size(block_size)
+       raise ArgumentError, "Cannot crop to larger block_size" if block_size > @config.block_size
+
+       @config.block_size = block_size
+
+       # Create new embedding with cropped weights
+       new_wpe = Torch::NN::Embedding.new(block_size, @config.n_embd)
+       Torch.no_grad do
+         new_wpe.weight.copy!(@wpe.weight[0...block_size])
+       end
+       @wpe = new_wpe
+
+       # Update attention masks in all blocks
+       @h.each do |block|
+         attn = block.instance_variable_get(:@attn)
+         next unless attn.instance_variable_defined?(:@mask)
+
+         mask = attn.instance_variable_get(:@mask)
+         attn.instance_variable_set(:@mask, mask[nil, nil, 0...block_size, 0...block_size])
+       end
+     end
+
+     def configure_optimizers(weight_decay:, learning_rate:, betas:, device_type:)
+       # Separate parameters into decay and no-decay groups
+       # All 2D+ params (weights) get weight decay, 1D params (biases, layernorm) don't
+       decay_params = []
+       nodecay_params = []
+
+       parameters.each do |param|
+         next unless param.requires_grad
+
+         if param.dim >= 2
+           decay_params << param
+         else
+           nodecay_params << param
+         end
+       end
+
+       num_decay = decay_params.sum(&:numel)
+       num_nodecay = nodecay_params.sum(&:numel)
+       puts "num decayed parameter tensors: #{decay_params.size}, with #{num_decay} parameters"
+       puts "num non-decayed parameter tensors: #{nodecay_params.size}, with #{num_nodecay} parameters"
+
+       # Create optimizer with parameter groups (using symbol keys for torch.rb)
+       Torch::Optim::AdamW.new(
+         [
+           { params: decay_params, weight_decay: weight_decay },
+           { params: nodecay_params, weight_decay: 0.0 }
+         ],
+         lr: learning_rate,
+         betas: betas
+       )
+     end
+
+     private
+
+     def _init_weights(mod)
+       case mod
+       when Torch::NN::Linear
+         Torch::NN::Init.normal!(mod.weight, mean: 0.0, std: 0.02)
+         # Check if bias exists (it won't when bias: false)
+         if mod.instance_variable_defined?(:@bias) && mod.instance_variable_get(:@bias)
+           Torch::NN::Init.zeros!(mod.instance_variable_get(:@bias))
+         end
+       when Torch::NN::Embedding
+         Torch::NN::Init.normal!(mod.weight, mean: 0.0, std: 0.02)
+       end
+     end
+   end
+ end
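
Not part of the published diff: a rough usage sketch of the GPT class above. It assumes torch-rb plus the gem's Layers::Block and Layers::LayerNorm classes (referenced but not shown in this hunk), and uses a hypothetical Struct as a stand-in for the gem's config class, which is also not part of this excerpt. Tensors are built with Torch.tensor to keep the example self-contained; #forward is called directly to pass the targets keyword.

    require "torch"

    # Hypothetical config stand-in covering only the fields GPT reads.
    Config = Struct.new(
      :vocab_size, :block_size, :n_layer, :n_head, :n_embd, :dropout, :bias,
      keyword_init: true
    )
    config = Config.new(vocab_size: 65, block_size: 256, n_layer: 6, n_head: 6,
                        n_embd: 384, dropout: 0.0, bias: false)

    model = NanoGPT::GPT.new(config)

    # Training-style step: forward returns [logits, loss] when targets are given.
    idx     = Torch.tensor([[1, 2, 3, 4]], dtype: :long)
    targets = Torch.tensor([[2, 3, 4, 5]], dtype: :long)
    _logits, loss = model.forward(idx, targets: targets)

    # Generation: start from a single token id and sample 20 more.
    start = Torch.tensor([[0]], dtype: :long)
    out = model.generate(start, 20, temperature: 0.8, top_k: 50)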
@@ -0,0 +1,106 @@
+ # frozen_string_literal: true
+
+ require "json"
+
+ module NanoGPT
+   # Base tokenizer interface
+   class Tokenizer
+     attr_reader :vocab_size
+
+     def encode(text)
+       raise NotImplementedError
+     end
+
+     def decode(ids)
+       raise NotImplementedError
+     end
+
+     # Auto-detect and load the appropriate tokenizer
+     # If meta.json exists, use character-level; otherwise use GPT-2 BPE
+     def self.for_dataset(dataset_dir)
+       meta_path = File.join(dataset_dir, "meta.json")
+       if File.exist?(meta_path)
+         CharTokenizer.from_file(meta_path)
+       else
+         GPT2Tokenizer.new
+       end
+     end
+   end
+
+   # Character-level tokenizer
+   class CharTokenizer < Tokenizer
+     attr_reader :stoi, :itos
+
+     def initialize(stoi: nil, itos: nil)
+       super()
+       @stoi = stoi || {}
+       @itos = itos || {}
+       @vocab_size = @stoi.size
+     end
+
+     # Build vocabulary from text
+     def self.from_text(text)
+       chars = text.chars.uniq.sort
+       stoi = chars.each_with_index.to_h
+       itos = chars.each_with_index.map { |c, i| [i, c] }.to_h
+       new(stoi: stoi, itos: itos)
+     end
+
+     # Load from meta.json file
+     def self.from_file(path)
+       meta = JSON.parse(File.read(path))
+       # Convert string keys to integers for itos
+       itos = meta["itos"].transform_keys(&:to_i)
+       new(stoi: meta["stoi"], itos: itos)
+     end
+
+     # Encode string to list of integers
+     def encode(text)
+       text.chars.map { |c| @stoi[c] }
+     end
+
+     # Decode list of integers to string
+     def decode(ids)
+       ids.map { |i| @itos[i] }.join
+     end
+
+     # Save to meta.json file
+     def save(path)
+       meta = {
+         "vocab_size" => @vocab_size,
+         "stoi" => @stoi,
+         "itos" => @itos.transform_keys(&:to_s)
+       }
+       File.write(path, JSON.pretty_generate(meta))
+     end
+   end
+
+   # GPT-2 BPE tokenizer using tiktoken
+   class GPT2Tokenizer < Tokenizer
+     GPT2_VOCAB_SIZE = 50257
+     EOT_TOKEN = "<|endoftext|>"
+
+     def initialize
+       super()
+       require "tiktoken_ruby"
+       # GPT-2 uses the r50k_base encoding
+       @enc = Tiktoken.get_encoding(:r50k_base)
+       @vocab_size = GPT2_VOCAB_SIZE
+     end
+
+     # Encode string to list of integers
+     def encode(text)
+       @enc.encode(text)
+     end
+
+     # Decode list of integers to string
+     def decode(ids)
+       @enc.decode(ids)
+     end
+
+     # Get the end-of-text token ID
+     def eot_token
+       @enc.encode(EOT_TOKEN).first
+     end
+   end
+ end
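
Not part of the published diff: a minimal round-trip sketch of the tokenizers above, assuming the classes are loaded as defined (and, for GPT2Tokenizer, that the tiktoken_ruby gem is installed).

    # Character-level tokenizer built from raw text.
    tok = NanoGPT::CharTokenizer.from_text("hello world")
    ids = tok.encode("hello")   # => [3, 2, 4, 4, 5] for this vocabulary (sorted unique chars)
    tok.decode(ids)             # => "hello"
    tok.save("meta.json")       # persists stoi/itos so Tokenizer.for_dataset can pick it up

    # GPT-2 BPE tokenizer.
    bpe = NanoGPT::GPT2Tokenizer.new
    bpe.decode(bpe.encode("hello world"))  # => "hello world"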
@@ -0,0 +1,259 @@
+ # frozen_string_literal: true
+
+ require "json"
+
+ module NanoGPT
+   # Configuration system for training and sampling
+   # Supports JSON config files with command-line overrides
+   #
+   # Priority (highest to lowest):
+   # 1. Command-line arguments (--key=value)
+   # 2. JSON config file (--config=path.json)
+   # 3. Default values
+   #
+   # Usage:
+   #   config = TrainConfig.load(ARGV)
+   #   config[:learning_rate] # => 0.001
+   #
+   class TrainConfig
+     # Defaults match bin/train exactly
+     DEFAULTS = {
+       # I/O
+       out_dir: "out-shakespeare-char",
+       eval_interval: 250,
+       log_interval: 10,
+       eval_iters: 200,
+       eval_only: false,
+       always_save_checkpoint: false,
+       init_from: "scratch", # 'scratch' or 'resume'
+
+       # Data
+       dataset: "shakespeare_char",
+       batch_size: 64,
+       block_size: 256,
+       gradient_accumulation_steps: 1,
+
+       # Model
+       n_layer: 6,
+       n_head: 6,
+       n_embd: 384,
+       dropout: 0.2,
+       bias: false,
+
+       # Optimizer
+       learning_rate: 1e-3,
+       weight_decay: 1e-1,
+       beta1: 0.9,
+       beta2: 0.99,
+       grad_clip: 1.0,
+
+       # LR scheduler
+       decay_lr: true,
+       warmup_iters: 100,
+       lr_decay_iters: 5000,
+       min_lr: 1e-4,
+
+       # Training
+       max_iters: 5000,
+
+       # System
+       device: "auto"
+     }.freeze
+
+     attr_reader :values
+
+     def initialize(values = {})
+       @values = DEFAULTS.merge(values)
+     end
+
+     def [](key)
+       @values[key.to_sym]
+     end
+
+     def []=(key, value)
+       @values[key.to_sym] = value
+     end
+
+     def to_h
+       @values.dup
+     end
+
+     # Load config from command-line args
+     # Supports:
+     #   --config=path/to/config.json (load JSON file)
+     #   --key=value (override specific values)
+     def self.load(args)
+       config = new
+
+       # First pass: find and load JSON config file
+       config_file = nil
+       args.each do |arg|
+         if arg.start_with?("--config=")
+           config_file = arg.split("=", 2).last
+           break
+         end
+       end
+
+       if config_file
+         config.load_json(config_file)
+       end
+
+       # Second pass: apply command-line overrides
+       args.each do |arg|
+         next unless arg.start_with?("--") && arg.include?("=")
+         next if arg.start_with?("--config=")
+
+         key, val = arg[2..].split("=", 2)
+         key = key.to_sym
+
+         unless config.values.key?(key)
+           puts "Warning: Unknown config key: #{key}"
+           next
+         end
+
+         config[key] = parse_value(val, config[key])
+         puts "Override: #{key} = #{config[key]}"
+       end
+
+       config
+     end
+
+     # Load values from JSON file
+     def load_json(path)
+       unless File.exist?(path)
+         raise "Config file not found: #{path}"
+       end
+
+       json = JSON.parse(File.read(path))
+       puts "Loaded config from #{path}"
+
+       json.each do |key, val|
+         key = key.to_sym
+         unless @values.key?(key)
+           puts "Warning: Unknown config key in JSON: #{key}"
+           next
+         end
+         @values[key] = val
+       end
+
+       self
+     end
+
+     # Save current config to JSON file
+     def save_json(path)
+       File.write(path, JSON.pretty_generate(@values))
+       puts "Saved config to #{path}"
+     end
+
+     private
+
+     def self.parse_value(val, existing)
+       case existing
+       when Integer then val.to_i
+       when Float then val.to_f
+       when TrueClass, FalseClass then val.downcase == "true"
+       else val
+       end
+     end
+   end
+
+   # Configuration for sampling/generation
+   class SampleConfig
+     # Defaults match bin/sample exactly
+     DEFAULTS = {
+       out_dir: "out-shakespeare-char",
+       dataset: "shakespeare_char",
+       start: "\n",
+       num_samples: 5,
+       max_new_tokens: 500,
+       temperature: 0.8,
+       top_k: 200,
+       seed: 1337,
+       device: "auto"
+     }.freeze
+
+     attr_reader :values
+
+     def initialize(values = {})
+       @values = DEFAULTS.merge(values)
+     end
+
+     def [](key)
+       @values[key.to_sym]
+     end
+
+     def []=(key, value)
+       @values[key.to_sym] = value
+     end
+
+     def to_h
+       @values.dup
+     end
+
+     def self.load(args)
+       config = new
+
+       # First pass: find and load JSON config file
+       config_file = nil
+       args.each do |arg|
+         if arg.start_with?("--config=")
+           config_file = arg.split("=", 2).last
+           break
+         end
+       end
+
+       if config_file
+         config.load_json(config_file)
+       end
+
+       # Second pass: apply command-line overrides
+       args.each do |arg|
+         next unless arg.start_with?("--") && arg.include?("=")
+         next if arg.start_with?("--config=")
+
+         key, val = arg[2..].split("=", 2)
+         key = key.to_sym
+
+         unless config.values.key?(key)
+           puts "Warning: Unknown config key: #{key}"
+           next
+         end
+
+         config[key] = parse_value(val, config[key])
+       end
+
+       config
+     end
+
+     def load_json(path)
+       unless File.exist?(path)
+         raise "Config file not found: #{path}"
+       end
+
+       json = JSON.parse(File.read(path))
+       puts "Loaded config from #{path}"
+
+       json.each do |key, val|
+         key = key.to_sym
+         unless @values.key?(key)
+           puts "Warning: Unknown config key in JSON: #{key}"
+           next
+         end
+         @values[key] = val
+       end
+
+       self
+     end
+
+     private
+
+     def self.parse_value(val, existing)
+       case existing
+       when Integer then val.to_i
+       when Float then val.to_f
+       when TrueClass, FalseClass then val.downcase == "true"
+       else val
+       end
+     end
+   end
+ end
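
Not part of the published diff: a short sketch of the override behaviour implemented above, assuming the classes are loaded as defined. Values from the command line are coerced to the type of the corresponding default, and keys not present in DEFAULTS only produce a warning. The argument strings below are illustrative.

    # Defaults only.
    config = NanoGPT::TrainConfig.new
    config[:learning_rate]  # => 0.001
    config[:n_layer]        # => 6

    # Command-line overrides on top of the defaults (no JSON file involved).
    config = NanoGPT::TrainConfig.load(["--batch_size=32", "--decay_lr=false", "--max_iters=2000"])
    config[:batch_size]  # => 32     (coerced to Integer because the default is an Integer)
    config[:decay_lr]    # => false  (coerced to a boolean)
    config[:out_dir]     # => "out-shakespeare-char" (default retained)

    # Persist the resolved settings for reproducibility.
    config.save_json("resolved_config.json")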