nanochat 0.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,167 @@
+ #!/bin/bash
+
+ # Train a tiny nanochat checkpoint using Python nanochat
+ #
+ # This script is adapted from python-nanochat/dev/runcpu.sh
+ # Original: https://github.com/karpathy/nanochat by Andrej Karpathy
+ #
+ # REQUIREMENTS:
+ # - Python nanochat cloned at: ../python-nanochat or ./python-nanochat
+ # - Python 3.10+
+ # - Rust (for building rustbpe tokenizer)
+ # - ~1GB disk space for data
+ # - ~30 minutes on CPU or ~5 minutes on GPU
+ #
+ # USAGE:
+ # bash bin/train-with-python-nanochat.sh
+ #
+ # OUTPUT:
+ # Trained checkpoint at: ~/.cache/nanochat/model.pt
+ # Tokenizer at: ~/.cache/nanochat/tokenizer/tokenizer.json
+
+ set -e # Exit on error
+
+ echo "🔥 Train Tiny Nanochat Model"
+ echo "======================================================================"
+ echo ""
+ echo "This will train a d4 model (4 layers, minimal for demos)"
+ echo "Output: ~/.cache/nanochat/"
+ echo ""
+ echo "⏱️ Estimated time: ~30 minutes on CPU"
+ echo ""
+ echo "📝 Attribution: Using training scripts from"
+ echo " https://github.com/karpathy/nanochat by Andrej Karpathy"
+ echo ""
+ echo "======================================================================"
+ echo ""
+
+ # Find python-nanochat directory
+ if [ -d "python-nanochat" ]; then
+     PYTHON_NANOCHAT_DIR="python-nanochat"
+ elif [ -d "../python-nanochat" ]; then
+     PYTHON_NANOCHAT_DIR="../python-nanochat"
+ else
+     echo "❌ Python nanochat not found"
+     echo ""
+     echo "Clone it first:"
+     echo " git clone https://github.com/karpathy/nanochat python-nanochat"
+     echo ""
+     exit 1
+ fi
+
+ echo "✅ Python nanochat found at: $PYTHON_NANOCHAT_DIR"
+ echo ""
+
+ # Change to python-nanochat directory
+ cd "$PYTHON_NANOCHAT_DIR"
+
+ # Setup environment
+ export OMP_NUM_THREADS=1
+ NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
+ mkdir -p "$NANOCHAT_BASE_DIR"
+
+ echo "🔧 Setting up Python environment..."
+ command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
+ [ -d ".venv" ] || uv venv
+ uv sync --extra cpu
+ source .venv/bin/activate
+ echo ""
+
+ echo "🦀 Building Rust tokenizer..."
+ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ source "$HOME/.cargo/env"
+ uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
+ echo ""
+
+ echo "📦 Downloading evaluation bundle..."
+ EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
+ if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
+     curl -L -o eval_bundle.zip "$EVAL_BUNDLE_URL"
+     unzip -q eval_bundle.zip
+     rm eval_bundle.zip
+     mv eval_bundle "$NANOCHAT_BASE_DIR"
+ fi
+ echo ""
+
+ # Reset report
+ python -m nanochat.report reset
+
+ echo "📚 Downloading training data (~1GB)..."
+ python -m nanochat.dataset -n 4
+ echo ""
+
+ echo "🔤 Training tokenizer..."
+ python -m scripts.tok_train --max_chars=1000000000
+ python -m scripts.tok_eval
+ echo ""
+
+ echo "🚀 Training base model (50 iterations, ~30 mins)..."
+ python -m scripts.base_train \
+     --depth=4 \
+     --max_seq_len=1024 \
+     --device_batch_size=1 \
+     --total_batch_size=1024 \
+     --eval_every=50 \
+     --eval_tokens=4096 \
+     --core_metric_every=50 \
+     --core_metric_max_per_task=12 \
+     --sample_every=50 \
+     --num_iterations=50
+ echo ""
+
+ echo "📊 Evaluating base model..."
+ python -m scripts.base_loss --device_batch_size=1 --split_tokens=4096
+ python -m scripts.base_eval --max-per-task=16
+ echo ""
+
+ echo "🎯 Midtraining (100 iterations)..."
+ python -m scripts.mid_train \
+     --max_seq_len=1024 \
+     --device_batch_size=1 \
+     --eval_every=50 \
+     --eval_tokens=4096 \
+     --total_batch_size=1024 \
+     --num_iterations=100
+ echo ""
+
+ echo "💬 Supervised fine-tuning (100 iterations)..."
+ python -m scripts.chat_sft \
+     --device_batch_size=1 \
+     --target_examples_per_step=4 \
+     --num_iterations=100 \
+     --eval_steps=4 \
+     --eval_metrics_max_problems=16
+ echo ""
+
+ echo "📝 Generating training report..."
+ python -m nanochat.report generate
+ echo ""
+
+ echo "======================================================================"
+ echo "✅ Training complete!"
+ echo ""
+ echo "📦 Checkpoint location:"
+ echo " Model: $NANOCHAT_BASE_DIR/model.pt"
+ echo " Tokenizer: $NANOCHAT_BASE_DIR/tokenizer/tokenizer.json"
+ echo ""
+ echo "🎯 Next Steps - Use Your Model in Ruby"
+ echo "======================================================================"
+ echo ""
+ echo "# Interactive chat"
+ echo "ruby examples/chat_cli.rb"
+ echo ""
+ echo "# Web UI (visit http://localhost:8000)"
+ echo "ruby examples/chat_web.rb"
+ echo ""
+ echo "# Generate text"
+ echo "ruby examples/generate_text.rb 'Once upon a time'"
+ echo ""
+ echo "# Fine-tune on your data"
+ echo "ruby examples/finetune.rb --data my_data.txt --output custom.pt"
+ echo ""
+ echo "======================================================================"
+ echo ""
+ echo "📦 Optional: Package this checkpoint for distribution"
+ echo ""
+ echo "tar -czf nanochat-tiny-d4.tar.gz -C $(dirname $NANOCHAT_BASE_DIR) $(basename $NANOCHAT_BASE_DIR)"
+ echo ""
@@ -0,0 +1,40 @@
+ # frozen_string_literal: true
+
+ require 'fileutils'
+
+ module Nanochat
+   # Checkpoint loading and saving
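+   #
+   # Example usage (illustrative sketch; the paths here are assumptions):
+   #   checkpoint = Nanochat::CheckpointManager.load(File.expand_path('~/.cache/nanochat/model.pt'))
+   #   Nanochat::CheckpointManager.save('tmp/model.pt', model: model, config: config, step: 1000)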
+   module CheckpointManager
+     class << self
+       def load(path)
+         raise ArgumentError, "Checkpoint not found: #{path}" unless File.exist?(path)
+
+         Torch.load(path)
+       end
+
+       def save(path, model: nil, state_dict: nil, optimizer: nil, config: nil, **metadata)
+         raise ArgumentError, 'Must provide either model: or state_dict:' if model.nil? && state_dict.nil?
+
+         FileUtils.mkdir_p(File.dirname(path))
+
+         model_dict = model ? model.state_dict : state_dict
+         model_dict = convert_keys_to_strings(model_dict)
+
+         data = {
+           'model' => model_dict,
+           'config' => config&.to_h&.transform_keys(&:to_s),
+           **metadata.transform_keys(&:to_s)
+         }
+         data['optimizer'] = optimizer.state_dict if optimizer
+
+         Torch.save(data, path)
+       end
+
+       private
+
+       def convert_keys_to_strings(hash)
+         hash.transform_keys(&:to_s).transform_values do |value|
+           value.is_a?(Hash) ? convert_keys_to_strings(value) : value
+         end
+       end
+     end
+   end
+ end
@@ -0,0 +1,32 @@
+ # frozen_string_literal: true
+
+ require 'fileutils'
+
+ module Nanochat
+   # Common utilities
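+   #
+   # Example usage (illustrative):
+   #   Nanochat::Common.seed(42)           # reproducible runs
+   #   device = Nanochat::Common.device    # picks CUDA, then MPS, then CPU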
+   module Common
+     class << self
+       def device
+         @device ||= if Torch::CUDA.available?
+                       Torch.device('cuda')
+                     elsif defined?(Torch::Backends::MPS) && Torch::Backends::MPS.available?
+                       Torch.device('mps')
+                     else
+                       Torch.device('cpu')
+                     end
+       end
+
+       def seed(seed_value)
+         Torch.manual_seed(seed_value)
+         Torch::CUDA.manual_seed_all(seed_value) if Torch::CUDA.available?
+       end
+
+       def default_cache_dir = ENV.fetch('NANOCHAT_BASE_DIR') { File.expand_path('~/.cache/nanochat') }
+
+       def ensure_dir(path)
+         FileUtils.mkdir_p(path) unless File.directory?(path)
+         path
+       end
+     end
+   end
+ end
@@ -0,0 +1,49 @@
+ # frozen_string_literal: true
+
+ module Nanochat
+   # GPT model configuration
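+   #
+   # Example usage (illustrative; the tiny values below are arbitrary):
+   #   config = Nanochat::Config.new(vocab_size: 65_536, block_size: 1024,
+   #                                 n_embd: 256, n_head: 4, n_kv_head: 4, n_layer: 4)
+   #   config.validate!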
+   Config = Data.define(
+     :vocab_size,
+     :block_size,  # context length
+     :n_embd,      # embedding dimension
+     :n_head,      # query heads
+     :n_kv_head,   # key/value heads (MQA)
+     :n_layer      # transformer blocks
+   ) do
+     def self.default
+       new(
+         vocab_size: 50_304,
+         block_size: 1024,
+         n_embd: 768,
+         n_head: 6,
+         n_kv_head: 6,
+         n_layer: 12
+       )
+     end
+
+     def self.from_checkpoint(checkpoint)
+       config_dict = checkpoint['config'] || checkpoint[:config]
+       config_dict = config_dict.to_h.transform_keys(&:to_sym)
+       # Python nanochat stores the context length under sequence_len
+       config_dict[:block_size] ||= config_dict[:sequence_len]
+       # Ignore any extra keys the checkpoint config may carry
+       new(**config_dict.slice(*members))
+     end
+
+     def validate!
+       raise ArgumentError, "vocab_size (#{vocab_size}) must be positive" unless vocab_size.positive?
+       raise ArgumentError, "block_size (#{block_size}) must be positive" unless block_size.positive?
+       raise ArgumentError, "n_embd (#{n_embd}) must be positive" unless n_embd.positive?
+       raise ArgumentError, "n_head (#{n_head}) must be positive" unless n_head.positive?
+       raise ArgumentError, "n_kv_head (#{n_kv_head}) must be positive" unless n_kv_head.positive?
+       raise ArgumentError, "n_layer (#{n_layer}) must be positive" unless n_layer.positive?
+       raise ArgumentError, "n_embd (#{n_embd}) must be divisible by n_head (#{n_head})" unless (n_embd % n_head).zero?
+
+       unless n_kv_head <= n_head
+         raise ArgumentError,
+               "Invalid MQA: n_kv_head (#{n_kv_head}) must be <= n_head (#{n_head})"
+       end
+       return if (n_head % n_kv_head).zero?
+
+       raise ArgumentError,
+             "Invalid MQA: n_head (#{n_head}) must be divisible by n_kv_head (#{n_kv_head})"
+     end
+   end
+ end
@@ -0,0 +1,152 @@
+ # frozen_string_literal: true
+
+ # Nanochat Engine: Efficient inference with KV caching
+ # Ruby port of nanochat by Andrej Karpathy (https://github.com/karpathy/nanochat)
+
+ module Nanochat
+   # KV cache for efficient inference
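+   #
+   # Shape of the backing tensor (allocated lazily on the first insert):
+   #   [num_layers, 2 (key/value), batch_size, num_heads, seq_len, head_dim]
+   # insert_kv appends one step's keys/values for a layer and returns views of
+   # everything cached so far; @pos advances once the last layer has been written.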
+   class KVCache
+     attr_reader :pos
+
+     def initialize(batch_size, num_heads, seq_len, head_dim, num_layers)
+       @kv_shape = [num_layers, 2, batch_size, num_heads, seq_len, head_dim]
+       @kv_cache = nil
+       @pos = 0
+     end
+
+     def reset = @pos = 0
+
+     def insert_kv(layer_idx, key, value)
+       @kv_cache = Torch.empty(@kv_shape, dtype: key.dtype, device: key.device) if @kv_cache.nil?
+
+       _batch, _heads, t_add, _dim = key.size
+       t0 = @pos
+       t1 = @pos + t_add
+
+       if t1 > @kv_cache.size(4)
+         # Grow the cache: round the required length up to a multiple of 1024,
+         # allocate a larger tensor, and copy the existing entries across
+         # (an in-place resize would scramble the cached keys/values).
+         t_needed = t1 + 1024
+         t_needed = (t_needed + 1023) & ~1023
+         new_shape = @kv_shape.dup
+         new_shape[4] = t_needed
+         new_cache = Torch.empty(new_shape, dtype: @kv_cache.dtype, device: @kv_cache.device)
+         new_cache[0..-1, 0..-1, 0..-1, 0..-1, 0...@kv_cache.size(4), 0..-1] = @kv_cache
+         @kv_cache = new_cache
+         @kv_shape = new_shape
+       end
+
+       @kv_cache[layer_idx, 0, 0..-1, 0..-1, t0...t1, 0..-1] = key
+       @kv_cache[layer_idx, 1, 0..-1, 0..-1, t0...t1, 0..-1] = value
+
+       key_view = @kv_cache[layer_idx, 0, 0..-1, 0..-1, 0...t1, 0..-1]
+       value_view = @kv_cache[layer_idx, 1, 0..-1, 0..-1, 0...t1, 0..-1]
+
+       @pos = t1 if layer_idx == @kv_cache.size(0) - 1
+
+       [key_view, value_view]
+     end
+   end
+
+   # Text generation engine
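+   #
+   # Example usage (illustrative sketch; assumes a model and tokenizer are already loaded):
+   #   engine = Nanochat::Engine.new(model: model, tokenizer: tokenizer)
+   #   puts engine.generate('Once upon a time', max_tokens: 50, temperature: 0.8, top_k: 50)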
+   class Engine
+     def initialize(model:, tokenizer:, device: nil)
+       @model = model
+       @tokenizer = tokenizer
+       @device = device || Common.device
+       @model.to(@device)
+       @model.eval
+     end
+
+     def generate(prompt, max_tokens: 100, temperature: 1.0, top_k: nil, top_p: nil)
+       tokens = []
+       generate_stream(prompt, max_tokens:, temperature:, top_k:, top_p:) do |token_text, _token_id|
+         tokens << token_text
+       end
+       tokens.join
+     end
+
+     # Generate text with streaming. Yields token_text (String), token_id (Integer).
+     # Accepts string prompts or token arrays.
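+     # Returns the generated token ids as an Array.
+     #
+     # Example (illustrative):
+     #   engine.generate_stream('Hello', max_tokens: 20) { |text, _id| print text }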
+     def generate_stream(prompt, max_tokens: 100, temperature: 1.0, top_k: nil, top_p: nil)
+       tokens = prompt.is_a?(Array) ? prompt : @tokenizer.encode(prompt)
+       return if tokens.empty?
+
+       config = @model.config
+       kv_cache = KVCache.new(
+         1,
+         config.n_kv_head,
+         tokens.length + max_tokens,
+         config.n_embd / config.n_head,
+         config.n_layer
+       )
+
+       input_ids = Torch.tensor([tokens], dtype: :long).to(@device)
+       generated_tokens = []
+
+       Torch.no_grad do
+         max_tokens.times do
+           logits = @model.call(input_ids, kv_cache:)
+           next_token_logits = logits[0..-1, -1, 0..-1]
+
+           next_token = sample(next_token_logits, temperature, top_k, top_p)
+           token_id = next_token[0, 0].item
+
+           break if token_id == @tokenizer.eos_token_id
+
+           token_text = @tokenizer.decode([token_id])
+           yield(token_text, token_id) if block_given?
+
+           input_ids = next_token.view(1, 1)
+           generated_tokens << token_id
+         end
+       end
+
+       generated_tokens
+     end
+
+     private
+
+     def sample(logits, temperature, top_k, top_p)
+       return logits.argmax(-1, keepdim: true) if temperature.zero?
+
+       if top_k
+         k = [top_k, logits.size(-1)].min
+         vals, idx = Torch.topk(logits, k, dim: -1)
+         vals /= temperature
+         probs = Torch::NN::F.softmax(vals, dim: -1)
+         choice = Torch.multinomial(probs, num_samples: 1)
+         return idx.gather(1, choice)
+       end
+
+       # Top-p (nucleus) sampling
+       if top_p && top_p < 1.0
+         scaled_logits = logits / temperature
+         probs = Torch::NN::F.softmax(scaled_logits, dim: -1)
+
+         # Sort probabilities in descending order
+         sorted_probs, sorted_indices = probs.sort(dim: -1, descending: true)
+
+         # Compute cumulative probabilities
+         cumulative_probs = sorted_probs.cumsum(dim: -1)
+
+         # Remove tokens with cumulative probability above threshold
+         # Keep at least one token (the highest probability one)
+         sorted_indices_to_remove = Torch.gt(cumulative_probs, top_p)
+         sorted_indices_to_remove[0..-1, 0] = false
+
+         # Zero out probabilities for removed tokens
+         sorted_probs[sorted_indices_to_remove] = 0.0
+
+         # Renormalize probabilities
+         sorted_probs /= sorted_probs.sum(dim: -1, keepdim: true)
+
+         # Sample from filtered distribution
+         choice = Torch.multinomial(sorted_probs, num_samples: 1)
+
+         # Map back to original vocabulary indices
+         return sorted_indices.gather(1, choice)
+       end
+
+       scaled_logits = logits / temperature
+       probs = Torch::NN::F.softmax(scaled_logits, dim: -1)
+       Torch.multinomial(probs, num_samples: 1)
+     end
+   end
+ end