RubyGems - nanogpt - Versions diffs - 0.2.0 → 0.3.0 - Mend

nanogpt 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

checksums.yaml +4 -4
data/Gemfile.lock +30 -1
data/docs/ARCHITECTURE.md +429 -0
data/exe/nanogpt +210 -233
data/lib/nano_gpt/bpe_textfile_preparer.rb +105 -0
data/lib/nano_gpt/data_loader.rb +5 -20
data/lib/nano_gpt/layers/block.rb +6 -1
data/lib/nano_gpt/layers/causal_self_attention.rb +11 -1
data/lib/nano_gpt/model.rb +1 -7
data/lib/nano_gpt/textfile_preparer.rb +189 -0
data/lib/nano_gpt/train_config.rb +80 -146
data/lib/nano_gpt/trainer.rb +21 -48
data/lib/nano_gpt/version.rb +1 -1
data/lib/nano_gpt/web/metrics_store.rb +136 -0
data/lib/nano_gpt/web/server.rb +294 -0
data/lib/nano_gpt/web/sse_notifier.rb +37 -0
data/lib/nano_gpt/web/training_state.rb +56 -0
data/lib/nano_gpt/web/training_worker.rb +153 -0
data/lib/nano_gpt/web/views/layout.erb +78 -0
data/lib/nano_gpt/web/views/run_detail.erb +432 -0
data/lib/nano_gpt/web/views/runs.erb +434 -0
data/lib/nano_gpt/web/web_trainer.rb +210 -0
data/lib/nano_gpt/web.rb +9 -0
data/lib/nano_gpt.rb +1 -0
data/nanogpt.gemspec +4 -0
metadata +71 -2

data/lib/nano_gpt/trainer.rb CHANGED Viewed

@@ -4,13 +4,24 @@ require "fileutils"
 module NanoGPT
   # Training loop for GPT models
+  # Accepts a TrainConfig (or hash with same keys) for all configuration
   class Trainer
+    # Default optimizer parameters (can be overridden via config)
+    OPTIMIZER_DEFAULTS = {
+      weight_decay: 1e-1,
+      beta1: 0.9,
+      beta2: 0.99,
+      grad_clip: 1.0,
+      always_save_checkpoint: false,
+      eval_only: false
+    }.freeze
     attr_reader :model, :optimizer, :config, :iter_num, :best_val_loss
-    def initialize(model:, data_loader:, config: {})
+    def initialize(model:, data_loader:, config:)
       @model = model
       @data_loader = data_loader
-      @config = default_config.merge(config)
+      @config = OPTIMIZER_DEFAULTS.merge(symbolize_keys(config.is_a?(Hash) ? config : config.to_h))
       @iter_num = 0
       @best_val_loss = Float::INFINITY
@@ -19,36 +30,6 @@ module NanoGPT
       setup_lr_scheduler
     end
-    def default_config
-      {
-        out_dir: "out",
-        eval_interval: 250,
-        log_interval: 10,
-        eval_iters: 200,
-        eval_only: false,
-        always_save_checkpoint: false,
-        # Optimizer
-        learning_rate: 1e-3,
-        weight_decay: 1e-1,
-        beta1: 0.9,
-        beta2: 0.99,
-        grad_clip: 1.0,
-        # LR scheduler
-        decay_lr: true,
-        warmup_iters: 100,
-        lr_decay_iters: 5000,
-        min_lr: 1e-4,
-        # Training
-        max_iters: 5000,
-        gradient_accumulation_steps: 1,
-        device: "cpu"
-      }
-    end
     def train
       puts "Starting training..."
       puts "Tokens per iteration: #{tokens_per_iter}"
@@ -58,10 +39,8 @@ module NanoGPT
       t0 = Time.now
       while @iter_num <= @config[:max_iters]
-        # Set learning rate for this iteration
         lr = @config[:decay_lr] ? @lr_scheduler.step(@optimizer, @iter_num) : @config[:learning_rate]
-        # Evaluate and checkpoint
         if @iter_num % @config[:eval_interval] == 0
           losses = estimate_loss
           puts "step #{@iter_num}: train loss #{losses[:train].round(4)}, val loss #{losses[:val].round(4)}"
@@ -74,29 +53,24 @@ module NanoGPT
         break if @iter_num == 0 && @config[:eval_only]
-        # Forward/backward with gradient accumulation
         @optimizer.zero_grad
         accumulated_loss = 0.0
-        @config[:gradient_accumulation_steps].times do |micro_step|
-          logits, loss = @model.call(x, targets: y)
+        @config[:gradient_accumulation_steps].times do |_micro_step|
+          _logits, loss = @model.call(x, targets: y)
           loss = loss / @config[:gradient_accumulation_steps]
           accumulated_loss += loss.item
           loss.backward
-          # Prefetch next batch
           x, y = @data_loader.get_batch(:train)
         end
-        # Gradient clipping (manual implementation since torch.rb lacks clip_grad_norm_)
         if @config[:grad_clip] > 0.0
           clip_grad_norm(@model.parameters, @config[:grad_clip])
         end
-        # Optimizer step
         @optimizer.step
-        # Logging
         t1 = Time.now
         dt = t1 - t0
         t0 = t1
@@ -135,9 +109,7 @@ module NanoGPT
       FileUtils.mkdir_p(@config[:out_dir])
       path = File.join(@config[:out_dir], "ckpt.pt")
-      # Note: torch.rb doesn't support optimizer.state_dict yet
-      # We save model state and training metadata
-      # Convert symbol keys to strings for Torch.save compatibility
+      # Torch.save requires string keys
       checkpoint = {
         "model" => @model.state_dict,
         "model_args" => stringify_keys(@model.config.to_h),
@@ -157,7 +129,6 @@ module NanoGPT
       @iter_num = checkpoint["iter_num"]
       @best_val_loss = checkpoint["best_val_loss"]
-      # Reinitialize optimizer (since we can't restore optimizer state in torch.rb)
       setup_optimizer
       puts "Loaded checkpoint from #{path} (iter #{@iter_num})"
@@ -166,21 +137,23 @@ module NanoGPT
     private
-    # Convert symbol keys to strings recursively (for Torch.save)
+    def symbolize_keys(hash)
+      hash.transform_keys(&:to_sym)
+    end
     def stringify_keys(hash)
       hash.transform_keys(&:to_s).transform_values do |v|
         v.is_a?(Hash) ? stringify_keys(v) : v
       end
     end
-    # Manual gradient clipping (torch.rb doesn't have clip_grad_norm_)
     def clip_grad_norm(parameters, max_norm)
       total_norm = 0.0
       parameters.each do |p|
         next unless p.grad
         param_norm = p.grad.data.norm(2).item
-        total_norm += param_norm ** 2
+        total_norm += param_norm**2
       end
       total_norm = Math.sqrt(total_norm)

data/lib/nano_gpt/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module NanoGPT
-  VERSION = "0.2.0"
+  VERSION = "0.3.0"
 end

data/lib/nano_gpt/web/metrics_store.rb ADDED Viewed

@@ -0,0 +1,136 @@
+# frozen_string_literal: true
+require "sqlite3"
+require "json"
+require "time"
+module NanoGPT
+  module Web
+    # SQLite-backed storage for training metrics
+    # Uses WAL mode for concurrent read/write access
+    class MetricsStore
+      attr_reader :db_path
+      def initialize(db_path = "nanogpt_metrics.db")
+        @db_path = db_path
+        @db = SQLite3::Database.new(db_path)
+        @db.results_as_hash = true
+        @db.execute("PRAGMA journal_mode=WAL")
+        @db.execute("PRAGMA synchronous=NORMAL")
+        create_tables
+      end
+      def create_run(dataset:, config:, status: "running")
+        @db.execute(
+          "INSERT INTO training_runs (dataset, config_json, status, started_at) VALUES (?, ?, ?, ?)",
+          [dataset, JSON.generate(config), status, Time.now.iso8601]
+        )
+        @db.last_insert_row_id
+      end
+      def update_run(run_id, **attrs)
+        sets = []
+        values = []
+        attrs.each do |key, value|
+          sets << "#{key} = ?"
+          values << value
+        end
+        values << run_id
+        @db.execute("UPDATE training_runs SET #{sets.join(', ')} WHERE id = ?", values)
+      end
+      def record_metrics(run_id, iteration, metrics_hash)
+        recorded_at = Time.now.iso8601
+        metrics_hash.each do |metric_type, value|
+          @db.execute(
+            "INSERT INTO metrics (run_id, iteration, metric_type, value, recorded_at) VALUES (?, ?, ?, ?, ?)",
+            [run_id, iteration, metric_type.to_s, value, recorded_at]
+          )
+        end
+      end
+      def metrics_for_run(run_id)
+        @db.execute(
+          "SELECT iteration, metric_type, value FROM metrics WHERE run_id = ? ORDER BY iteration",
+          [run_id]
+        )
+      end
+      def latest_run
+        @db.get_first_row("SELECT * FROM training_runs ORDER BY id DESC LIMIT 1")
+      end
+      def get_run(run_id)
+        @db.get_first_row("SELECT * FROM training_runs WHERE id = ?", [run_id])
+      end
+      def list_runs(limit: 50)
+        @db.execute("SELECT * FROM training_runs ORDER BY id DESC LIMIT ?", [limit])
+      end
+      def record_checkpoint(run_id, path:, suffix:, iteration:, val_loss:)
+        @db.execute(
+          "INSERT INTO checkpoints (run_id, path, suffix, iteration, val_loss, saved_at) VALUES (?, ?, ?, ?, ?, ?)",
+          [run_id, path, suffix, iteration, val_loss == Float::INFINITY ? nil : val_loss, Time.now.iso8601]
+        )
+      end
+      def checkpoints_for_run(run_id)
+        @db.execute(
+          "SELECT * FROM checkpoints WHERE run_id = ? ORDER BY iteration DESC",
+          [run_id]
+        )
+      end
+      def close
+        @db.close
+      end
+      private
+      def create_tables
+        @db.execute(<<~SQL)
+          CREATE TABLE IF NOT EXISTS training_runs (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            dataset TEXT NOT NULL,
+            config_json TEXT NOT NULL,
+            status TEXT NOT NULL DEFAULT 'running',
+            started_at TEXT NOT NULL,
+            stopped_at TEXT,
+            current_iter INTEGER DEFAULT 0,
+            best_val_loss REAL,
+            checkpoint_path TEXT
+          )
+        SQL
+        @db.execute(<<~SQL)
+          CREATE TABLE IF NOT EXISTS metrics (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            run_id INTEGER NOT NULL,
+            iteration INTEGER NOT NULL,
+            metric_type TEXT NOT NULL,
+            value REAL NOT NULL,
+            recorded_at TEXT NOT NULL,
+            FOREIGN KEY (run_id) REFERENCES training_runs(id)
+          )
+        SQL
+        @db.execute(<<~SQL)
+          CREATE INDEX IF NOT EXISTS idx_metrics_run_id ON metrics(run_id, iteration)
+        SQL
+        @db.execute(<<~SQL)
+          CREATE TABLE IF NOT EXISTS checkpoints (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            run_id INTEGER NOT NULL,
+            path TEXT NOT NULL,
+            suffix TEXT NOT NULL,
+            iteration INTEGER NOT NULL,
+            val_loss REAL,
+            saved_at TEXT NOT NULL,
+            FOREIGN KEY (run_id) REFERENCES training_runs(id)
+          )
+        SQL
+      end
+    end
+  end
+end

data/lib/nano_gpt/web/server.rb ADDED Viewed

@@ -0,0 +1,294 @@
+# frozen_string_literal: true
+require "sinatra/base"
+require "json"
+require "tempfile"
+require "fileutils"
+module NanoGPT
+  module Web
+    class Server < Sinatra::Base
+      set :views, File.join(__dir__, "views")
+      set :server, :webrick
+      set :logging, true
+      # Shared state - initialized before server starts
+      class << self
+        attr_accessor :training_state, :metrics_store, :sse_notifier, :training_worker
+      end
+      # ---- Pages ----
+      get "/" do
+        erb :runs, layout: :layout
+      end
+      get "/runs/:id" do
+        run = self.class.metrics_store.get_run(params[:id].to_i)
+        halt 404, "Run not found" unless run
+        @run = run
+        erb :run_detail, layout: :layout
+      end
+      # ---- Training Status ----
+      get "/train/status" do
+        content_type :json
+        self.class.training_state.to_json
+      end
+      # ---- Start Training ----
+      post "/train/start" do
+        content_type :json
+        state = self.class.training_state
+        if state[:status] == "running"
+          halt 409, JSON.generate(error: "Training already running")
+        end
+        body = JSON.parse(request.body.read) rescue {}
+        config = build_train_config(body)
+        data_dir = File.join("data", config[:dataset])
+        train_bin = File.join(data_dir, "train.bin")
+        unless File.exist?(train_bin)
+          halt 422, JSON.generate(error: "Dataset not found: #{config[:dataset]}. Run prepare first.")
+        end
+        state.reset_stop!
+        state.update(status: "running", dataset: config[:dataset], max_iters: config[:max_iters])
+        run_id = self.class.metrics_store.create_run(
+          dataset: config[:dataset],
+          config: config
+        )
+        state.update(run_id: run_id)
+        self.class.training_worker.enqueue(:start, config: config, data_dir: data_dir, run_id: run_id)
+        JSON.generate(run_id: run_id, status: "started")
+      end
+      # ---- Stop Training ----
+      post "/train/stop" do
+        content_type :json
+        request.body.read rescue nil
+        state = self.class.training_state
+        unless state[:status] == "running"
+          halt 409, JSON.generate(error: "No training in progress")
+        end
+        state.request_stop!
+        JSON.generate(status: "stop_requested")
+      end
+      # ---- Resume Training ----
+      post "/train/resume" do
+        content_type :json
+        state = self.class.training_state
+        if state[:status] == "running"
+          halt 409, JSON.generate(error: "Training already running")
+        end
+        body = JSON.parse(request.body.read) rescue {}
+        config = build_train_config(body)
+        ckpt_path = body["checkpoint_path"]
+        if ckpt_path.nil? || !File.exist?(ckpt_path)
+          ckpt_path = File.join(config[:out_dir], "ckpt.pt")
+        end
+        unless File.exist?(ckpt_path)
+          halt 422, JSON.generate(error: "No checkpoint found at #{ckpt_path}")
+        end
+        data_dir = File.join("data", config[:dataset])
+        unless File.exist?(File.join(data_dir, "train.bin"))
+          halt 422, JSON.generate(error: "Dataset not found: #{config[:dataset]}")
+        end
+        state.reset_stop!
+        state.update(status: "running", dataset: config[:dataset], max_iters: config[:max_iters])
+        run_id = self.class.metrics_store.create_run(
+          dataset: config[:dataset],
+          config: config,
+          status: "running"
+        )
+        state.update(run_id: run_id)
+        self.class.training_worker.enqueue(:resume,
+          config: config, data_dir: data_dir, run_id: run_id, checkpoint_path: ckpt_path)
+        JSON.generate(run_id: run_id, status: "resumed")
+      end
+      # ---- Metrics polling ----
+      get "/metrics/poll" do
+        content_type :json
+        state = self.class.training_state.to_h
+        run_id = state[:run_id]
+        since_iter = (params[:since_iter] || 0).to_i
+        recent = {}
+        if run_id
+          all_metrics = self.class.metrics_store.metrics_for_run(run_id)
+          all_metrics.each do |row|
+            next if row["iteration"] <= since_iter
+            type = row["metric_type"]
+            recent[type] ||= []
+            recent[type] << { iteration: row["iteration"], value: row["value"] }
+          end
+        end
+        JSON.generate(state: state, metrics: recent)
+      end
+      # ---- Runs API ----
+      get "/api/runs" do
+        content_type :json
+        runs = self.class.metrics_store.list_runs
+        JSON.generate(runs: runs)
+      end
+      get "/api/runs/:id" do
+        content_type :json
+        run = self.class.metrics_store.get_run(params[:id].to_i)
+        halt 404, JSON.generate(error: "Run not found") unless run
+        JSON.generate(run: run)
+      end
+      get "/api/runs/:id/metrics" do
+        content_type :json
+        run_id = params[:id].to_i
+        run = self.class.metrics_store.get_run(run_id)
+        halt 404, JSON.generate(error: "Run not found") unless run
+        metrics = self.class.metrics_store.metrics_for_run(run_id)
+        grouped = {}
+        metrics.each do |row|
+          type = row["metric_type"]
+          grouped[type] ||= []
+          grouped[type] << { iteration: row["iteration"], value: row["value"] }
+        end
+        JSON.generate(run: run, metrics: grouped)
+      end
+      get "/api/runs/:id/checkpoints" do
+        content_type :json
+        run_id = params[:id].to_i
+        checkpoints = self.class.metrics_store.checkpoints_for_run(run_id).dup
+        # Also include legacy checkpoint_path from the run record
+        run = self.class.metrics_store.get_run(run_id)
+        if run && run["checkpoint_path"] && File.exist?(run["checkpoint_path"])
+          unless checkpoints.any? { |c| c["path"] == run["checkpoint_path"] }
+            checkpoints.unshift({
+              "id" => nil, "run_id" => run_id, "path" => run["checkpoint_path"],
+              "suffix" => "legacy", "iteration" => run["current_iter"] || 0,
+              "val_loss" => run["best_val_loss"], "saved_at" => run["stopped_at"]
+            })
+          end
+        end
+        JSON.generate(checkpoints: checkpoints)
+      end
+      # ---- Text Generation API ----
+      post "/generate/run" do
+        content_type :json
+        body = JSON.parse(request.body.read) rescue {}
+        prompt = body["prompt"] || "\n"
+        temperature = (body["temperature"] || 0.8).to_f
+        max_tokens = (body["max_tokens"] || 200).to_i
+        top_k = body["top_k"]&.to_i
+        checkpoint_path = body["checkpoint_path"]
+        dataset = body["dataset"] || "shakespeare_char"
+        if checkpoint_path && !File.exist?(checkpoint_path)
+          halt 422, JSON.generate(error: "Checkpoint not found: #{checkpoint_path}")
+        end
+        unless checkpoint_path
+          out_dir = body["out_dir"] || "out-shakespeare-char"
+          checkpoint_path = File.join(out_dir, "ckpt.pt")
+          unless File.exist?(checkpoint_path)
+            halt 422, JSON.generate(error: "No checkpoint found at #{checkpoint_path}")
+          end
+        end
+        if self.class.training_state[:status] == "running"
+          halt 409, JSON.generate(error: "Cannot generate while training is running. Stop training first.")
+        end
+        result = self.class.training_worker.enqueue_sync(:generate,
+          prompt: prompt, temperature: temperature, max_tokens: max_tokens,
+          top_k: top_k, checkpoint_path: checkpoint_path, dataset: dataset)
+        if result[:ok]
+          JSON.generate(text: result[:text])
+        else
+          halt 500, JSON.generate(error: result[:error])
+        end
+      end
+      # ---- Dataset Upload ----
+      post "/datasets/upload" do
+        content_type :json
+        unless params[:file] && params[:file][:tempfile]
+          halt 422, JSON.generate(error: "No file uploaded")
+        end
+        name = params[:name]&.strip
+        name = File.basename(params[:file][:filename], ".*").gsub(/[^a-zA-Z0-9_-]/, "_") if name.nil? || name.empty?
+        tokenizer_type = params[:tokenizer] || "char"
+        val_ratio = (params[:val_ratio] || 0.1).to_f
+        tmp_path = File.join(Dir.tmpdir, "nanogpt_upload_#{name}_#{Time.now.to_i}.txt")
+        FileUtils.cp(params[:file][:tempfile].path, tmp_path)
+        result = self.class.training_worker.enqueue_sync(:prepare_dataset,
+          input_path: tmp_path, output_name: name, tokenizer: tokenizer_type, val_ratio: val_ratio)
+        File.delete(tmp_path) if File.exist?(tmp_path)
+        if result[:ok]
+          JSON.generate(ok: true, dataset: result[:dataset])
+        else
+          halt 500, JSON.generate(ok: false, error: result[:error])
+        end
+      end
+      # ---- Available Datasets ----
+      get "/datasets" do
+        content_type :json
+        datasets = Dir.glob("data/*/train.bin").map do |path|
+          File.basename(File.dirname(path))
+        end.sort
+        JSON.generate(datasets: datasets)
+      end
+      private
+      def build_train_config(overrides = {})
+        config = TrainConfig.new.to_h
+        overrides.each do |key, value|
+          sym = key.to_sym
+          config[sym] = value if config.key?(sym)
+        end
+        config[:device] = Device.auto
+        config[:out_dir] = "out-#{config[:dataset]}" if overrides["dataset"] && !overrides["out_dir"]
+        config
+      end
+    end
+  end
+end

data/lib/nano_gpt/web/sse_notifier.rb ADDED Viewed

@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+require "json"
+module NanoGPT
+  module Web
+    # Manages Server-Sent Events connections and broadcasts
+    class SSENotifier
+      def initialize
+        @connections = []
+        @mutex = Mutex.new
+      end
+      def add(connection)
+        @mutex.synchronize { @connections << connection }
+      end
+      def remove(connection)
+        @mutex.synchronize { @connections.delete(connection) }
+      end
+      def broadcast(type:, data:)
+        payload = "event: #{type}\ndata: #{JSON.generate(data)}\n\n"
+        @mutex.synchronize do
+          @connections.reject! do |conn|
+            begin
+              conn << payload
+              false
+            rescue IOError, Errno::EPIPE
+              true
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/nano_gpt/web/training_state.rb ADDED Viewed

@@ -0,0 +1,56 @@
+# frozen_string_literal: true
+require "json"
+module NanoGPT
+  module Web
+    # Thread-safe shared state for training status
+    class TrainingState
+      STATUSES = %w[idle running stopped completed].freeze
+      def initialize
+        @mutex = Mutex.new
+        @state = {
+          status: "idle",
+          run_id: nil,
+          current_iter: 0,
+          current_loss: nil,
+          best_val_loss: nil,
+          max_iters: 0,
+          dataset: nil
+        }
+        @stop_requested = false
+      end
+      def update(**attrs)
+        @mutex.synchronize do
+          attrs.each { |k, v| @state[k] = v if @state.key?(k) }
+        end
+      end
+      def [](key)
+        @mutex.synchronize { @state[key] }
+      end
+      def request_stop!
+        @mutex.synchronize { @stop_requested = true }
+      end
+      def stop_requested?
+        @mutex.synchronize { @stop_requested }
+      end
+      def reset_stop!
+        @mutex.synchronize { @stop_requested = false }
+      end
+      def to_json(*)
+        @mutex.synchronize { JSON.generate(@state) }
+      end
+      def to_h
+        @mutex.synchronize { @state.dup }
+      end
+    end
+  end
+end