RubyGems - ignis-dl - Versions diffs - 0.0.1 - Mend

ignis-dl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

checksums.yaml +7 -0
data/README.md +15 -0
data/lib/ignis-dl.rb +48 -0
data/lib/nnw/ai/gpt2_loader.rb +144 -0
data/lib/nnw/ai/inference.rb +224 -0
data/lib/nnw/ai/kv_cache.rb +79 -0
data/lib/nnw/ai/llama_loader.rb +100 -0
data/lib/nnw/ai/loss.rb +170 -0
data/lib/nnw/ai/nn/dropout.rb +68 -0
data/lib/nnw/ai/nn/embedding.rb +86 -0
data/lib/nnw/ai/nn/layer_norm.rb +54 -0
data/lib/nnw/ai/nn/linear.rb +80 -0
data/lib/nnw/ai/nn/module.rb +178 -0
data/lib/nnw/ai/nn/rms_norm.rb +43 -0
data/lib/nnw/ai/nn/sequential.rb +52 -0
data/lib/nnw/ai/optim/adam.rb +63 -0
data/lib/nnw/ai/optim/adamw.rb +63 -0
data/lib/nnw/ai/optim/base.rb +90 -0
data/lib/nnw/ai/optim/lr_scheduler.rb +118 -0
data/lib/nnw/ai/optim/sgd.rb +49 -0
data/lib/nnw/ai/safetensors.rb +220 -0
data/lib/nnw/ai/server.rb +268 -0
data/lib/nnw/ai/tokenizer.rb +413 -0
data/lib/nnw/ai/trainer.rb +245 -0
data/lib/nnw/ai/transformer/attention.rb +89 -0
data/lib/nnw/ai/transformer/block.rb +90 -0
data/lib/nnw/ai/transformer/feed_forward.rb +53 -0
data/lib/nnw/ai/transformer/model.rb +189 -0
data/lib/nnw/ai/transformer/modern.rb +191 -0
data/lib/nnw/ai/transformer/swiglu.rb +39 -0
data/lib/nnw/ai/weight_map.rb +139 -0
metadata +91 -0

data/lib/nnw/ai/trainer.rb ADDED Viewed

@@ -0,0 +1,245 @@
+# frozen_string_literal: true
+module Ignis
+  module AI
+    # Trainer — complete training loop with gradient accumulation,
+    # checkpointing, and multi-GPU support via NvCCL.
+    class Trainer
+      # @return [Transformer::Model]
+      attr_reader :model
+      # @return [Optim::Base]
+      attr_reader :optimizer
+      # @return [Hash] training metrics
+      attr_reader :metrics
+      # @param model [Transformer::Model]
+      # @param optimizer [Optim::Base]
+      # @param scheduler [Optim::LRScheduler::*, nil]
+      # @param grad_accumulation_steps [Integer] accumulate gradients over N steps
+      # @param max_grad_norm [Float] gradient clipping norm
+      # @param use_nvccl [Boolean] enable multi-GPU gradient sync
+      # @param checkpoint_dir [String, nil] directory for saving checkpoints
+      def initialize(model:, optimizer:, scheduler: nil,
+                     grad_accumulation_steps: 1, max_grad_norm: 1.0,
+                     use_nvccl: false, checkpoint_dir: nil)
+        @model = model
+        @optimizer = optimizer
+        @scheduler = scheduler
+        @grad_accumulation_steps = grad_accumulation_steps
+        @max_grad_norm = max_grad_norm
+        @use_nvccl = use_nvccl
+        @checkpoint_dir = checkpoint_dir
+        @metrics = { steps: 0, total_loss: 0.0, best_loss: Float::INFINITY }
+        @model.train!
+      end
+      # Train for a specified number of steps.
+      # @param data_loader [DataLoader] provides batches
+      # @param steps [Integer] total training steps
+      # @param log_interval [Integer] log every N steps
+      # @param checkpoint_interval [Integer] save every N steps
+      # @param eval_fn [Proc, nil] evaluation function called at log intervals
+      # @return [Hash] final metrics
+      def train(data_loader, steps:, log_interval: 100,
+                checkpoint_interval: 1000, eval_fn: nil)
+        @model.train!
+        accumulated_loss = 0.0
+        steps.times do |step|
+          # Get batch
+          batch = data_loader.next_batch
+          input_ids = batch[:input_ids]
+          targets = batch[:targets]
+          # Forward pass
+          logits = @model.call(input_ids)
+          loss = Loss.cross_entropy(logits, targets)
+          # Scale loss for gradient accumulation
+          scaled_loss = loss * (1.0 / @grad_accumulation_steps)
+          # Backward pass
+          scaled_loss.backward!
+          accumulated_loss += loss.item
+          # Optimizer step (every grad_accumulation_steps)
+          if (step + 1) % @grad_accumulation_steps == 0
+            # Gradient clipping
+            grad_norm = @optimizer.clip_grad_norm!(@max_grad_norm)
+            # Multi-GPU gradient sync
+            if @use_nvccl
+              sync_gradients_nvccl!
+            end
+            # Optimizer step
+            @optimizer.step
+            @optimizer.zero_grad!
+            @scheduler&.step
+            @metrics[:steps] += 1
+            @metrics[:total_loss] += accumulated_loss / @grad_accumulation_steps
+            # Logging
+            if @metrics[:steps] % log_interval == 0
+              avg_loss = @metrics[:total_loss] / @metrics[:steps]
+              lr = @optimizer.lr
+              Ignis.logger.info(
+                "Step #{@metrics[:steps]} | Loss: #{'%.4f' % (accumulated_loss / @grad_accumulation_steps)} | " \
+                "Avg Loss: #{'%.4f' % avg_loss} | LR: #{'%.2e' % lr} | Grad Norm: #{'%.2f' % grad_norm}"
+              )
+              # EventBus publish
+              if defined?(Ignis::Shared::EventBus)
+                Ignis::Shared::EventBus.publish(:training_step, {
+                  step: @metrics[:steps],
+                  loss: accumulated_loss / @grad_accumulation_steps,
+                  avg_loss: avg_loss,
+                  lr: lr,
+                  grad_norm: grad_norm
+                })
+              end
+              # Eval
+              if eval_fn
+                @model.eval!
+                eval_fn.call(@model, @metrics[:steps])
+                @model.train!
+              end
+            end
+            # Checkpointing
+            if @checkpoint_dir && @metrics[:steps] % checkpoint_interval == 0
+              save_checkpoint!
+            end
+            accumulated_loss = 0.0
+          end
+          # Clear tape each iteration
+          Tape.clear!
+        end
+        @metrics
+      end
+      # Save model checkpoint.
+      # @return [String] checkpoint path
+      def save_checkpoint!
+        return unless @checkpoint_dir
+        Dir.mkdir(@checkpoint_dir) unless Dir.exist?(@checkpoint_dir)
+        path = File.join(@checkpoint_dir, "checkpoint_step_#{@metrics[:steps]}.safetensors")
+        tensors = {}
+        @model.named_parameters.each do |name, param|
+          tensors[name] = param
+        end
+        Safetensors.save(tensors, path, metadata: {
+          "step" => @metrics[:steps].to_s,
+          "loss" => (@metrics[:total_loss] / [@metrics[:steps], 1].max).to_s,
+          "framework" => "nnw"
+        })
+        Ignis.logger.info("Checkpoint saved: #{path}")
+        path
+      end
+      # Load from checkpoint.
+      # @param path [String]
+      # @return [void]
+      def load_checkpoint!(path)
+        Safetensors.load_model(@model, path, strict: false)
+        Ignis.logger.info("Checkpoint loaded: #{path}")
+      end
+      private
+      # Sync gradients across GPUs via NvCCL AllReduce.
+      # @return [void]
+      def sync_gradients_nvccl!
+        return unless defined?(Ignis::Collective)
+        @model.parameters.each do |p|
+          next unless p.grad
+          Ignis::Collective.all_reduce(p.grad, op: :sum)
+        end
+      end
+    end
+    # DataLoader — batching, shuffling, and GPU prefetch for training data.
+    class DataLoader
+      # @param data [Array<Array<Integer>>] tokenized sequences
+      # @param batch_size [Integer]
+      # @param seq_len [Integer] sequence length per sample
+      # @param device_id [Integer]
+      # @param shuffle [Boolean]
+      def initialize(data, batch_size:, seq_len:, device_id: 0, shuffle: true)
+        @data = data.flatten
+        @batch_size = batch_size
+        @seq_len = seq_len
+        @device_id = device_id
+        @shuffle = shuffle
+        @position = 0
+        # Shuffle on init
+        reshuffle! if @shuffle
+      end
+      # Get next training batch.
+      # @return [Hash{Symbol => Tensor}] :input_ids, :targets
+      def next_batch
+        total_tokens = @batch_size * @seq_len
+        if @position + total_tokens + 1 > @data.length
+          @position = 0
+          reshuffle! if @shuffle
+        end
+        input_ids = []
+        targets = []
+        @batch_size.times do |b|
+          start = @position + b * @seq_len
+          input_ids.concat(@data[start, @seq_len])
+          targets.concat(@data[start + 1, @seq_len])
+        end
+        @position += total_tokens
+        input_nv = Ignis::Shared::NvArray.new(shape: [@batch_size, @seq_len], dtype: :int32,
+                                             device_id: @device_id)
+        input_nv.from_host(input_ids)
+        target_nv = Ignis::Shared::NvArray.new(shape: [@batch_size * @seq_len], dtype: :int32,
+                                              device_id: @device_id)
+        target_nv.from_host(targets)
+        {
+          input_ids: Tensor.new(data: input_nv, requires_grad: false),
+          targets: Tensor.new(data: target_nv, requires_grad: false)
+        }
+      end
+      # Number of batches per epoch.
+      # @return [Integer]
+      def num_batches
+        (@data.length - 1) / (@batch_size * @seq_len)
+      end
+      private
+      def reshuffle!
+        # Shuffle at chunk boundaries (not individual tokens)
+        chunk_size = @seq_len + 1
+        chunks = @data.each_slice(chunk_size).to_a
+        chunks.shuffle!
+        @data = chunks.flatten
+      end
+    end
+  end
+end

data/lib/nnw/ai/transformer/attention.rb ADDED Viewed

@@ -0,0 +1,89 @@
+# frozen_string_literal: true
+module Ignis
+  module AI
+    module Transformer
+      # Multi-Head Attention with optional causal masking.
+      # Supports standard scaled dot-product attention.
+      #
+      # @example
+      #   attn = MultiHeadAttention.new(768, 12)
+      #   out = attn.call(x, x, x, mask: causal_mask)
+      class MultiHeadAttention < NN::Module
+        # @param embed_dim [Integer] model dimension
+        # @param num_heads [Integer] number of attention heads
+        # @param dropout [Float] attention dropout rate
+        # @param bias [Boolean] whether projections have bias
+        # @param device_id [Integer]
+        def initialize(embed_dim, num_heads, dropout: 0.0, bias: true, causal: true, device_id: 0)
+          super()
+          raise ArgumentError, "embed_dim must be divisible by num_heads" unless (embed_dim % num_heads).zero?
+          @embed_dim = embed_dim
+          @num_heads = num_heads
+          @head_dim = embed_dim / num_heads
+          @scale = 1.0 / Math.sqrt(@head_dim)
+          @causal = causal
+          @q_proj = register_module("q_proj", NN::Linear.new(embed_dim, embed_dim, bias: bias, device_id: device_id))
+          @k_proj = register_module("k_proj", NN::Linear.new(embed_dim, embed_dim, bias: bias, device_id: device_id))
+          @v_proj = register_module("v_proj", NN::Linear.new(embed_dim, embed_dim, bias: bias, device_id: device_id))
+          @out_proj = register_module("out_proj", NN::Linear.new(embed_dim, embed_dim, bias: bias, device_id: device_id))
+          @attn_dropout = register_module("attn_dropout", NN::Dropout.new(p: dropout))
+        end
+        # Forward pass: scaled dot-product attention.
+        # @param query [Tensor] [batch, seq_q, embed_dim]
+        # @param key [Tensor] [batch, seq_k, embed_dim]
+        # @param value [Tensor] [batch, seq_k, embed_dim]
+        # @param mask [Tensor, nil] attention mask
+        # @return [Tensor] [batch, seq_q, embed_dim]
+        def forward(query, key, value, mask: nil)
+          # Project Q, K, V → [seq, embed_dim] each (batch = 1)
+          q = @q_proj.call(query)
+          k = @k_proj.call(key)
+          v = @v_proj.call(value)
+          # Real multi-head scaled dot-product attention with causal masking.
+          # Each head attends over its own [seq, head_dim] slice; the per-head
+          # Flash-Attention-2 kernel applies 1/sqrt(head_dim) scaling, the causal
+          # mask, and a numerically-stable online softmax. (A nil mask still means
+          # causal for a GPT-2-style decoder; pass causal: false at construction
+          # for bidirectional attention.)
+          context = q.sdpa(k, v, num_heads: @num_heads, causal: @causal)
+          # Output projection
+          @out_proj.call(context)
+        end
+        # Incremental attention for one new token via a KV cache (decode path).
+        # Projects only this token's q/k/v, appends k/v to the cache, then attends
+        # the single query over all cached keys/values (the new token is the last
+        # position, so it sees everything — no causal mask). Must run under no_grad.
+        # @param x [Tensor] [1, embed] hidden state of the new token (post-norm)
+        # @param cache [KVCache]
+        # @param layer [Integer] this block's layer index
+        # @return [Tensor] [1, embed]
+        def decode_step(x, cache, layer)
+          raise "decode_step requires a causal attention module" unless @causal
+          q = @q_proj.call(x)
+          k = @k_proj.call(x)
+          v = @v_proj.call(x)
+          cache.append(layer, k.data, v.data)
+          kview = Tensor.new(data: cache.k_view(layer), requires_grad: false)
+          vview = Tensor.new(data: cache.v_view(layer), requires_grad: false)
+          ctx = q.decode_sdpa(kview, vview, num_heads: @num_heads)
+          @out_proj.call(ctx)
+        end
+        # @return [String]
+        def to_s
+          "MultiHeadAttention(embed=#{@embed_dim}, heads=#{@num_heads}, head_dim=#{@head_dim})"
+        end
+      end
+    end
+  end
+end

data/lib/nnw/ai/transformer/block.rb ADDED Viewed

@@ -0,0 +1,90 @@
+# frozen_string_literal: true
+module Ignis
+  module AI
+    module Transformer
+      # Single Transformer block: Attention + FF with residual connections.
+      # Supports both pre-norm (GPT-2, LLaMA) and post-norm (original) variants.
+      class Block < NN::Module
+        # @param embed_dim [Integer]
+        # @param num_heads [Integer]
+        # @param ff_dim [Integer]
+        # @param dropout [Float]
+        # @param pre_norm [Boolean] pre-norm (true) vs post-norm
+        # @param activation [Symbol]
+        # @param device_id [Integer]
+        def initialize(embed_dim, num_heads, ff_dim, dropout: 0.0,
+                       pre_norm: true, activation: :gelu, device_id: 0)
+          super()
+          @pre_norm = pre_norm
+          @attention = register_module("attention",
+                        MultiHeadAttention.new(embed_dim, num_heads, dropout: dropout, device_id: device_id))
+          @feed_forward = register_module("feed_forward",
+                           FeedForward.new(embed_dim, ff_dim, activation: activation,
+                                           dropout: dropout, device_id: device_id))
+          @norm1 = register_module("norm1", NN::LayerNorm.new(embed_dim, device_id: device_id))
+          @norm2 = register_module("norm2", NN::LayerNorm.new(embed_dim, device_id: device_id))
+          @dropout = register_module("dropout", NN::Dropout.new(p: dropout))
+        end
+        # Forward pass.
+        # @param x [Tensor] input [batch*seq, embed_dim]
+        # @param mask [Tensor, nil] attention mask
+        # @return [Tensor]
+        def forward(x, mask: nil)
+          if @pre_norm
+            # Pre-norm (GPT-2 style): Norm → Attn → Residual, Norm → FF → Residual
+            residual = x
+            h = @norm1.call(x)
+            h = @attention.call(h, h, h, mask: mask)
+            h = @dropout.call(h)
+            x = residual + h
+            residual = x
+            h = @norm2.call(x)
+            h = @feed_forward.call(h)
+            h = @dropout.call(h)
+            residual + h
+          else
+            # Post-norm (original Transformer): Attn → Residual → Norm
+            residual = x
+            h = @attention.call(x, x, x, mask: mask)
+            h = @dropout.call(h)
+            x = @norm1.call(residual + h)
+            residual = x
+            h = @feed_forward.call(x)
+            h = @dropout.call(h)
+            @norm2.call(residual + h)
+          end
+        end
+        # Incremental forward for one new token (decode path, pre-norm only).
+        # Mirrors #forward but routes attention through the KV cache and operates
+        # on a single [1, embed] row. Dropout is identity in eval mode, so it is
+        # omitted. @param x [Tensor] [1, embed]; @param cache [KVCache];
+        # @param layer [Integer] this block's index. @return [Tensor] [1, embed]
+        def decode_step(x, cache, layer)
+          raise "Block#decode_step is only implemented for pre-norm blocks" unless @pre_norm
+          residual = x
+          h = @norm1.call(x)
+          h = @attention.decode_step(h, cache, layer)
+          x = residual + h
+          residual = x
+          h = @norm2.call(x)
+          h = @feed_forward.call(h)
+          residual + h
+        end
+        # @return [String]
+        def to_s
+          style = @pre_norm ? "pre-norm" : "post-norm"
+          "Block(#{style}, attn=#{@attention}, ff=#{@feed_forward})"
+        end
+      end
+    end
+  end
+end

data/lib/nnw/ai/transformer/feed_forward.rb ADDED Viewed

@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+module Ignis
+  module AI
+    module Transformer
+      # Feed-forward network: Linear → Activation → Dropout → Linear
+      # Used in each Transformer block after attention.
+      class FeedForward < NN::Module
+        # @param embed_dim [Integer] model dimension
+        # @param ff_dim [Integer] feed-forward hidden dimension (typically 4x embed_dim)
+        # @param activation [Symbol] :gelu, :relu, or :silu
+        # @param dropout [Float] dropout rate
+        # @param device_id [Integer]
+        def initialize(embed_dim, ff_dim, activation: :gelu, dropout: 0.0, device_id: 0)
+          super()
+          @activation = activation
+          @fc1 = register_module("fc1", NN::Linear.new(embed_dim, ff_dim, device_id: device_id))
+          @fc2 = register_module("fc2", NN::Linear.new(ff_dim, embed_dim, device_id: device_id))
+          @dropout = register_module("dropout", NN::Dropout.new(p: dropout))
+        end
+        # Forward pass.
+        # @param x [Tensor]
+        # @return [Tensor]
+        def forward(x)
+          h = @fc1.call(x)
+          h = apply_activation(h)
+          h = @dropout.call(h)
+          @fc2.call(h)
+        end
+        # @return [String]
+        def to_s
+          "FeedForward(fc1=#{@fc1}, activation=#{@activation}, fc2=#{@fc2})"
+        end
+        private
+        # @param x [Tensor]
+        # @return [Tensor]
+        def apply_activation(x)
+          case @activation
+          when :gelu then x.gelu
+          when :relu then x.relu
+          when :silu then x.silu
+          else raise ArgumentError, "Unknown activation: #{@activation}"
+          end
+        end
+      end
+    end
+  end
+end

data/lib/nnw/ai/transformer/model.rb ADDED Viewed

@@ -0,0 +1,189 @@
+# frozen_string_literal: true
+module Ignis
+  module AI
+    module Transformer
+      # Full Transformer language model.
+      #
+      # token_embedding → position_embedding → N × Block → LayerNorm → LM head
+      #
+      # Factory methods provide standard model configurations:
+      #   .gpt2_small  → 124M params
+      #   .gpt2_medium → 345M params
+      #   .gpt2_large  → 774M params
+      class Model < NN::Module
+        # @return [Integer]
+        attr_reader :vocab_size, :embed_dim, :num_heads, :num_layers, :max_seq_len
+        # @param vocab_size [Integer] vocabulary size
+        # @param embed_dim [Integer] model dimension
+        # @param num_heads [Integer] attention heads per block
+        # @param num_layers [Integer] number of Transformer blocks
+        # @param ff_dim [Integer] feed-forward hidden dimension
+        # @param max_seq_len [Integer] maximum sequence length
+        # @param dropout [Float]
+        # @param activation [Symbol]
+        # @param pre_norm [Boolean]
+        # @param device_id [Integer]
+        def initialize(vocab_size:, embed_dim:, num_heads:, num_layers:,
+                       ff_dim:, max_seq_len:, dropout: 0.0,
+                       activation: :gelu, pre_norm: true, device_id: 0)
+          super()
+          @vocab_size = vocab_size
+          @embed_dim = embed_dim
+          @num_heads = num_heads
+          @num_layers = num_layers
+          @max_seq_len = max_seq_len
+          @device_id = device_id
+          @token_embedding = register_module("token_embedding",
+                              NN::Embedding.new(vocab_size, embed_dim, device_id: device_id))
+          @position_embedding = register_module("position_embedding",
+                                 NN::Embedding.new(max_seq_len, embed_dim, device_id: device_id))
+          @blocks = []
+          num_layers.times do |i|
+            block = Block.new(embed_dim, num_heads, ff_dim,
+                              dropout: dropout, pre_norm: pre_norm,
+                              activation: activation, device_id: device_id)
+            @blocks << register_module("blocks.#{i}", block)
+          end
+          @norm = register_module("norm", NN::LayerNorm.new(embed_dim, device_id: device_id))
+          @head = register_module("head",
+                   NN::Linear.new(embed_dim, vocab_size, bias: false, device_id: device_id))
+          @dropout = register_module("dropout", NN::Dropout.new(p: dropout))
+        end
+        # Forward pass: returns logits.
+        # @param input_ids [Tensor] token indices [batch_size, seq_len] (int32)
+        # @param mask [Tensor, nil] attention mask
+        # @return [Tensor] logits [batch_size * seq_len, vocab_size]
+        def forward(input_ids, mask: nil)
+          seq_len = input_ids.shape[-1]
+          # Create position indices
+          positions_data = (0...seq_len).to_a
+          pos_nv = Ignis::Shared::NvArray.new(shape: [seq_len], dtype: :int32,
+                                             device_id: input_ids.device_id)
+          pos_nv.from_host(positions_data)
+          positions = Tensor.new(data: pos_nv, requires_grad: false)
+          # Embeddings
+          tok_emb = @token_embedding.call(input_ids)   # [batch, seq, embed]
+          pos_emb = @position_embedding.call(positions) # [seq, embed]
+          # Combine and dropout
+          x = tok_emb + pos_emb
+          x = @dropout.call(x)
+          # Transformer blocks
+          @blocks.each do |block|
+            x = block.call(x, mask: mask)
+          end
+          # Final norm and LM head
+          x = @norm.call(x)
+          @head.call(x)  # → logits [batch*seq, vocab]
+        end
+        # Allocate a fresh KV cache sized for this model.
+        # @param device_id [Integer]
+        # @return [KVCache]
+        def make_kv_cache(device_id: @device_id)
+          KVCache.new(num_layers: @num_layers, max_seq_len: @max_seq_len,
+                      embed_dim: @embed_dim, device_id: device_id)
+        end
+        # Incremental forward for ONE new token using a KV cache (decode path).
+        # Equivalent to the last-position logits of a full forward over the whole
+        # prefix, but O(prefix) instead of O(prefix²): only this token is projected
+        # and embedded; its query attends over cached K/V. Must run under
+        # Tape.no_grad (no autograd). Append order matches the prefix order, so
+        # callers feed the prompt token-by-token before sampling.
+        # @param token_id [Integer] the new token's id
+        # @param cache [KVCache]
+        # @return [Tensor] logits [1, vocab]
+        def decode_step(token_id, cache)
+          pos = cache.length
+          raise "KVCache full: position #{pos} exceeds max_seq_len #{@max_seq_len}" if pos >= @max_seq_len
+          tok = Tensor.from_host([token_id], shape: [1], dtype: :int32, device_id: @device_id)
+          pos_t = Tensor.from_host([pos], shape: [1], dtype: :int32, device_id: @device_id)
+          x = @token_embedding.call(tok) + @position_embedding.call(pos_t) # [1, embed]
+          @blocks.each_with_index { |block, i| x = block.decode_step(x, cache, i) }
+          cache.advance!
+          x = @norm.call(x)
+          @head.call(x) # [1, vocab]
+        end
+        # -------------------------------------------------------------------
+        # Factory methods for standard configurations
+        # -------------------------------------------------------------------
+        # GPT-2 Small: 124M parameters
+        # @param device_id [Integer]
+        # @return [Model]
+        def self.gpt2_small(device_id: 0)
+          new(
+            vocab_size: 50257,
+            embed_dim: 768,
+            num_heads: 12,
+            num_layers: 12,
+            ff_dim: 3072,
+            max_seq_len: 1024,
+            dropout: 0.1,
+            activation: :gelu,
+            pre_norm: true,
+            device_id: device_id
+          )
+        end
+        # GPT-2 Medium: 345M parameters
+        # @param device_id [Integer]
+        # @return [Model]
+        def self.gpt2_medium(device_id: 0)
+          new(
+            vocab_size: 50257,
+            embed_dim: 1024,
+            num_heads: 16,
+            num_layers: 24,
+            ff_dim: 4096,
+            max_seq_len: 1024,
+            dropout: 0.1,
+            activation: :gelu,
+            pre_norm: true,
+            device_id: device_id
+          )
+        end
+        # GPT-2 Large: 774M parameters
+        # @param device_id [Integer]
+        # @return [Model]
+        def self.gpt2_large(device_id: 0)
+          new(
+            vocab_size: 50257,
+            embed_dim: 1280,
+            num_heads: 20,
+            num_layers: 36,
+            ff_dim: 5120,
+            max_seq_len: 1024,
+            dropout: 0.1,
+            activation: :gelu,
+            pre_norm: true,
+            device_id: device_id
+          )
+        end
+        # @return [String]
+        def to_s
+          "TransformerModel(vocab=#{@vocab_size}, embed=#{@embed_dim}, " \
+          "heads=#{@num_heads}, layers=#{@num_layers}, " \
+          "params=#{num_parameters})"
+        end
+      end
+    end
+  end
+end