mlx-ruby-lm 0.30.7.1

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (138)
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +83 -0
  4. data/exe/mlx_lm +7 -0
  5. data/lib/mlx_lm/benchmark.rb +67 -0
  6. data/lib/mlx_lm/chat_template.rb +41 -0
  7. data/lib/mlx_lm/cli.rb +113 -0
  8. data/lib/mlx_lm/config.rb +30 -0
  9. data/lib/mlx_lm/convert_utils.rb +51 -0
  10. data/lib/mlx_lm/generate.rb +204 -0
  11. data/lib/mlx_lm/load_utils.rb +87 -0
  12. data/lib/mlx_lm/model_args.rb +54 -0
  13. data/lib/mlx_lm/models/activations.rb +46 -0
  14. data/lib/mlx_lm/models/afm7.rb +131 -0
  15. data/lib/mlx_lm/models/afmoe.rb +421 -0
  16. data/lib/mlx_lm/models/apertus.rb +179 -0
  17. data/lib/mlx_lm/models/baichuan_m1.rb +306 -0
  18. data/lib/mlx_lm/models/bailing_moe.rb +399 -0
  19. data/lib/mlx_lm/models/bailing_moe_linear.rb +91 -0
  20. data/lib/mlx_lm/models/bitlinear_layers.rb +108 -0
  21. data/lib/mlx_lm/models/bitnet.rb +176 -0
  22. data/lib/mlx_lm/models/cache.rb +792 -0
  23. data/lib/mlx_lm/models/cohere.rb +150 -0
  24. data/lib/mlx_lm/models/cohere2.rb +224 -0
  25. data/lib/mlx_lm/models/dbrx.rb +286 -0
  26. data/lib/mlx_lm/models/deepseek.rb +239 -0
  27. data/lib/mlx_lm/models/deepseek_v2.rb +108 -0
  28. data/lib/mlx_lm/models/deepseek_v3.rb +34 -0
  29. data/lib/mlx_lm/models/deepseek_v32.rb +45 -0
  30. data/lib/mlx_lm/models/dots1.rb +292 -0
  31. data/lib/mlx_lm/models/ernie4_5.rb +165 -0
  32. data/lib/mlx_lm/models/ernie4_5_moe.rb +97 -0
  33. data/lib/mlx_lm/models/exaone.rb +169 -0
  34. data/lib/mlx_lm/models/exaone4.rb +233 -0
  35. data/lib/mlx_lm/models/exaone_moe.rb +421 -0
  36. data/lib/mlx_lm/models/falcon_h1.rb +102 -0
  37. data/lib/mlx_lm/models/gated_delta.rb +136 -0
  38. data/lib/mlx_lm/models/gemma.rb +159 -0
  39. data/lib/mlx_lm/models/gemma2.rb +198 -0
  40. data/lib/mlx_lm/models/gemma3.rb +85 -0
  41. data/lib/mlx_lm/models/gemma3_text.rb +270 -0
  42. data/lib/mlx_lm/models/gemma3n.rb +79 -0
  43. data/lib/mlx_lm/models/glm.rb +164 -0
  44. data/lib/mlx_lm/models/glm4.rb +180 -0
  45. data/lib/mlx_lm/models/glm4_moe.rb +343 -0
  46. data/lib/mlx_lm/models/glm4_moe_lite.rb +131 -0
  47. data/lib/mlx_lm/models/glm_moe_dsa.rb +26 -0
  48. data/lib/mlx_lm/models/gpt2.rb +166 -0
  49. data/lib/mlx_lm/models/gpt_bigcode.rb +154 -0
  50. data/lib/mlx_lm/models/gpt_neox.rb +178 -0
  51. data/lib/mlx_lm/models/gpt_oss.rb +319 -0
  52. data/lib/mlx_lm/models/granite.rb +170 -0
  53. data/lib/mlx_lm/models/granitemoe.rb +58 -0
  54. data/lib/mlx_lm/models/granitemoehybrid.rb +178 -0
  55. data/lib/mlx_lm/models/helium.rb +158 -0
  56. data/lib/mlx_lm/models/hunyuan.rb +378 -0
  57. data/lib/mlx_lm/models/hunyuan_v1_dense.rb +235 -0
  58. data/lib/mlx_lm/models/internlm2.rb +160 -0
  59. data/lib/mlx_lm/models/internlm3.rb +237 -0
  60. data/lib/mlx_lm/models/iquestloopcoder.rb +261 -0
  61. data/lib/mlx_lm/models/jamba.rb +158 -0
  62. data/lib/mlx_lm/models/kimi_k25.rb +98 -0
  63. data/lib/mlx_lm/models/kimi_linear.rb +124 -0
  64. data/lib/mlx_lm/models/kimi_vl.rb +93 -0
  65. data/lib/mlx_lm/models/klear.rb +283 -0
  66. data/lib/mlx_lm/models/lfm2.rb +120 -0
  67. data/lib/mlx_lm/models/lfm2_moe.rb +421 -0
  68. data/lib/mlx_lm/models/lfm2_vl.rb +67 -0
  69. data/lib/mlx_lm/models/lille_130m.rb +148 -0
  70. data/lib/mlx_lm/models/llama.rb +183 -0
  71. data/lib/mlx_lm/models/llama4.rb +357 -0
  72. data/lib/mlx_lm/models/llama4_text.rb +195 -0
  73. data/lib/mlx_lm/models/longcat_flash.rb +153 -0
  74. data/lib/mlx_lm/models/longcat_flash_ngram.rb +137 -0
  75. data/lib/mlx_lm/models/mamba.rb +301 -0
  76. data/lib/mlx_lm/models/mamba2.rb +292 -0
  77. data/lib/mlx_lm/models/mimo.rb +174 -0
  78. data/lib/mlx_lm/models/mimo_v2_flash.rb +491 -0
  79. data/lib/mlx_lm/models/minicpm.rb +169 -0
  80. data/lib/mlx_lm/models/minicpm3.rb +237 -0
  81. data/lib/mlx_lm/models/minimax.rb +282 -0
  82. data/lib/mlx_lm/models/ministral3.rb +304 -0
  83. data/lib/mlx_lm/models/mistral3.rb +84 -0
  84. data/lib/mlx_lm/models/mixtral.rb +192 -0
  85. data/lib/mlx_lm/models/mla.rb +75 -0
  86. data/lib/mlx_lm/models/nanochat.rb +167 -0
  87. data/lib/mlx_lm/models/nemotron.rb +202 -0
  88. data/lib/mlx_lm/models/nemotron_h.rb +212 -0
  89. data/lib/mlx_lm/models/nemotron_nas.rb +404 -0
  90. data/lib/mlx_lm/models/olmo.rb +165 -0
  91. data/lib/mlx_lm/models/olmo2.rb +169 -0
  92. data/lib/mlx_lm/models/olmo3.rb +254 -0
  93. data/lib/mlx_lm/models/olmoe.rb +64 -0
  94. data/lib/mlx_lm/models/openelm.rb +208 -0
  95. data/lib/mlx_lm/models/phi.rb +156 -0
  96. data/lib/mlx_lm/models/phi3.rb +171 -0
  97. data/lib/mlx_lm/models/phi3small.rb +196 -0
  98. data/lib/mlx_lm/models/phimoe.rb +206 -0
  99. data/lib/mlx_lm/models/phixtral.rb +208 -0
  100. data/lib/mlx_lm/models/pipeline.rb +37 -0
  101. data/lib/mlx_lm/models/pixtral.rb +47 -0
  102. data/lib/mlx_lm/models/plamo.rb +169 -0
  103. data/lib/mlx_lm/models/plamo2.rb +173 -0
  104. data/lib/mlx_lm/models/qwen.rb +175 -0
  105. data/lib/mlx_lm/models/qwen2.rb +162 -0
  106. data/lib/mlx_lm/models/qwen2_moe.rb +189 -0
  107. data/lib/mlx_lm/models/qwen2_vl.rb +48 -0
  108. data/lib/mlx_lm/models/qwen3.rb +167 -0
  109. data/lib/mlx_lm/models/qwen3_5.rb +69 -0
  110. data/lib/mlx_lm/models/qwen3_5_moe.rb +54 -0
  111. data/lib/mlx_lm/models/qwen3_moe.rb +166 -0
  112. data/lib/mlx_lm/models/qwen3_next.rb +147 -0
  113. data/lib/mlx_lm/models/qwen3_vl.rb +48 -0
  114. data/lib/mlx_lm/models/qwen3_vl_moe.rb +92 -0
  115. data/lib/mlx_lm/models/recurrent_gemma.rb +444 -0
  116. data/lib/mlx_lm/models/rope_utils.rb +316 -0
  117. data/lib/mlx_lm/models/rwkv7.rb +101 -0
  118. data/lib/mlx_lm/models/seed_oss.rb +167 -0
  119. data/lib/mlx_lm/models/smollm3.rb +89 -0
  120. data/lib/mlx_lm/models/solar_open.rb +79 -0
  121. data/lib/mlx_lm/models/ssm.rb +162 -0
  122. data/lib/mlx_lm/models/stablelm.rb +160 -0
  123. data/lib/mlx_lm/models/starcoder2.rb +161 -0
  124. data/lib/mlx_lm/models/step3p5.rb +479 -0
  125. data/lib/mlx_lm/models/switch_layers.rb +221 -0
  126. data/lib/mlx_lm/models/telechat3.rb +192 -0
  127. data/lib/mlx_lm/models/youtu_llm.rb +230 -0
  128. data/lib/mlx_lm/models.rb +33 -0
  129. data/lib/mlx_lm/perplexity.rb +48 -0
  130. data/lib/mlx_lm/quantize.rb +131 -0
  131. data/lib/mlx_lm/sample_utils.rb +159 -0
  132. data/lib/mlx_lm/server.rb +190 -0
  133. data/lib/mlx_lm/tokenizer_utils.rb +158 -0
  134. data/lib/mlx_lm/tuner/lora.rb +165 -0
  135. data/lib/mlx_lm/version.rb +3 -0
  136. data/lib/mlx_lm/weight_utils.rb +170 -0
  137. data/lib/mlx_lm.rb +135 -0
  138. metadata +272 -0
data/lib/mlx_lm/models/nemotron_nas.rb
@@ -0,0 +1,404 @@
+ require_relative "cache"
+ require_relative "rope_utils"
+
+ module MlxLm
+   module Models
+     module NemotronNas
+       module_function
+
+       def find_multiple(n, k)
+         remainder = n % k
+         remainder.zero? ? n : (n + k - remainder)
+       end
+
+       def ffn_mult_to_intermediate_size(ffn_mult, hidden_size)
+         intermediate_size = (2 * ffn_mult.to_f * hidden_size / 3).to_i
+         find_multiple(intermediate_size, 256)
+       end
+
+       class AttentionConfig
+         attr_reader :no_op, :replace_with_linear, :sparsify, :n_heads_in_group, :window_length,
+                     :num_sink_tokens, :use_prefill_window_in_sink_attention, :unshifted_sink
+
+         def initialize(
+           no_op: false,
+           replace_with_linear: false,
+           sparsify: nil,
+           n_heads_in_group: nil,
+           window_length: nil,
+           num_sink_tokens: nil,
+           use_prefill_window_in_sink_attention: false,
+           unshifted_sink: false
+         )
+           @no_op = no_op
+           @replace_with_linear = replace_with_linear
+           @sparsify = sparsify
+           @n_heads_in_group = n_heads_in_group
+           @window_length = window_length
+           @num_sink_tokens = num_sink_tokens
+           @use_prefill_window_in_sink_attention = use_prefill_window_in_sink_attention
+           @unshifted_sink = unshifted_sink
+
+           if @no_op || @replace_with_linear
+             @n_heads_in_group = nil
+             @window_length = nil
+             @num_sink_tokens = nil
+           else
+             raise ArgumentError, "n_heads_in_group must be specified for active attention blocks" if @n_heads_in_group.nil?
+             raise ArgumentError, "n_heads_in_group must be positive, got #{@n_heads_in_group}" if @n_heads_in_group.to_i <= 0
+           end
+         end
+
+         def self.from_dict(data)
+           hash = _symbolize_keys(data || {})
+           new(**hash)
+         end
+
+         def self._symbolize_keys(hash)
+           hash.each_with_object({}) { |(k, v), out| out[k.to_sym] = v }
+         end
+         private_class_method :_symbolize_keys
+       end
+
+       class FFNConfig
+         attr_reader :no_op, :replace_with_linear, :sparsify, :ffn_mult
+
+         def initialize(
+           no_op: false,
+           replace_with_linear: false,
+           sparsify: nil,
+           ffn_mult: nil
+         )
+           @no_op = no_op
+           @replace_with_linear = replace_with_linear
+           @sparsify = sparsify
+           @ffn_mult = ffn_mult
+
+           if @no_op || @replace_with_linear
+             @ffn_mult = nil
+           else
+             raise ArgumentError, "ffn_mult must be specified for active FFN blocks" if @ffn_mult.nil?
+             @ffn_mult = @ffn_mult.to_f.round(6)
+           end
+         end
+
+         def self.from_dict(data)
+           hash = _symbolize_keys(data || {})
+           new(**hash)
+         end
+
+         def self._symbolize_keys(hash)
+           hash.each_with_object({}) { |(k, v), out| out[k.to_sym] = v }
+         end
+         private_class_method :_symbolize_keys
+       end
+
+       class BlockConfig
+         attr_reader :attention, :ffn
+
+         def initialize(attention:, ffn:)
+           @attention = attention
+           @ffn = ffn
+         end
+
+         def self.from_dict(data)
+           hash = data || {}
+           attention_data = hash["attention"] || hash[:attention] || {}
+           ffn_data = hash["ffn"] || hash[:ffn] || {}
+           new(
+             attention: AttentionConfig.from_dict(attention_data),
+             ffn: FFNConfig.from_dict(ffn_data)
+           )
+         end
+       end
+
+       class ModelArgs < BaseModelArgs
+         field :model_type, default: "nemotron-nas"
+         field :hidden_size, default: 8192
+         field :num_hidden_layers, default: 80
+         field :num_attention_heads, default: 64
+         field :rms_norm_eps, default: 1e-5
+         field :vocab_size, default: 128_256
+         field :block_configs, default: []
+         field :hidden_act, default: "silu"
+         field :attention_bias, default: false
+         field :mlp_bias, default: false
+         field :rope_theta, default: 500_000.0
+         field :rope_scaling, default: nil
+         field :max_position_embeddings, default: 131_072
+         field :tie_word_embeddings, default: false
+
+         def initialize(**kwargs)
+           super
+           @block_configs = Array(@block_configs).map do |config|
+             config.is_a?(BlockConfig) ? config : BlockConfig.from_dict(config)
+           end
+
+           if @block_configs.length != @num_hidden_layers
+             raise ArgumentError,
+                   "Number of block_configs (#{@block_configs.length}) must match num_hidden_layers (#{@num_hidden_layers})"
+           end
+
+           validate_rope_scaling!
+           validate_block_configs!
+         end
+
+         private
+
+         def validate_rope_scaling!
+           return unless @rope_scaling
+
+           factor = rope_scaling_value(:factor)
+           raise ArgumentError, "rope_scaling must contain 'factor'" if factor.nil?
+
+           rope_type = rope_scaling_value(:rope_type) || rope_scaling_value(:type)
+           raise ArgumentError, "rope_scaling must contain 'rope_type'" if rope_type.nil?
+
+           normalized = @rope_scaling.dup
+           normalized["rope_type"] = rope_type
+           normalized[:rope_type] = rope_type
+           @rope_scaling = normalized
+         end
+
+         def rope_scaling_value(key)
+           return nil unless @rope_scaling
+           return @rope_scaling[key] if @rope_scaling.key?(key)
+
+           @rope_scaling[key.to_s]
+         end
+
+         def validate_block_configs!
+           @block_configs.each_with_index do |block_config, i|
+             attention = block_config.attention
+             next if attention.no_op || attention.replace_with_linear
+
+             heads_in_group = attention.n_heads_in_group.to_i
+             if (@num_attention_heads % heads_in_group) != 0
+               raise ArgumentError,
+                     "Layer #{i}: num_attention_heads (#{@num_attention_heads}) must be divisible by n_heads_in_group (#{attention.n_heads_in_group})"
+             end
+           end
+         end
+       end
+
+       class Attention < MLX::NN::Module
+         def initialize(args, attention_config)
+           super()
+
+           dim = args.hidden_size
+           @n_heads = args.num_attention_heads
+           @n_kv_heads = @n_heads / attention_config.n_heads_in_group
+           @head_dim = args.hidden_size / @n_heads
+           raise ArgumentError, "hidden_size (#{dim}) must be divisible by num_attention_heads (#{@n_heads})" if (@head_dim * @n_heads) != dim
+
+           @scale = @head_dim**(-0.5)
+           self.q_proj = MLX::NN::Linear.new(dim, @n_heads * @head_dim, bias: args.attention_bias)
+           self.k_proj = MLX::NN::Linear.new(dim, @n_kv_heads * @head_dim, bias: args.attention_bias)
+           self.v_proj = MLX::NN::Linear.new(dim, @n_kv_heads * @head_dim, bias: args.attention_bias)
+           self.o_proj = MLX::NN::Linear.new(@n_heads * @head_dim, dim, bias: args.attention_bias)
+           self.rope = MlxLm::Models.initialize_rope(
+             @head_dim,
+             args.rope_theta,
+             false,
+             args.rope_scaling,
+             max_position_embeddings: args.max_position_embeddings
+           )
+         end
+
+         def call(x, mask: nil, cache: nil)
+           mx = MLX::Core
+           b, l, _d = x.shape
+
+           queries = q_proj.call(x).reshape([b, l, @n_heads, @head_dim]).transpose([0, 2, 1, 3])
+           keys = k_proj.call(x).reshape([b, l, @n_kv_heads, @head_dim]).transpose([0, 2, 1, 3])
+           values = v_proj.call(x).reshape([b, l, @n_kv_heads, @head_dim]).transpose([0, 2, 1, 3])
+
+           if cache
+             queries = rope.call(queries, offset: cache.offset)
+             keys = rope.call(keys, offset: cache.offset)
+             keys, values = cache.update_and_fetch(keys, values)
+           else
+             queries = rope.call(queries)
+             keys = rope.call(keys)
+           end
+
+           output = mx.scaled_dot_product_attention(queries, keys, values, @scale, mask)
+           output = output.transpose([0, 2, 1, 3]).reshape([b, l, @n_heads * @head_dim])
+           o_proj.call(output)
+         end
+       end
+
+       class MLP < MLX::NN::Module
+         def initialize(args, ffn_config)
+           super()
+           hidden_dim = NemotronNas.ffn_mult_to_intermediate_size(ffn_config.ffn_mult, args.hidden_size)
+           @act_fn = args.hidden_act
+
+           supported = %w[silu relu gelu gelu_new gelu_fast]
+           unless supported.include?(@act_fn)
+             raise ArgumentError, "Unknown activation function: #{@act_fn}"
+           end
+
+           self.gate_proj = MLX::NN::Linear.new(args.hidden_size, hidden_dim, bias: args.mlp_bias)
+           self.down_proj = MLX::NN::Linear.new(hidden_dim, args.hidden_size, bias: args.mlp_bias)
+           self.up_proj = MLX::NN::Linear.new(args.hidden_size, hidden_dim, bias: args.mlp_bias)
+         end
+
+         def call(x)
+           gate = _activate(gate_proj.call(x))
+           down_proj.call(gate * up_proj.call(x))
+         end
+
+         private
+
+         def _activate(x)
+           case @act_fn
+           when "silu"
+             MLX::NN.silu(x)
+           when "relu"
+             MLX::NN.relu(x)
+           when "gelu"
+             MLX::NN.gelu(x)
+           when "gelu_new", "gelu_fast"
+             MLX::NN.gelu_approx(x)
+           else
+             x
+           end
+         end
+       end
+
+       class LinearSubblockReplacement < MLX::NN::Module
+         def initialize(hidden_size, bias)
+           super()
+           self.linear = MLX::NN::Linear.new(hidden_size, hidden_size, bias: bias)
+         end
+
+         def call(x, mask: nil, cache: nil)
+           _ = mask
+           _ = cache
+           linear.call(x)
+         end
+       end
+
+       class TransformerBlock < MLX::NN::Module
+         def initialize(args, layer_idx)
+           super()
+           block_config = args.block_configs[layer_idx]
+           @attention_config = block_config.attention
+           @ffn_config = block_config.ffn
+
+           self.input_layernorm = MLX::NN::RMSNorm.new(args.hidden_size, eps: args.rms_norm_eps) unless @attention_config.no_op
+           self.self_attn = if @attention_config.no_op
+             nil
+           elsif @attention_config.replace_with_linear
+             LinearSubblockReplacement.new(args.hidden_size, args.attention_bias)
+           else
+             Attention.new(args, @attention_config)
+           end
+
+           self.post_attention_layernorm = MLX::NN::RMSNorm.new(args.hidden_size, eps: args.rms_norm_eps) unless @ffn_config.no_op
+           self.mlp = if @ffn_config.no_op
+             nil
+           elsif @ffn_config.replace_with_linear
+             LinearSubblockReplacement.new(args.hidden_size, args.mlp_bias)
+           else
+             MLP.new(args, @ffn_config)
+           end
+         end
+
+         def call(x, mask: nil, cache: nil)
+           if self_attn
+             residual = x
+             h = input_layernorm.call(x)
+             x = residual + self_attn.call(h, mask: mask, cache: cache)
+           end
+
+           if mlp
+             residual = x
+             h = post_attention_layernorm.call(x)
+             x = residual + mlp.call(h)
+           end
+
+           x
+         end
+       end
+
+       class NemotronNASModel < MLX::NN::Module
+         attr_reader :num_attn_layers
+
+         def initialize(args)
+           super()
+           self.embed_tokens = MLX::NN::Embedding.new(args.vocab_size, args.hidden_size)
+           self.layers = Array.new(args.num_hidden_layers) { |layer_idx| TransformerBlock.new(args, layer_idx) }
+           self.norm = MLX::NN::RMSNorm.new(args.hidden_size, eps: args.rms_norm_eps)
+           @num_attn_layers = layers.count { |layer| !layer.self_attn.nil? }
+         end
+
+         def call(inputs, cache: nil)
+           h = embed_tokens.call(inputs)
+           layer_cache = cache || [nil] * @num_attn_layers
+           mask = _create_attention_mask(h, layer_cache[0])
+
+           cache_idx = 0
+           layers.each do |layer|
+             layer_state = if layer.self_attn
+               state = layer_cache[cache_idx]
+               cache_idx += 1
+               state
+             end
+             h = layer.call(h, mask: mask, cache: layer_state)
+           end
+
+           norm.call(h)
+         end
+
+         private
+
+         def _create_attention_mask(h, cache)
+           n = h.shape[1]
+           return cache.make_mask(n) if cache && cache.respond_to?(:make_mask)
+           return nil if n == 1
+
+           "causal"
+         end
+       end
+
+       class Model < MLX::NN::Module
+         def initialize(args)
+           super()
+           @args = args
+           self.model_type = args.model_type
+           self.model = NemotronNASModel.new(args)
+           self.lm_head = MLX::NN::Linear.new(args.hidden_size, args.vocab_size, bias: false) unless args.tie_word_embeddings
+         end
+
+         def call(inputs, cache: nil)
+           out = model.call(inputs, cache: cache)
+           if @args.tie_word_embeddings
+             model.embed_tokens.as_linear(out)
+           else
+             lm_head.call(out)
+           end
+         end
+
+         def sanitize(weights)
+           result = weights.reject { |k, _| k.include?("self_attn.rotary_emb.inv_freq") }
+           result.delete("lm_head.weight") if @args.tie_word_embeddings
+           result
+         end
+
+         def layers
+           model.layers
+         end
+
+         def make_cache
+           layers.filter_map do |layer|
+             MlxLm::KVCache.new if layer.self_attn
+           end
+         end
+       end
+
+       Models.register("nemotron-nas", Model, ModelArgs)
+     end
+   end
+ end
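
A note on the nemotron_nas.rb port above: `block_configs` drives construction layer by layer. A block whose attention entry sets `no_op` gets no attention sub-block at all, `replace_with_linear` swaps it for a plain `Linear`, and only the remaining blocks receive a KV cache from `Model#make_cache`. The stand-alone Ruby sketch below (plain Ruby, no MLX required; the `block_configs` values are hypothetical) mirrors that slot-assignment logic:

```ruby
# Illustrative only: mirrors the make_cache slot assignment with made-up
# block_configs. Only blocks with a real attention layer (neither no_op nor
# replace_with_linear) get a KV-cache entry.
block_configs = [
  { "attention" => { "n_heads_in_group" => 8 },       "ffn" => { "ffn_mult" => 2.5 } },
  { "attention" => { "no_op" => true },               "ffn" => { "ffn_mult" => 2.5 } },
  { "attention" => { "replace_with_linear" => true }, "ffn" => { "no_op" => true } },
  { "attention" => { "n_heads_in_group" => 8 },       "ffn" => { "ffn_mult" => 3.5 } }
]

cached_layers = block_configs.each_with_index.filter_map do |cfg, i|
  att = cfg["attention"]
  i unless att["no_op"] || att["replace_with_linear"]
end

puts "layers that receive a KVCache: #{cached_layers.inspect}"  # => [0, 3]
```

For active FFN blocks, `ffn_mult_to_intermediate_size` rounds the width up to a multiple of 256: with a `hidden_size` of 4096 and an `ffn_mult` of 2.5, `(2 * 2.5 * 4096 / 3).to_i` gives 6826, which `find_multiple` rounds up to 6912.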
data/lib/mlx_lm/models/olmo.rb
@@ -0,0 +1,165 @@
+ module MlxLm
+   module Models
+     module OLMo
+       class ModelArgs < BaseModelArgs
+         field :model_type, default: "olmo"
+         field :d_model, default: nil
+         field :n_layers, default: nil
+         field :mlp_hidden_size, default: nil
+         field :n_heads, default: nil
+         field :vocab_size, default: 50304
+         field :embedding_size, default: nil
+         field :rope_theta, default: 10000.0
+         field :rope_traditional, default: false
+         field :mlp_ratio, default: 4
+         field :weight_tying, default: false
+
+         # Compatibility aliases used in some generic tests/config builders.
+         field :hidden_size, default: nil
+         field :num_hidden_layers, default: nil
+         field :intermediate_size, default: nil
+         field :num_attention_heads, default: nil
+         field :tie_word_embeddings, default: nil
+
+         def initialize(**kwargs)
+           super
+           @d_model = @hidden_size if @hidden_size
+           @n_layers = @num_hidden_layers if @num_hidden_layers
+           @n_heads = @num_attention_heads if @num_attention_heads
+           @mlp_hidden_size = @intermediate_size if @intermediate_size
+           @weight_tying = @tie_word_embeddings unless @tie_word_embeddings.nil?
+
+           @d_model ||= 4096
+           @n_layers ||= 32
+           @n_heads ||= 32
+           @embedding_size ||= @vocab_size
+           @mlp_hidden_size ||= @mlp_ratio * @d_model
+         end
+       end
+
+       class TransformerBlock < MLX::NN::Module
+         def initialize(args)
+           super()
+           dim = args.d_model
+           @n_heads = args.n_heads
+           @head_dim = dim / @n_heads
+           @scale = @head_dim**(-0.5)
+           @ff_hidden_size = args.mlp_hidden_size
+
+           self.ff_proj = MLX::NN::Linear.new(dim, @ff_hidden_size, bias: false)
+           self.ff_out = MLX::NN::Linear.new(@ff_hidden_size / 2, dim, bias: false)
+
+           self.att_norm = MLX::NN::LayerNorm.new(dim, affine: false)
+           self.ff_norm = MLX::NN::LayerNorm.new(dim, affine: false)
+
+           self.att_proj = MLX::NN::Linear.new(dim, 3 * dim, bias: false)
+           self.attn_out = MLX::NN::Linear.new(dim, dim, bias: false)
+
+           self.rope = MLX::NN::RoPE.new(
+             @head_dim,
+             traditional: args.rope_traditional,
+             base: args.rope_theta
+           )
+         end
+
+         def attend(x, mask: nil, cache: nil)
+           mx = MLX::Core
+           b, l, d = x.shape
+
+           qkv = att_proj.call(x)
+           queries, keys, values = mx.split(qkv, [d, 2 * d], 2)
+
+           queries = queries.reshape([b, l, @n_heads, @head_dim]).transpose([0, 2, 1, 3])
+           keys = keys.reshape([b, l, @n_heads, @head_dim]).transpose([0, 2, 1, 3])
+           values = values.reshape([b, l, @n_heads, @head_dim]).transpose([0, 2, 1, 3])
+
+           if cache
+             queries = rope.call(queries, offset: cache.offset)
+             keys = rope.call(keys, offset: cache.offset)
+             keys, values = cache.update_and_fetch(keys, values)
+           else
+             queries = rope.call(queries)
+             keys = rope.call(keys)
+           end
+
+           output = mx.scaled_dot_product_attention(queries, keys, values, @scale, mask)
+           output = output.transpose([0, 2, 1, 3]).reshape([b, l, d])
+           attn_out.call(output)
+         end
+
+         def call(x, mask: nil, cache: nil)
+           mx = MLX::Core
+
+           r = attend(att_norm.call(x), mask: mask, cache: cache)
+           h = x + r
+
+           ff_hidden = ff_proj.call(ff_norm.call(h))
+           x1, x2 = mx.split(ff_hidden, [@ff_hidden_size / 2], 2)
+           h + ff_out.call(Activations.swiglu(x2, x1))
+         end
+       end
+
+       class Transformer < MLX::NN::Module
+         def initialize(args)
+           super()
+           @weight_tying = args.weight_tying
+
+           self.wte = MLX::NN::Embedding.new(args.embedding_size, args.d_model)
+           self.blocks = Array.new(args.n_layers) { TransformerBlock.new(args) }
+           self.ff_out = MLX::NN::Linear.new(args.d_model, args.embedding_size, bias: false) unless @weight_tying
+           self.norm = MLX::NN::LayerNorm.new(args.d_model, affine: false)
+         end
+
+         def call(inputs, cache: nil)
+           h = wte.call(inputs)
+           layer_cache = cache || [nil] * blocks.length
+
+           mask = nil
+           mask = "causal" if h.shape[1] > 1
+
+           blocks.each_with_index do |block, i|
+             h = block.call(h, mask: mask, cache: layer_cache[i])
+           end
+
+           h = norm.call(h)
+
+           if @weight_tying
+             wte.as_linear(h)
+           else
+             ff_out.call(h)
+           end
+         end
+       end
+
+       class OlmoModel < MLX::NN::Module
+         def initialize(args)
+           super()
+           self.transformer = Transformer.new(args)
+         end
+
+         def call(inputs, cache: nil)
+           transformer.call(inputs, cache: cache)
+         end
+       end
+
+       class Model < MLX::NN::Module
+         def initialize(args)
+           super()
+           self.model_type = args.model_type
+           self.model = OlmoModel.new(args)
+           self.args = args
+         end
+
+         def call(inputs, cache: nil)
+           model.call(inputs, cache: cache)
+         end
+
+         def layers
+           model.transformer.blocks
+         end
+       end
+
+       Models.register("olmo", Model, ModelArgs)
+     end
+   end
+ end
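
Unlike the per-projection attention in nemotron_nas.rb, the olmo.rb port fuses Q, K and V into a single `att_proj` and produces both SwiGLU halves from one `ff_proj`, which is why `ff_out` takes `@ff_hidden_size / 2` inputs. A plain-Ruby shape check (numbers taken from the `ModelArgs` defaults above, so they are illustrative only) makes the two splits explicit:

```ruby
# Shape bookkeeping for OLMo's fused projections, using the ModelArgs
# defaults: d_model 4096, n_heads 32, mlp_ratio 4. Illustrative only.
d_model  = 4096
n_heads  = 32
head_dim = d_model / n_heads                # 128

# att_proj is one d_model -> 3 * d_model linear; splitting at [d, 2 * d] on
# the last axis recovers query, key and value, each d_model wide.
qkv_width = 3 * d_model                     # 12_288
q_range   = 0...d_model
k_range   = d_model...(2 * d_model)
v_range   = (2 * d_model)...qkv_width

# ff_proj emits mlp_hidden_size activations that are split in half for
# SwiGLU, so ff_out maps mlp_hidden_size / 2 back down to d_model.
mlp_hidden_size = 4 * d_model               # mlp_ratio * d_model = 16_384
ff_out_inputs   = mlp_hidden_size / 2       # 8_192

chunk_sizes = [q_range, k_range, v_range].map(&:size)
puts "head_dim=#{head_dim} qkv_width=#{qkv_width} qkv_chunks=#{chunk_sizes.inspect} ff_out_in=#{ff_out_inputs}"
```

Keeping the fused layout presumably lets published OLMo checkpoints load without resplitting their projection weights, since no `sanitize` step reshuffles them here.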