ignis-dl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,170 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module AI
5
+ # Loss functions for training.
6
+ # Each returns a scalar Tensor with autograd support.
7
+ module Loss
8
+ class << self
9
# Cross-entropy loss (classification, language modeling).
# Fused log-softmax + NLL for numerical stability, reduced to the mean over rows.
#
# The last axis of +logits+ is the class axis; every leading axis is counted as
# rows (rows = numel / vocab_size). For [batch_size, vocab_size] input this is
# exactly batch_size; for [batch, seq, vocab] input every position is scored
# instead of only the first +batch+ rows.
# NOTE(review): this presumes the kernels index a contiguous buffer as
# row * vocab_size + col — confirm against the kernel source.
#
# @param logits [Tensor] model output [batch_size, vocab_size] (or [..., vocab_size])
# @param targets [Tensor] target indices, one per row (int32)
# @param label_smoothing [Float] label smoothing factor (0.0 = none)
# @return [Tensor] scalar loss
# @raise [ArgumentError] if logits is empty or targets does not hold one index per row
def cross_entropy(logits, targets, label_smoothing: 0.0)
  vocab_size = logits.shape[-1]
  unless vocab_size.is_a?(Integer) && vocab_size.positive?
    raise ArgumentError, "logits needs a non-empty class axis, got shape #{logits.shape.inspect}"
  end

  batch_size = logits.numel / vocab_size
  raise ArgumentError, "logits must contain at least one row" if batch_size.zero?
  unless targets.numel == batch_size
    # A short targets buffer would otherwise be read past its end by the kernel.
    raise ArgumentError, "expected #{batch_size} target indices, got #{targets.numel}"
  end

  # Allocate outputs: one loss per row, plus the log-softmax kept for backward.
  losses_nv = Ignis::Shared::NvArray.new(shape: [batch_size], dtype: :float32,
                                         device_id: logits.device_id)
  losses_nv.from_host(Array.new(batch_size, 0.0))

  log_softmax_nv = Ignis::Shared::NvArray.new(shape: logits.shape, dtype: :float32,
                                              device_id: logits.device_id)
  log_softmax_nv.from_host(Array.new(logits.numel, 0.0))

  # Forward kernel: enough 256-thread blocks to cover batch_size threads.
  kernel = Ignis::JIT::Kernels::Loss.cross_entropy_forward
  kernel.launch(grid: [(batch_size + 255) / 256], block: [256],
                args: [logits.data, targets.data, losses_nv, log_softmax_nv,
                       batch_size, vocab_size, label_smoothing.to_f])

  # Mean reduction of the per-row losses into a single-element buffer.
  mean_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: :float32, device_id: logits.device_id)
  mean_nv.from_host([0.0])
  mean_k = Ignis::JIT::Kernels::Loss.mean_reduce
  mean_k.launch(grid: [1], block: [1], args: [losses_nv, mean_nv, batch_size])

  result = Tensor.new(data: mean_nv, requires_grad: logits.requires_grad, is_leaf: false)

  if logits.requires_grad
    saved_lsm = log_softmax_nv
    saved_targets = targets.data
    Tape.record(result, inputs: [logits]) do |grad|
      # Scale by the MEAN reduction (1/batch_size) AND the chained upstream
      # gradient, so downstream loss scaling (gradient-accumulation division,
      # loss * k, ...) is carried into the logits gradient. grad is read as the
      # scalar [1] cotangent of this scalar loss.
      upstream = grad.to_host[0].to_f
      grad_scale = Ignis::Shared::NvArray.new(shape: [batch_size], dtype: :float32,
                                              device_id: logits.device_id)
      grad_scale.from_host(Array.new(batch_size, upstream / batch_size))

      grad_logits = Ignis::Shared::NvArray.new(shape: logits.shape, dtype: :float32,
                                               device_id: logits.device_id)
      grad_logits.from_host(Array.new(logits.numel, 0.0))

      bk = Ignis::JIT::Kernels::Loss.cross_entropy_backward
      total = batch_size * vocab_size
      bk.launch(grid: [(total + 255) / 256], block: [256],
                args: [saved_lsm, saved_targets, grad_scale,
                       grad_logits, batch_size, vocab_size, label_smoothing.to_f])
      [grad_logits]
    end
  end

  result
end
73
+
74
# Mean squared error loss.
# Runs the mse_forward kernel over all elements, then mean-reduces the
# per-element results into a single-element tensor.
#
# @param predictions [Tensor]
# @param targets [Tensor] must hold as many elements as predictions
# @return [Tensor] scalar loss
# @raise [ArgumentError] if predictions is empty or the element counts differ
def mse(predictions, targets)
  n = predictions.numel
  raise ArgumentError, "predictions must not be empty" if n.zero?
  unless targets.numel == n
    # The kernels are launched over n elements of both buffers.
    raise ArgumentError, "targets has #{targets.numel} elements, expected #{n}"
  end

  losses_nv = Ignis::Shared::NvArray.new(shape: predictions.shape, dtype: :float32,
                                         device_id: predictions.device_id)
  losses_nv.from_host(Array.new(n, 0.0))

  kernel = Ignis::JIT::Kernels::Loss.mse_forward
  kernel.launch(grid: [(n + 255) / 256], block: [256],
                args: [predictions.data, targets.data, losses_nv, n])

  # Mean
  mean_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: :float32,
                                       device_id: predictions.device_id)
  mean_nv.from_host([0.0])
  mean_k = Ignis::JIT::Kernels::Loss.mean_reduce
  mean_k.launch(grid: [1], block: [1], args: [losses_nv, mean_nv, n])

  result = Tensor.new(data: mean_nv, requires_grad: predictions.requires_grad, is_leaf: false)

  if predictions.requires_grad
    saved_pred = predictions.data
    saved_tgt = targets.data
    Tape.record(result, inputs: [predictions]) do |grad|
      grad_input = Ignis::Shared::NvArray.new(shape: predictions.shape, dtype: :float32,
                                              device_id: predictions.device_id)
      grad_input.from_host(Array.new(n, 0.0))
      bk = Ignis::JIT::Kernels::Loss.mse_backward
      # 1/n accounts for the mean reduction; the upstream grad buffer is handed
      # to the kernel as-is (presumably combined there — confirm in the kernel).
      scale = 1.0 / n
      bk.launch(grid: [(n + 255) / 256], block: [256],
                args: [saved_pred, saved_tgt, grad, grad_input, n, scale.to_f])
      [grad_input]
    end
  end

  result
end
115
+
116
# Binary cross-entropy with logits (sigmoid applied inside).
# Runs the bce_forward kernel over all elements, then mean-reduces the
# per-element results into a single-element tensor.
#
# @param logits [Tensor]
# @param targets [Tensor] (0.0 or 1.0), must hold as many elements as logits
# @return [Tensor] scalar loss
# @raise [ArgumentError] if logits is empty or the element counts differ
def binary_cross_entropy(logits, targets)
  n = logits.numel
  raise ArgumentError, "logits must not be empty" if n.zero?
  unless targets.numel == n
    # The kernels are launched over n elements of both buffers.
    raise ArgumentError, "targets has #{targets.numel} elements, expected #{n}"
  end

  losses_nv = Ignis::Shared::NvArray.new(shape: logits.shape, dtype: :float32,
                                         device_id: logits.device_id)
  losses_nv.from_host(Array.new(n, 0.0))

  kernel = Ignis::JIT::Kernels::Loss.bce_forward
  kernel.launch(grid: [(n + 255) / 256], block: [256],
                args: [logits.data, targets.data, losses_nv, n])

  mean_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: :float32,
                                       device_id: logits.device_id)
  mean_nv.from_host([0.0])
  mean_k = Ignis::JIT::Kernels::Loss.mean_reduce
  mean_k.launch(grid: [1], block: [1], args: [losses_nv, mean_nv, n])

  result = Tensor.new(data: mean_nv, requires_grad: logits.requires_grad, is_leaf: false)

  if logits.requires_grad
    saved_logits = logits.data
    saved_targets = targets.data
    Tape.record(result, inputs: [logits]) do |grad|
      grad_input = Ignis::Shared::NvArray.new(shape: logits.shape, dtype: :float32,
                                              device_id: logits.device_id)
      grad_input.from_host(Array.new(n, 0.0))
      bk = Ignis::JIT::Kernels::Loss.bce_backward
      # NOTE(review): no explicit 1/n scale is passed here (mse passes one);
      # presumably bce_backward derives it from n — confirm in the kernel.
      bk.launch(grid: [(n + 255) / 256], block: [256],
                args: [saved_logits, saved_targets, grad, grad_input, n])
      [grad_input]
    end
  end

  result
end
155
+
156
+ # KL divergence (intended): KL(p || q) = sum(p * log(p/q)) = sum(p * (log(p) - log(q))).
157
+ # @param log_q [Tensor] log probabilities of model
158
+ # @param p [Tensor] target distribution
159
+ # @return [Tensor] the result of (p * diff).sum; see the FIXME in the body before treating it as a KL value
160
+ def kl_divergence(log_q, p)
161
+ # Intended formula: KL = sum(p * (log(p) - log_q)).
162
+ # FIXME(review): built from existing tensor ops only, and no logarithm of p is ever taken: the next line evaluates (p * (p.relu + 1e-8)) - log_q, so the sum below is not the KL formula above.
163
+ diff = p * (p.relu + Tensor.from_host([1e-8], shape: [1], device_id: p.device_id)) - log_q # `*` binds tighter than `-`
164
+ loss = (p * diff).sum
165
+ loss
166
+ end
167
+ end
168
+ end
169
+ end
170
+ end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "module"
4
+
5
+ module Ignis
6
+ module AI
7
+ module NN
8
# Dropout: randomly zeros elements during training.
# In eval mode, passes input through unchanged.
# Uses inverted dropout: surviving elements are scaled by 1/(1-p) during training.
class Dropout < Module
  # @param p [Float] probability of zeroing an element (0.0 to 1.0)
  # @raise [ArgumentError] if p is not a real number within 0.0..1.0
  def initialize(p: 0.1)
    # NaN fails between?, so it is rejected together with out-of-range values.
    unless p.is_a?(Numeric) && p.real? && p.between?(0, 1)
      raise ArgumentError, "dropout probability must be within 0.0..1.0, got #{p.inspect}"
    end

    super()
    @p = p
  end

  # Forward pass.
  # In training mode: apply a random mask and scale survivors by 1/(1-p).
  # In eval mode (or when p is zero): return the input object itself.
  # @param x [Tensor]
  # @return [Tensor]
  def forward(x)
    return x unless @training
    return x if @p.zero?

    # Generate bernoulli mask on host, transfer to GPU.
    # ::Kernel is referenced from the top level so the stdlib RNG is used even
    # if an Ignis namespace defines its own Kernel constant.
    n = x.numel
    # For p == 1 this is Infinity, but it is never selected: rand is always < 1.
    scale = 1.0 / (1.0 - @p)
    mask_data = Array.new(n) { ::Kernel.rand >= @p ? scale : 0.0 }

    mask_nv = Ignis::Shared::NvArray.new(shape: x.shape, dtype: x.dtype, device_id: x.device_id)
    mask_nv.from_host(mask_data)

    # Elementwise multiply: result = x * mask
    result_nv = Ignis::Shared::NvArray.new(shape: x.shape, dtype: x.dtype, device_id: x.device_id)
    result_nv.from_host(Array.new(n, 0.0))

    kernel = Ignis::JIT::Kernels::Elementwise.mul_forward
    kernel.launch(grid: [(n + 255) / 256], block: [256],
                  args: [x.data, mask_nv, result_nv, n])

    result = Tensor.new(data: result_nv, requires_grad: x.requires_grad, is_leaf: false)

    if x.requires_grad
      Tape.record(result, inputs: [x]) do |grad|
        # Backward: multiply grad by the same mask used in the forward pass.
        grad_in = Ignis::Shared::NvArray.new(shape: grad.shape, dtype: grad.dtype,
                                             device_id: grad.device_id)
        grad_in.from_host(Array.new(grad.numel, 0.0))
        mk = Ignis::JIT::Kernels::Elementwise.mul_forward
        mk.launch(grid: [(grad.numel + 255) / 256], block: [256],
                  args: [grad, mask_nv, grad_in, grad.numel])
        [grad_in]
      end
    end

    result
  end

  # @return [String]
  def to_s
    "Dropout(p=#{@p})"
  end
end
66
+ end
67
+ end
68
+ end
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "module"
4
+
5
+ module Ignis
6
+ module AI
7
+ module NN
8
# Embedding layer: maps integer indices to dense vectors.
# Forward uses the gather_rows JIT kernel.
# Backward uses the scatter_add JIT kernel.
class Embedding < Module
  # @return [Tensor] weight matrix [num_embeddings, embedding_dim]
  attr_reader :weight

  # @param num_embeddings [Integer] vocabulary size
  # @param embedding_dim [Integer] dimension of embedding vectors
  # @param device_id [Integer]
  # @raise [ArgumentError] if either size is not a positive Integer
  def initialize(num_embeddings, embedding_dim, device_id: 0)
    # Reject sizes that would yield an infinite init scale (dim 0) or a
    # Math::DomainError (negative dim) further down.
    { num_embeddings: num_embeddings, embedding_dim: embedding_dim }.each do |label, size|
      next if size.is_a?(Integer) && size.positive?

      raise ArgumentError, "#{label} must be a positive Integer, got #{size.inspect}"
    end

    super()
    @num_embeddings = num_embeddings
    @embedding_dim = embedding_dim

    # Initialize uniform[-scale, scale] (scale = 1/sqrt(embedding_dim)) via a
    # device kernel rather than a host Array, which is infeasible for large
    # vocabularies. The kaiming uniform kernel is launched with bound=scale.
    scale = 1.0 / Math.sqrt(embedding_dim)
    weight_nv = Ignis::Shared::NvArray.new(shape: [num_embeddings, embedding_dim],
                                           dtype: :float32, device_id: device_id)
    weight_nv.to_device
    n = num_embeddings * embedding_dim
    init_kernel = Ignis::JIT::Kernels::Elementwise.kaiming_uniform_init
    init_kernel.launch(grid: [(n + 255) / 256], block: [256],
                       args: [weight_nv, scale.to_f, Ignis::JIT::Kernel::U64.new(::Random.new.rand(2**64)), n])

    @weight = register_parameter("weight",
                                 Tensor.new(data: weight_nv, requires_grad: true))
  end

  # Forward pass: gather rows from weight table.
  # @param indices [Tensor] integer indices [batch_size, seq_len] (int32 on GPU)
  # @return [Tensor] embeddings with shape indices.shape + [embedding_dim]
  def forward(indices)
    num_indices = indices.numel
    total = num_indices * @embedding_dim
    output_shape = indices.shape + [@embedding_dim]
    output_nv = Ignis::Shared::NvArray.new(shape: output_shape, dtype: :float32,
                                           device_id: @weight.device_id)
    output_nv.from_host(Array.new(total, 0.0))

    kernel = Ignis::JIT::Kernels::Elementwise.gather_rows
    kernel.launch(grid: [(total + 255) / 256], block: [256],
                  args: [@weight.data, indices.data, output_nv, num_indices, @embedding_dim])

    result = Tensor.new(data: output_nv,
                        requires_grad: @weight.requires_grad,
                        is_leaf: false)

    if @weight.requires_grad
      saved_indices = indices.data
      Tape.record(result, inputs: [@weight]) do |grad|
        # scatter_add: accumulate gradients for each embedding index into a
        # zero-initialised buffer shaped like the weight table.
        grad_weight = Ignis::Shared::NvArray.new(
          shape: [@num_embeddings, @embedding_dim],
          dtype: :float32, device_id: @weight.device_id
        )
        grad_weight.from_host(Array.new(@num_embeddings * @embedding_dim, 0.0))

        scatter_k = Ignis::JIT::Kernels::Elementwise.scatter_add
        scatter_k.launch(grid: [(total + 255) / 256], block: [256],
                         args: [grad, saved_indices, grad_weight, num_indices, @embedding_dim])
        [grad_weight]
      end
    end

    result
  end

  # @return [String]
  def to_s
    "Embedding(num=#{@num_embeddings}, dim=#{@embedding_dim})"
  end
end
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "module"
4
+
5
+ module Ignis
6
+ module AI
7
+ module NN
8
# Layer normalization: y = gamma * (x - mean) / sqrt(var + eps) + beta
# Normalizes along the last dimension(s).
class LayerNorm < Module
  # @return [Tensor] gamma (scale)
  attr_reader :weight

  # @return [Tensor] beta (shift)
  attr_reader :bias

  # Builds gamma (all ones) and beta (all zeros), both of length
  # normalized_shape, and registers them as "weight" and "bias".
  # @param normalized_shape [Integer] size of the last dimension
  # @param eps [Float] epsilon for numerical stability
  # @param device_id [Integer]
  def initialize(normalized_shape, eps: 1e-5, device_id: 0)
    super()
    @normalized_shape = normalized_shape
    @eps = eps

    @weight = constant_parameter("weight", 1.0, device_id)
    @bias = constant_parameter("bias", 0.0, device_id)
  end

  # Forward pass: delegates to Tensor#layer_norm with this layer's parameters.
  # @param x [Tensor] input tensor [*, normalized_shape]
  # @return [Tensor] normalized tensor
  def forward(x)
    x.layer_norm(@weight, @bias, eps: @eps)
  end

  # @return [String]
  def to_s
    "LayerNorm(#{@normalized_shape}, eps=#{@eps})"
  end

  private

  # Creates a float32 vector of length @normalized_shape filled with +fill+
  # and registers it as a trainable parameter called +name+.
  def constant_parameter(name, fill, device_id)
    buffer = Ignis::Shared::NvArray.new(shape: [@normalized_shape],
                                        dtype: :float32, device_id: device_id)
    buffer.from_host(Array.new(@normalized_shape, fill))
    register_parameter(name, Tensor.new(data: buffer, requires_grad: true))
  end
end
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "module"
4
+
5
+ module Ignis
6
+ module AI
7
+ module NN
8
# Linear layer: y = x @ W^T + b
# The product is delegated to Tensor#matmul with transpose_b: true.
class Linear < Module
  # Largest value + 1 representable by the 64-bit seed handed to the init kernel.
  SEED_MODULUS = 2**64

  # @return [Tensor] weight matrix [out_features, in_features]
  attr_reader :weight

  # @return [Tensor, nil] bias vector [out_features]
  attr_reader :bias

  # @param in_features [Integer]
  # @param out_features [Integer]
  # @param bias [Boolean] whether to include bias
  # @param device_id [Integer]
  # @raise [ArgumentError] if either feature count is not a positive Integer
  def initialize(in_features, out_features, bias: true, device_id: 0)
    # in_features == 0 would otherwise produce an infinite init bound.
    { in_features: in_features, out_features: out_features }.each do |label, size|
      next if size.is_a?(Integer) && size.positive?

      raise ArgumentError, "#{label} must be a positive Integer, got #{size.inspect}"
    end

    super()
    @in_features = in_features
    @out_features = out_features

    # Kaiming uniform initialization: bound = sqrt(6 / in_features)
    bound = Math.sqrt(6.0 / in_features)

    weight_nv = Ignis::Shared::NvArray.new(shape: [out_features, in_features],
                                           dtype: :float32, device_id: device_id)
    # Allocate on device without building a host Array (infeasible for very
    # large weights); the init kernel is relied on to write every element.
    weight_nv.to_device

    # Initialize with Kaiming uniform via JIT kernel
    init_kernel = Ignis::JIT::Kernels::Elementwise.kaiming_uniform_init
    n = out_features * in_features
    seed = ::Random.new.rand(SEED_MODULUS) # stdlib RNG (Ignis::Random is the cuRAND module)
    init_kernel.launch(grid: [(n + 255) / 256], block: [256],
                       args: [weight_nv, bound.to_f, Ignis::JIT::Kernel::U64.new(seed), n])

    @weight = register_parameter("weight",
                                 Tensor.new(data: weight_nv, requires_grad: true))

    if bias
      bias_nv = Ignis::Shared::NvArray.new(shape: [out_features],
                                           dtype: :float32, device_id: device_id)
      bias_bound = 1.0 / Math.sqrt(in_features)
      bias_nv.to_device
      # Wrap so that seed == 2**64 - 1 does not produce a value outside 64 bits.
      bias_seed = (seed + 1) % SEED_MODULUS
      init_kernel.launch(grid: [(out_features + 255) / 256], block: [256],
                         args: [bias_nv, bias_bound.to_f, Ignis::JIT::Kernel::U64.new(bias_seed), out_features])

      @bias = register_parameter("bias",
                                 Tensor.new(data: bias_nv, requires_grad: true))
    else
      @bias = nil
    end
  end

  # Forward pass: x @ W^T + b
  # @param x [Tensor] input [*, in_features]
  # @return [Tensor] output [*, out_features]
  def forward(x)
    # The transpose is requested from matmul (transpose_b) instead of
    # materializing W^T on every forward pass.
    out = x.matmul(@weight, transpose_b: true)
    # Bias is [out_features]; add_bias applies it to the [*, out_features] output.
    out = out.add_bias(@bias) if @bias
    out
  end

  # @return [String]
  def to_s
    "Linear(in=#{@in_features}, out=#{@out_features}, bias=#{!@bias.nil?})"
  end
end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,178 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ignis
4
+ module AI
5
+ module NN
6
# Module — base class for all neural network layers.
#
# Provides parameter management, state dict serialization,
# training/eval mode toggling, and automatic parameter collection.
#
# @example
#   class MyModel < Ignis::AI::NN::Module
#     def initialize
#       super
#       @linear = register_module("linear", Linear.new(768, 256))
#     end
#
#     def forward(x)
#       @linear.call(x).relu
#     end
#   end
class Module
  # @return [Boolean] whether in training mode
  attr_reader :training

  # Starts in training mode with no submodules and no parameters.
  def initialize
    @training = true
    @modules = {}         # name → Module
    @parameters_map = {}  # name → Tensor
  end

  # Override in subclass to define forward pass.
  # @raise [NotImplementedError]
  def forward(*args)
    raise NotImplementedError, "#{self.class}#forward must be implemented"
  end

  # Make modules callable. Forwards positional AND keyword args (e.g. a
  # `mask:` keyword) plus any block through to #forward.
  # @return [Tensor]
  def call(*args, **kwargs, &block)
    forward(*args, **kwargs, &block)
  end

  # Collect every registered parameter, own ones first, then those of the
  # submodules in registration order. A tensor object registered in more
  # than one place (tied weights) is returned once, so #num_parameters and
  # #zero_grad! do not process it twice.
  # @return [Array<Tensor>]
  def parameters
    collected = @parameters_map.values
    @modules.each_value { |mod| collected.concat(mod.parameters) }
    # Identity-based: two distinct tensors that merely compare equal are both kept.
    collected.uniq(&:object_id)
  end

  # Named parameters as a flat hash; nested names are joined with ".".
  # Every registered name is kept, including names that share one tensor.
  # @param prefix [String] prefix for nested module names
  # @return [Hash{String => Tensor}]
  def named_parameters(prefix: "")
    qualify = ->(name) { prefix.empty? ? name : "#{prefix}.#{name}" }

    result = {}
    @parameters_map.each { |name, param| result[qualify.call(name)] = param }
    @modules.each do |name, mod|
      result.merge!(mod.named_parameters(prefix: qualify.call(name)))
    end
    result
  end

  # Switch this module and all submodules to training mode.
  # @return [self]
  def train!
    @training = true
    @modules.each_value(&:train!)
    self
  end

  # Switch this module and all submodules to eval mode.
  # @return [self]
  def eval!
    @training = false
    @modules.each_value(&:eval!)
    self
  end

  # Move all parameters to a specific device by copying them through the host.
  # Parameters already on the requested device are left untouched.
  # @param device_id [Integer]
  # @return [self]
  def to(device_id:)
    parameters.each do |p|
      next if p.device_id == device_id

      host_data = p.to_host
      new_nv = Ignis::Shared::NvArray.new(shape: p.shape, dtype: p.dtype, device_id: device_id)
      new_nv.from_host(host_data)
      # The tensor object itself is kept; only its backing buffer is swapped.
      p.instance_variable_set(:@data, new_nv)
    end
    self
  end

  # Zero all gradients.
  # @return [void]
  def zero_grad!
    parameters.each(&:zero_grad!)
  end

  # Export state dict (parameter name → Ruby Array of values).
  # @return [Hash{String => Array}]
  def state_dict
    named_parameters.transform_values(&:to_host)
  end

  # Load state dict from Hash of {name → Ruby Array}.
  # Keys may be Strings or Symbols; they are compared by their String form,
  # matching how register_parameter / register_module store names.
  # @param dict [Hash{String, Symbol => Array}]
  # @param strict [Boolean] if true, raises on missing/unexpected keys
  # @return [self]
  # @raise [KeyError] in strict mode when keys are missing or unexpected
  def load_state_dict(dict, strict: true)
    params = named_parameters
    incoming = dict.transform_keys(&:to_s)

    if strict
      missing = params.keys - incoming.keys
      unexpected = incoming.keys - params.keys
      raise KeyError, "Missing keys: #{missing}" unless missing.empty?
      raise KeyError, "Unexpected keys: #{unexpected}" unless unexpected.empty?
    end

    incoming.each do |name, values|
      # In non-strict mode unknown names are skipped silently.
      next unless params.key?(name)

      params[name].data.from_host(values)
    end

    self
  end

  # Total parameter count.
  # @return [Integer]
  def num_parameters
    parameters.sum(&:numel)
  end

  # String representation: class name followed by one line per submodule.
  # @return [String]
  def to_s
    parts = ["#{self.class.name}("]
    @modules.each do |name, mod|
      parts << "  (#{name}): #{mod}"
    end
    parts << ")"
    parts.join("\n")
  end

  protected

  # Register a parameter under the String form of +name+.
  # @param name [String]
  # @param tensor [Tensor]
  # @return [Tensor]
  def register_parameter(name, tensor)
    @parameters_map[name.to_s] = tensor
    tensor
  end

  # Register a submodule under the String form of +name+.
  # @param name [String]
  # @param mod [Module]
  # @return [Module]
  def register_module(name, mod)
    @modules[name.to_s] = mod
    mod
  end
end
176
+ end
177
+ end
178
+ end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "module"
4
+
5
+ module Ignis
6
+ module AI
7
+ module NN
8
# Root-mean-square layer normalization: y = gamma * x / sqrt(mean(x^2) + eps).
# Used by Llama, Qwen, Mistral, SmolLM, Phi. In contrast to LayerNorm there is
# no mean-subtraction and no bias — the only parameter is a learned
# per-feature scale (gamma).
class RMSNorm < Module
  # @return [Tensor] gamma (scale), initialized to ones
  attr_reader :weight

  # Creates gamma as a float32 vector of ones and registers it as "weight".
  # @param normalized_shape [Integer] size of the last dimension
  # @param eps [Float] epsilon for numerical stability (Llama/Qwen use 1e-6/1e-5)
  # @param device_id [Integer]
  def initialize(normalized_shape, eps: 1e-6, device_id: 0)
    super()
    @normalized_shape = normalized_shape
    @eps = eps

    gamma = Ignis::Shared::NvArray.new(
      shape: [normalized_shape],
      dtype: :float32,
      device_id: device_id
    )
    gamma.from_host(Array.new(normalized_shape, 1.0))

    trainable = Tensor.new(data: gamma, requires_grad: true)
    @weight = register_parameter("weight", trainable)
  end

  # Delegates to Tensor#rms_norm with this layer's scale and epsilon.
  # @param x [Tensor] input [*, normalized_shape]
  # @return [Tensor]
  def forward(x)
    x.rms_norm(@weight, eps: @eps)
  end

  # @return [String]
  def to_s
    "RMSNorm(#{@normalized_shape}, eps=#{@eps})"
  end
end
41
+ end
42
+ end
43
+ end