ignis-dl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -0
- data/lib/ignis-dl.rb +48 -0
- data/lib/nnw/ai/gpt2_loader.rb +144 -0
- data/lib/nnw/ai/inference.rb +224 -0
- data/lib/nnw/ai/kv_cache.rb +79 -0
- data/lib/nnw/ai/llama_loader.rb +100 -0
- data/lib/nnw/ai/loss.rb +170 -0
- data/lib/nnw/ai/nn/dropout.rb +68 -0
- data/lib/nnw/ai/nn/embedding.rb +86 -0
- data/lib/nnw/ai/nn/layer_norm.rb +54 -0
- data/lib/nnw/ai/nn/linear.rb +80 -0
- data/lib/nnw/ai/nn/module.rb +178 -0
- data/lib/nnw/ai/nn/rms_norm.rb +43 -0
- data/lib/nnw/ai/nn/sequential.rb +52 -0
- data/lib/nnw/ai/optim/adam.rb +63 -0
- data/lib/nnw/ai/optim/adamw.rb +63 -0
- data/lib/nnw/ai/optim/base.rb +90 -0
- data/lib/nnw/ai/optim/lr_scheduler.rb +118 -0
- data/lib/nnw/ai/optim/sgd.rb +49 -0
- data/lib/nnw/ai/safetensors.rb +220 -0
- data/lib/nnw/ai/server.rb +268 -0
- data/lib/nnw/ai/tokenizer.rb +413 -0
- data/lib/nnw/ai/trainer.rb +245 -0
- data/lib/nnw/ai/transformer/attention.rb +89 -0
- data/lib/nnw/ai/transformer/block.rb +90 -0
- data/lib/nnw/ai/transformer/feed_forward.rb +53 -0
- data/lib/nnw/ai/transformer/model.rb +189 -0
- data/lib/nnw/ai/transformer/modern.rb +191 -0
- data/lib/nnw/ai/transformer/swiglu.rb +39 -0
- data/lib/nnw/ai/weight_map.rb +139 -0
- metadata +91 -0
data/lib/nnw/ai/loss.rb
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module AI
|
|
5
|
+
# Loss functions for training.
|
|
6
|
+
# Each returns a scalar Tensor with autograd support.
|
|
7
|
+
module Loss
|
|
8
|
+
class << self
|
|
9
|
+
# Cross-entropy loss (classification, language modeling).
|
|
10
|
+
# Fused log-softmax + NLL for numerical stability.
|
|
11
|
+
# @param logits [Tensor] model output [batch_size, vocab_size]
|
|
12
|
+
# @param targets [Tensor] target indices [batch_size] (int32)
|
|
13
|
+
# @param label_smoothing [Float] label smoothing factor (0.0 = none)
|
|
14
|
+
# @return [Tensor] scalar loss
|
|
15
|
+
def cross_entropy(logits, targets, label_smoothing: 0.0)
  # The class axis is always the last one; every leading axis is flattened
  # into rows. [batch, vocab] behaves exactly as before (rows == shape[0]),
  # while [batch, seq, vocab] or a single [vocab] vector now get the correct
  # row count instead of silently using shape[0].
  vocab_size = logits.shape[-1]
  if vocab_size.nil? || vocab_size.zero? || logits.numel.zero?
    raise ArgumentError, "cross_entropy requires non-empty logits, got shape #{logits.shape.inspect}"
  end

  batch_size = logits.numel / vocab_size
  unless targets.numel == batch_size
    raise ArgumentError,
          "cross_entropy expects #{batch_size} target indices for logits of shape " \
          "#{logits.shape.inspect}, got #{targets.numel}"
  end

  smoothing = label_smoothing.to_f
  device = logits.device_id

  # Builds a float32 buffer on the logits' device, zero-filled from the host.
  # NOTE(review): the fill goes through a Ruby Array of `count` Floats, which
  # is expensive for [rows, vocab]-sized buffers — consider a device-side fill.
  zeros = lambda do |shape, count|
    buffer = Ignis::Shared::NvArray.new(shape: shape, dtype: :float32, device_id: device)
    buffer.from_host(Array.new(count, 0.0))
    buffer
  end

  # Per-row losses plus the log-softmax that the backward kernel reuses.
  losses_nv = zeros.call([batch_size], batch_size)
  log_softmax_nv = zeros.call(logits.shape, logits.numel)

  # Forward kernel: launched with one thread per row.
  Ignis::JIT::Kernels::Loss.cross_entropy_forward.launch(
    grid: [(batch_size + 255) / 256], block: [256],
    args: [logits.data, targets.data, losses_nv, log_softmax_nv,
           batch_size, vocab_size, smoothing]
  )

  # Mean reduction over the per-row losses.
  mean_nv = zeros.call([1], 1)
  Ignis::JIT::Kernels::Loss.mean_reduce.launch(
    grid: [1], block: [1], args: [losses_nv, mean_nv, batch_size]
  )

  result = Tensor.new(data: mean_nv, requires_grad: logits.requires_grad, is_leaf: false)
  return result unless logits.requires_grad

  saved_targets = targets.data
  Tape.record(result, inputs: [logits]) do |grad|
    # Chain rule: scale by the mean reduction (1 / rows) AND by the upstream
    # cotangent, so downstream loss scaling (gradient accumulation, loss * k)
    # is not dropped. grad is the scalar [1] cotangent of this scalar loss.
    upstream = grad.to_host[0].to_f
    grad_scale = Ignis::Shared::NvArray.new(shape: [batch_size], dtype: :float32,
                                            device_id: device)
    grad_scale.from_host(Array.new(batch_size, upstream / batch_size))

    grad_logits = zeros.call(logits.shape, logits.numel)

    # Backward kernel: launched with one thread per logit.
    total = batch_size * vocab_size
    Ignis::JIT::Kernels::Loss.cross_entropy_backward.launch(
      grid: [(total + 255) / 256], block: [256],
      args: [log_softmax_nv, saved_targets, grad_scale,
             grad_logits, batch_size, vocab_size, smoothing]
    )
    [grad_logits]
  end

  result
end
|
|
73
|
+
|
|
74
|
+
# Mean squared error loss.
|
|
75
|
+
# @param predictions [Tensor]
|
|
76
|
+
# @param targets [Tensor]
|
|
77
|
+
# @return [Tensor] scalar loss
|
|
78
|
+
def mse(predictions, targets)
  n = predictions.numel
  # The kernels index both buffers elementwise over n, so a size mismatch
  # would read past the end of the target buffer; reject it up front.
  unless targets.numel == n
    raise ArgumentError, "mse: predictions have #{n} elements but targets have #{targets.numel}"
  end

  device = predictions.device_id
  shape = predictions.shape

  # Per-element losses, same shape as the predictions.
  losses_nv = Ignis::Shared::NvArray.new(shape: shape, dtype: :float32, device_id: device)
  losses_nv.from_host(Array.new(n, 0.0))

  Ignis::JIT::Kernels::Loss.mse_forward.launch(
    grid: [(n + 255) / 256], block: [256],
    args: [predictions.data, targets.data, losses_nv, n]
  )

  # Mean over all elements -> scalar [1].
  mean_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: :float32, device_id: device)
  mean_nv.from_host([0.0])
  Ignis::JIT::Kernels::Loss.mean_reduce.launch(
    grid: [1], block: [1], args: [losses_nv, mean_nv, n]
  )

  result = Tensor.new(data: mean_nv, requires_grad: predictions.requires_grad, is_leaf: false)
  return result unless predictions.requires_grad

  saved_pred = predictions.data
  saved_tgt = targets.data
  Tape.record(result, inputs: [predictions]) do |grad|
    grad_input = Ignis::Shared::NvArray.new(shape: shape, dtype: :float32, device_id: device)
    grad_input.from_host(Array.new(n, 0.0))

    # The upstream grad is handed to the kernel together with the 1/n factor
    # of the mean reduction.
    Ignis::JIT::Kernels::Loss.mse_backward.launch(
      grid: [(n + 255) / 256], block: [256],
      args: [saved_pred, saved_tgt, grad, grad_input, n, (1.0 / n).to_f]
    )
    [grad_input]
  end

  result
end
|
|
115
|
+
|
|
116
|
+
# Binary cross-entropy with logits (sigmoid applied inside).
|
|
117
|
+
# @param logits [Tensor]
|
|
118
|
+
# @param targets [Tensor] (0.0 or 1.0)
|
|
119
|
+
# @return [Tensor] scalar loss
|
|
120
|
+
def binary_cross_entropy(logits, targets)
  n = logits.numel
  # Both kernels walk logits and targets elementwise over n; a shorter target
  # buffer would be read out of bounds, so mismatched sizes are rejected.
  unless targets.numel == n
    raise ArgumentError,
          "binary_cross_entropy: logits have #{n} elements but targets have #{targets.numel}"
  end

  device = logits.device_id
  shape = logits.shape

  # Per-element losses, same shape as the logits.
  losses_nv = Ignis::Shared::NvArray.new(shape: shape, dtype: :float32, device_id: device)
  losses_nv.from_host(Array.new(n, 0.0))

  Ignis::JIT::Kernels::Loss.bce_forward.launch(
    grid: [(n + 255) / 256], block: [256],
    args: [logits.data, targets.data, losses_nv, n]
  )

  # Mean over all elements -> scalar [1].
  mean_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: :float32, device_id: device)
  mean_nv.from_host([0.0])
  Ignis::JIT::Kernels::Loss.mean_reduce.launch(
    grid: [1], block: [1], args: [losses_nv, mean_nv, n]
  )

  result = Tensor.new(data: mean_nv, requires_grad: logits.requires_grad, is_leaf: false)
  return result unless logits.requires_grad

  saved_logits = logits.data
  saved_targets = targets.data
  Tape.record(result, inputs: [logits]) do |grad|
    grad_input = Ignis::Shared::NvArray.new(shape: shape, dtype: :float32, device_id: device)
    grad_input.from_host(Array.new(n, 0.0))

    # The upstream grad is passed straight to the backward kernel.
    Ignis::JIT::Kernels::Loss.bce_backward.launch(
      grid: [(n + 255) / 256], block: [256],
      args: [saved_logits, saved_targets, grad, grad_input, n]
    )
    [grad_input]
  end

  result
end
|
|
155
|
+
|
|
156
|
+
# KL divergence: KL(p || q) = sum(p * log(p/q))
|
|
157
|
+
# @param log_q [Tensor] log probabilities of model
|
|
158
|
+
# @param p [Tensor] target distribution
|
|
159
|
+
# @return [Tensor] scalar loss
|
|
160
|
+
def kl_divergence(log_q, p)
  # KL(p || q) = sum(p * (log(p) - log_q)).
  #
  # The previous expression never took a logarithm: it evaluated
  # p * (relu(p) + eps) - log_q and then multiplied by p again.
  #
  # eps keeps log() finite where p == 0; those terms still contribute 0
  # because they are multiplied by p afterwards. relu / max(…, 0) clamps any
  # negative entries before the log.
  eps = 1e-8

  log_p =
    if p.respond_to?(:log)
      # Device path: stays within tensor ops when Tensor provides #log.
      (p.relu + Tensor.from_host([eps], shape: [1], device_id: p.device_id)).log
    else
      # Host fallback, using only Tensor calls already used in this file.
      # log(p) is built as a fresh tensor here, so it is treated as a
      # constant of the target distribution.
      host_log = p.to_host.flatten.map { |value| Math.log([value.to_f, 0.0].max + eps) }
      Tensor.from_host(host_log, shape: p.shape, device_id: p.device_id)
    end

  (p * (log_p - log_q)).sum
end
|
|
167
|
+
end
|
|
168
|
+
end
|
|
169
|
+
end
|
|
170
|
+
end
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "module"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module AI
|
|
7
|
+
module NN
|
|
8
|
+
# Dropout: randomly zeros elements during training.
|
|
9
|
+
# In eval mode, passes input through unchanged.
|
|
10
|
+
# Uses inverted dropout: scale by 1/(1-p) during training.
|
|
11
|
+
class Dropout < Module
  # @param p [Float] probability of zeroing an element (0.0 to 1.0)
  # @raise [ArgumentError] if p is not a number within [0.0, 1.0]
  def initialize(p: 0.1)
    super()
    # Out-of-range (or NaN) probabilities would silently produce a wrong mask
    # or scale, so they are rejected here. Range#cover? is false for NaN and
    # for non-numeric values.
    unless (0.0..1.0).cover?(p)
      raise ArgumentError, "dropout probability must be within [0.0, 1.0], got #{p.inspect}"
    end

    @p = p
  end

  # Forward pass.
  # In training mode: apply a random mask and scale survivors by 1/(1-p).
  # In eval mode (or when p is 0): pass the input through untouched.
  # @param x [Tensor]
  # @return [Tensor]
  def forward(x)
    return x unless @training
    return x if @p.zero?

    # Bernoulli mask generated on the host, then transferred to the device.
    # Kept entries carry the inverted-dropout scale so no rescaling is needed
    # at eval time. ::Kernel is fully qualified so that an Ignis-level
    # `Kernel` constant can never shadow Ruby's Kernel.rand.
    n = x.numel
    scale = 1.0 / (1.0 - @p)
    mask_data = Array.new(n) { ::Kernel.rand >= @p ? scale : 0.0 }

    mask_nv = Ignis::Shared::NvArray.new(shape: x.shape, dtype: x.dtype, device_id: x.device_id)
    mask_nv.from_host(mask_data)

    # result = x * mask (elementwise)
    result_nv = Ignis::Shared::NvArray.new(shape: x.shape, dtype: x.dtype, device_id: x.device_id)
    result_nv.from_host(Array.new(n, 0.0))

    Ignis::JIT::Kernels::Elementwise.mul_forward.launch(
      grid: [(n + 255) / 256], block: [256],
      args: [x.data, mask_nv, result_nv, n]
    )

    result = Tensor.new(data: result_nv, requires_grad: x.requires_grad, is_leaf: false)

    if x.requires_grad
      Tape.record(result, inputs: [x]) do |grad|
        # Backward: the gradient flows through the same mask.
        grad_in = Ignis::Shared::NvArray.new(shape: grad.shape, dtype: grad.dtype,
                                             device_id: grad.device_id)
        grad_in.from_host(Array.new(grad.numel, 0.0))
        Ignis::JIT::Kernels::Elementwise.mul_forward.launch(
          grid: [(grad.numel + 255) / 256], block: [256],
          args: [grad, mask_nv, grad_in, grad.numel]
        )
        [grad_in]
      end
    end

    result
  end

  # @return [String]
  def to_s
    "Dropout(p=#{@p})"
  end
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "module"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module AI
|
|
7
|
+
module NN
|
|
8
|
+
# Embedding layer: maps integer indices to dense vectors.
|
|
9
|
+
# Forward uses gather_rows JIT kernel.
|
|
10
|
+
# Backward uses scatter_add with atomicAdd.
|
|
11
|
+
class Embedding < Module
|
|
12
|
+
# @return [Tensor] weight matrix [num_embeddings, embedding_dim]
|
|
13
|
+
attr_reader :weight
|
|
14
|
+
|
|
15
|
+
# @param num_embeddings [Integer] vocabulary size
|
|
16
|
+
# @param embedding_dim [Integer] dimension of embedding vectors
|
|
17
|
+
# @param device_id [Integer]
|
|
18
|
+
def initialize(num_embeddings, embedding_dim, device_id: 0)
|
|
19
|
+
super()
|
|
20
|
+
@num_embeddings = num_embeddings
|
|
21
|
+
@embedding_dim = embedding_dim
|
|
22
|
+
|
|
23
|
+
# Initialize uniform[-scale, scale] (scale = 1/sqrt(embedding_dim)) via a
|
|
24
|
+
# device kernel. The old host Array.new(num_embeddings*embedding_dim) was a
|
|
25
|
+
# 262M-element / ~10GB array for a 128k-vocab model — infeasible. The kaiming
|
|
26
|
+
# uniform kernel with bound=scale produces the same [-scale, scale] range.
|
|
27
|
+
scale = 1.0 / Math.sqrt(embedding_dim)
|
|
28
|
+
weight_nv = Ignis::Shared::NvArray.new(shape: [num_embeddings, embedding_dim],
|
|
29
|
+
dtype: :float32, device_id: device_id)
|
|
30
|
+
weight_nv.to_device
|
|
31
|
+
n = num_embeddings * embedding_dim
|
|
32
|
+
init_kernel = Ignis::JIT::Kernels::Elementwise.kaiming_uniform_init
|
|
33
|
+
init_kernel.launch(grid: [(n + 255) / 256], block: [256],
|
|
34
|
+
args: [weight_nv, scale.to_f, Ignis::JIT::Kernel::U64.new(::Random.new.rand(2**64)), n])
|
|
35
|
+
|
|
36
|
+
@weight = register_parameter("weight",
|
|
37
|
+
Tensor.new(data: weight_nv, requires_grad: true))
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Forward pass: gather rows from weight table.
|
|
41
|
+
# @param indices [Tensor] integer indices [batch_size, seq_len] (int32 on GPU)
|
|
42
|
+
# @return [Tensor] embeddings [batch_size, seq_len, embedding_dim]
|
|
43
|
+
def forward(indices)
|
|
44
|
+
num_indices = indices.numel
|
|
45
|
+
output_shape = indices.shape + [@embedding_dim]
|
|
46
|
+
output_nv = Ignis::Shared::NvArray.new(shape: output_shape, dtype: :float32,
|
|
47
|
+
device_id: @weight.device_id)
|
|
48
|
+
output_nv.from_host(Array.new(num_indices * @embedding_dim, 0.0))
|
|
49
|
+
|
|
50
|
+
kernel = Ignis::JIT::Kernels::Elementwise.gather_rows
|
|
51
|
+
total = num_indices * @embedding_dim
|
|
52
|
+
kernel.launch(grid: [(total + 255) / 256], block: [256],
|
|
53
|
+
args: [@weight.data, indices.data, output_nv, num_indices, @embedding_dim])
|
|
54
|
+
|
|
55
|
+
result = Tensor.new(data: output_nv,
|
|
56
|
+
requires_grad: @weight.requires_grad,
|
|
57
|
+
is_leaf: false)
|
|
58
|
+
|
|
59
|
+
if @weight.requires_grad
|
|
60
|
+
saved_indices = indices.data
|
|
61
|
+
saved_weight = @weight
|
|
62
|
+
Tape.record(result, inputs: [@weight]) do |grad|
|
|
63
|
+
# scatter_add: accumulate gradients for each embedding index
|
|
64
|
+
grad_weight = Ignis::Shared::NvArray.new(
|
|
65
|
+
shape: [@num_embeddings, @embedding_dim],
|
|
66
|
+
dtype: :float32, device_id: @weight.device_id)
|
|
67
|
+
grad_weight.from_host(Array.new(@num_embeddings * @embedding_dim, 0.0))
|
|
68
|
+
|
|
69
|
+
scatter_k = Ignis::JIT::Kernels::Elementwise.scatter_add
|
|
70
|
+
scatter_k.launch(grid: [(total + 255) / 256], block: [256],
|
|
71
|
+
args: [grad, saved_indices, grad_weight, num_indices, @embedding_dim])
|
|
72
|
+
[grad_weight]
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
result
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# @return [String]
|
|
80
|
+
def to_s
|
|
81
|
+
"Embedding(num=#{@num_embeddings}, dim=#{@embedding_dim})"
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "module"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module AI
|
|
7
|
+
module NN
|
|
8
|
+
# Layer normalization: y = gamma * (x - mean) / sqrt(var + eps) + beta
|
|
9
|
+
# Normalizes along the last dimension(s).
|
|
10
|
+
class LayerNorm < Module
|
|
11
|
+
# @return [Tensor] gamma (scale)
|
|
12
|
+
attr_reader :weight
|
|
13
|
+
|
|
14
|
+
# @return [Tensor] beta (shift)
|
|
15
|
+
attr_reader :bias
|
|
16
|
+
|
|
17
|
+
# @param normalized_shape [Integer] size of the last dimension
|
|
18
|
+
# @param eps [Float] epsilon for numerical stability
|
|
19
|
+
# @param device_id [Integer]
|
|
20
|
+
def initialize(normalized_shape, eps: 1e-5, device_id: 0)
|
|
21
|
+
super()
|
|
22
|
+
@normalized_shape = normalized_shape
|
|
23
|
+
@eps = eps
|
|
24
|
+
|
|
25
|
+
# Initialize weight (gamma) to ones
|
|
26
|
+
weight_nv = Ignis::Shared::NvArray.new(shape: [normalized_shape],
|
|
27
|
+
dtype: :float32, device_id: device_id)
|
|
28
|
+
weight_nv.from_host(Array.new(normalized_shape, 1.0))
|
|
29
|
+
@weight = register_parameter("weight",
|
|
30
|
+
Tensor.new(data: weight_nv, requires_grad: true))
|
|
31
|
+
|
|
32
|
+
# Initialize bias (beta) to zeros
|
|
33
|
+
bias_nv = Ignis::Shared::NvArray.new(shape: [normalized_shape],
|
|
34
|
+
dtype: :float32, device_id: device_id)
|
|
35
|
+
bias_nv.from_host(Array.new(normalized_shape, 0.0))
|
|
36
|
+
@bias = register_parameter("bias",
|
|
37
|
+
Tensor.new(data: bias_nv, requires_grad: true))
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Forward pass: applies layer normalization.
|
|
41
|
+
# @param x [Tensor] input tensor [*, normalized_shape]
|
|
42
|
+
# @return [Tensor] normalized tensor
|
|
43
|
+
def forward(x)
|
|
44
|
+
x.layer_norm(@weight, @bias, eps: @eps)
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# @return [String]
|
|
48
|
+
def to_s
|
|
49
|
+
"LayerNorm(#{@normalized_shape}, eps=#{@eps})"
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "module"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module AI
|
|
7
|
+
module NN
|
|
8
|
+
# Linear layer: y = x @ W^T + b
|
|
9
|
+
# Uses cuBLAS via Ignis::LinAlg::Matmul for the hot path.
|
|
10
|
+
class Linear < Module
|
|
11
|
+
# @return [Tensor] weight matrix [out_features, in_features]
|
|
12
|
+
attr_reader :weight
|
|
13
|
+
|
|
14
|
+
# @return [Tensor, nil] bias vector [out_features]
|
|
15
|
+
attr_reader :bias
|
|
16
|
+
|
|
17
|
+
# @param in_features [Integer]
|
|
18
|
+
# @param out_features [Integer]
|
|
19
|
+
# @param bias [Boolean] whether to include bias
|
|
20
|
+
# @param device_id [Integer]
|
|
21
|
+
def initialize(in_features, out_features, bias: true, device_id: 0)
|
|
22
|
+
super()
|
|
23
|
+
@in_features = in_features
|
|
24
|
+
@out_features = out_features
|
|
25
|
+
|
|
26
|
+
# Kaiming uniform initialization: bound = sqrt(6 / in_features)
|
|
27
|
+
bound = Math.sqrt(6.0 / in_features)
|
|
28
|
+
|
|
29
|
+
weight_nv = Ignis::Shared::NvArray.new(shape: [out_features, in_features],
|
|
30
|
+
dtype: :float32, device_id: device_id)
|
|
31
|
+
# Allocate on device (no host Array — for a 128k-vocab tied head that array
|
|
32
|
+
# would be 262M Ruby Floats / ~10GB). The kaiming kernel writes every element.
|
|
33
|
+
weight_nv.to_device
|
|
34
|
+
|
|
35
|
+
# Initialize with Kaiming uniform via JIT kernel
|
|
36
|
+
init_kernel = Ignis::JIT::Kernels::Elementwise.kaiming_uniform_init
|
|
37
|
+
n = out_features * in_features
|
|
38
|
+
seed = ::Random.new.rand(2**64) # stdlib RNG (Ignis::Random is the cuRAND module)
|
|
39
|
+
init_kernel.launch(grid: [(n + 255) / 256], block: [256],
|
|
40
|
+
args: [weight_nv, bound.to_f, Ignis::JIT::Kernel::U64.new(seed), n])
|
|
41
|
+
|
|
42
|
+
@weight = register_parameter("weight",
|
|
43
|
+
Tensor.new(data: weight_nv, requires_grad: true))
|
|
44
|
+
|
|
45
|
+
if bias
|
|
46
|
+
bias_nv = Ignis::Shared::NvArray.new(shape: [out_features],
|
|
47
|
+
dtype: :float32, device_id: device_id)
|
|
48
|
+
bias_bound = 1.0 / Math.sqrt(in_features)
|
|
49
|
+
bias_nv.to_device
|
|
50
|
+
init_kernel.launch(grid: [(out_features + 255) / 256], block: [256],
|
|
51
|
+
args: [bias_nv, bias_bound.to_f, Ignis::JIT::Kernel::U64.new(seed + 1), out_features])
|
|
52
|
+
|
|
53
|
+
@bias = register_parameter("bias",
|
|
54
|
+
Tensor.new(data: bias_nv, requires_grad: true))
|
|
55
|
+
else
|
|
56
|
+
@bias = nil
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
# Forward pass: x @ W^T + b
|
|
61
|
+
# @param x [Tensor] input [*, in_features]
|
|
62
|
+
# @return [Tensor] output [*, out_features]
|
|
63
|
+
def forward(x)
|
|
64
|
+
# x @ W^T, with cuBLAS doing the transpose in the GEMM (transpose_b) —
|
|
65
|
+
# avoids materializing W^T every forward (the LM head's was a 765ms
|
|
66
|
+
# transpose of a 38M-element weight).
|
|
67
|
+
out = x.matmul(@weight, transpose_b: true)
|
|
68
|
+
# Bias is [out_features]; broadcast-add it across rows of [*, out_features].
|
|
69
|
+
out = out.add_bias(@bias) if @bias
|
|
70
|
+
out
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# @return [String]
|
|
74
|
+
def to_s
|
|
75
|
+
"Linear(in=#{@in_features}, out=#{@out_features}, bias=#{!@bias.nil?})"
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
  module AI
    module NN
      # Module — base class for all neural network layers.
      #
      # Provides parameter management, state dict serialization,
      # training/eval mode toggling, and automatic parameter collection.
      #
      # @example
      #   class MyModel < Ignis::AI::NN::Module
      #     def initialize
      #       super
      #       @linear = register_module("linear", Linear.new(768, 256))
      #     end
      #
      #     def forward(x)
      #       @linear.call(x).relu
      #     end
      #   end
      class Module
        # @return [Boolean] whether in training mode
        attr_reader :training

        def initialize
          @training = true
          @modules = {}        # name → Module
          @parameters_map = {} # name → Tensor
        end

        # Override in subclass to define forward pass.
        # @raise [NotImplementedError]
        def forward(*args)
          raise NotImplementedError, "#{self.class}#forward must be implemented"
        end

        # Make modules callable. Forwards positional AND keyword args (e.g. a
        # `mask:` keyword) plus any block through to #forward.
        # @return [Tensor]
        def call(*args, **kwargs, &block)
          forward(*args, **kwargs, &block)
        end

        # Collect all registered parameters recursively: this module's own
        # first, then each submodule's in registration order.
        #
        # A tensor registered more than once (tied weights) is returned only
        # once, at its first position, so consumers iterating this list never
        # see the same tensor twice. De-duplication is by object identity,
        # never by value.
        # @return [Array<Tensor>]
        def parameters
          own = @parameters_map.values
          nested = @modules.each_value.flat_map(&:parameters)
          (own + nested).uniq(&:object_id)
        end

        # Named parameters as a flat hash. Nested names are joined with ".".
        # Tied tensors appear under every name they were registered with.
        # @param prefix [String] prefix for nested module names
        # @return [Hash{String => Tensor}]
        def named_parameters(prefix: "")
          qualify = ->(name) { prefix.empty? ? name : "#{prefix}.#{name}" }

          result = {}
          @parameters_map.each { |name, param| result[qualify.call(name)] = param }
          @modules.each do |name, mod|
            result.merge!(mod.named_parameters(prefix: qualify.call(name)))
          end
          result
        end

        # Switch this module and all submodules to training mode.
        # @return [self]
        def train!
          @training = true
          @modules.each_value(&:train!)
          self
        end

        # Switch this module and all submodules to eval mode.
        # @return [self]
        def eval!
          @training = false
          @modules.each_value(&:eval!)
          self
        end

        # Move all parameters to a specific device via a host round trip.
        # Parameters already on the target device are left alone.
        # @param device_id [Integer]
        # @return [self]
        def to(device_id:)
          parameters.each do |param|
            next if param.device_id == device_id

            host_data = param.to_host
            moved = Ignis::Shared::NvArray.new(shape: param.shape, dtype: param.dtype,
                                               device_id: device_id)
            moved.from_host(host_data)
            # Swaps the tensor's backing storage in place so existing
            # references to the parameter keep working.
            param.instance_variable_set(:@data, moved)
          end
          self
        end

        # Zero all gradients.
        # @return [void]
        def zero_grad!
          parameters.each(&:zero_grad!)
        end

        # Export state dict (parameter name → Ruby Array of values).
        # @return [Hash{String => Array}]
        def state_dict
          named_parameters.transform_values(&:to_host)
        end

        # Load state dict from Hash of {name → Ruby Array}.
        # @param dict [Hash{String => Array}]
        # @param strict [Boolean] if true, raises on missing/unexpected keys
        # @raise [KeyError] in strict mode when the key sets differ
        # @return [self]
        def load_state_dict(dict, strict: true)
          params = named_parameters

          if strict
            missing = params.keys - dict.keys
            unexpected = dict.keys - params.keys
            raise KeyError, "Missing keys: #{missing}" unless missing.empty?
            raise KeyError, "Unexpected keys: #{unexpected}" unless unexpected.empty?
          end

          # Non-strict mode silently skips entries without a matching parameter.
          dict.each do |name, values|
            next unless params.key?(name)

            params[name].data.from_host(values)
          end

          self
        end

        # Total parameter count (tied tensors are counted once).
        # @return [Integer]
        def num_parameters
          parameters.sum(&:numel)
        end

        # String representation listing direct submodules, one per line.
        # @return [String]
        def to_s
          lines = ["#{self.class.name}("]
          @modules.each { |name, mod| lines << "  (#{name}): #{mod}" }
          lines << ")"
          lines.join("\n")
        end

        protected

        # Register a parameter (leaf tensor with requires_grad: true).
        # @param name [String]
        # @param tensor [Tensor]
        # @return [Tensor]
        def register_parameter(name, tensor)
          @parameters_map[name.to_s] = tensor
          tensor
        end

        # Register a submodule.
        # @param name [String]
        # @param mod [Module]
        # @return [Module]
        def register_module(name, mod)
          @modules[name.to_s] = mod
          mod
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "module"
|
|
4
|
+
|
|
5
|
+
module Ignis
|
|
6
|
+
module AI
|
|
7
|
+
module NN
|
|
8
|
+
# Root-mean-square layer normalization: y = gamma * x / sqrt(mean(x^2) + eps).
|
|
9
|
+
# Used by Llama, Qwen, Mistral, SmolLM, Phi. Unlike LayerNorm there is no
|
|
10
|
+
# mean-subtraction and no bias — only a learned per-feature scale (gamma).
|
|
11
|
+
class RMSNorm < Module
|
|
12
|
+
# @return [Tensor] gamma (scale), initialized to ones
|
|
13
|
+
attr_reader :weight
|
|
14
|
+
|
|
15
|
+
# @param normalized_shape [Integer] size of the last dimension
|
|
16
|
+
# @param eps [Float] epsilon for numerical stability (Llama/Qwen use 1e-6/1e-5)
|
|
17
|
+
# @param device_id [Integer]
|
|
18
|
+
def initialize(normalized_shape, eps: 1e-6, device_id: 0)
|
|
19
|
+
super()
|
|
20
|
+
@normalized_shape = normalized_shape
|
|
21
|
+
@eps = eps
|
|
22
|
+
|
|
23
|
+
weight_nv = Ignis::Shared::NvArray.new(shape: [normalized_shape],
|
|
24
|
+
dtype: :float32, device_id: device_id)
|
|
25
|
+
weight_nv.from_host(Array.new(normalized_shape, 1.0))
|
|
26
|
+
@weight = register_parameter("weight",
|
|
27
|
+
Tensor.new(data: weight_nv, requires_grad: true))
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# @param x [Tensor] input [*, normalized_shape]
|
|
31
|
+
# @return [Tensor]
|
|
32
|
+
def forward(x)
|
|
33
|
+
x.rms_norm(@weight, eps: @eps)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# @return [String]
|
|
37
|
+
def to_s
|
|
38
|
+
"RMSNorm(#{@normalized_shape}, eps=#{@eps})"
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
end
|