ignis-dl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -0
- data/lib/ignis-dl.rb +48 -0
- data/lib/nnw/ai/gpt2_loader.rb +144 -0
- data/lib/nnw/ai/inference.rb +224 -0
- data/lib/nnw/ai/kv_cache.rb +79 -0
- data/lib/nnw/ai/llama_loader.rb +100 -0
- data/lib/nnw/ai/loss.rb +170 -0
- data/lib/nnw/ai/nn/dropout.rb +68 -0
- data/lib/nnw/ai/nn/embedding.rb +86 -0
- data/lib/nnw/ai/nn/layer_norm.rb +54 -0
- data/lib/nnw/ai/nn/linear.rb +80 -0
- data/lib/nnw/ai/nn/module.rb +178 -0
- data/lib/nnw/ai/nn/rms_norm.rb +43 -0
- data/lib/nnw/ai/nn/sequential.rb +52 -0
- data/lib/nnw/ai/optim/adam.rb +63 -0
- data/lib/nnw/ai/optim/adamw.rb +63 -0
- data/lib/nnw/ai/optim/base.rb +90 -0
- data/lib/nnw/ai/optim/lr_scheduler.rb +118 -0
- data/lib/nnw/ai/optim/sgd.rb +49 -0
- data/lib/nnw/ai/safetensors.rb +220 -0
- data/lib/nnw/ai/server.rb +268 -0
- data/lib/nnw/ai/tokenizer.rb +413 -0
- data/lib/nnw/ai/trainer.rb +245 -0
- data/lib/nnw/ai/transformer/attention.rb +89 -0
- data/lib/nnw/ai/transformer/block.rb +90 -0
- data/lib/nnw/ai/transformer/feed_forward.rb +53 -0
- data/lib/nnw/ai/transformer/model.rb +189 -0
- data/lib/nnw/ai/transformer/modern.rb +191 -0
- data/lib/nnw/ai/transformer/swiglu.rb +39 -0
- data/lib/nnw/ai/weight_map.rb +139 -0
- metadata +91 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module AI
|
|
5
|
+
module Transformer
|
|
6
|
+
# Modern decoder components: RoPE + GQA attention, and a Llama/Qwen/SmolLM-style
|
|
7
|
+
# block (pre-RMSNorm, RoPE-GQA attention, SwiGLU MLP) and model (no learned
|
|
8
|
+
# position embedding — RoPE supplies position). These target Qwen3, SmolLM3,
|
|
9
|
+
# Phi, Llama-family architectures.
|
|
10
|
+
|
|
11
|
+
# Compute the RoPE inverse-frequency table [head_dim/2], optionally applying
# a scaling scheme. Supports the "llama3" rope_type (Llama-3.1/3.2): low
# frequencies are divided by `factor`, high frequencies are kept, and a smooth
# interpolation bridges the two — matching HF's _compute_llama3_parameters.
# Any other rope_type (or no scaling hash at all) returns the unscaled table.
# @param head_dim [Integer]
# @param base [Float] rope_theta
# @param scaling [Hash, nil] e.g. {rope_type:"llama3", factor:, low_freq_factor:,
#   high_freq_factor:, original_max_position_embeddings:} (symbol or string keys)
# @return [Array<Float>] length head_dim/2
# @raise [ArgumentError] if rope_type is "llama3" and one of the four numeric
#   fields is missing or not positive
def self.compute_inv_freq(head_dim, base, scaling = nil)
  half = head_dim / 2
  freqs = (0...half).map { |i| base.to_f**(-2.0 * i / head_dim) }
  return freqs unless scaling

  # Look a field up under its symbol key first, then under its string key.
  g = ->(k) { scaling[k] || scaling[k.to_s] }
  return freqs unless (g.call(:rope_type) || g.call(:type)) == "llama3"

  # nil.to_f is 0.0, and a zero factor / zero freq-factor turns the divisions
  # below into Infinity or NaN for the whole table. Reject that up front
  # instead of handing a corrupted table to every attention layer.
  positive = lambda do |key|
    raw = g.call(key)
    raise ArgumentError, "llama3 rope scaling is missing #{key}" if raw.nil?

    value = raw.to_f
    raise ArgumentError, "llama3 rope scaling #{key} must be positive (got #{raw.inspect})" unless value.positive?

    value
  end

  factor = positive.call(:factor)
  low_ff = positive.call(:low_freq_factor)
  high_ff = positive.call(:high_freq_factor)
  old_ctx = positive.call(:original_max_position_embeddings)

  # Wavelength thresholds separating the three regimes.
  low_wl = old_ctx / low_ff
  high_wl = old_ctx / high_ff

  freqs.map do |f|
    wl = 2.0 * Math::PI / f
    if wl > low_wl
      # Long wavelength (low frequency): fully scaled.
      f / factor
    elsif wl < high_wl
      # Short wavelength (high frequency): untouched.
      f
    else
      # In between: blend linearly between the scaled and unscaled value.
      smooth = (old_ctx / wl - low_ff) / (high_ff - low_ff)
      (1.0 - smooth) * (f / factor) + smooth * f
    end
  end
end
|
|
45
|
+
|
|
46
|
+
# Self-attention layer combining rotary position embeddings (RoPE) with
# grouped-query attention (GQA). Projections default to bias-free
# (Llama/Qwen convention).
class RopeGqaAttention < NN::Module
  # @param embed_dim [Integer]
  # @param num_heads [Integer] query heads
  # @param num_kv_heads [Integer] key/value heads (== num_heads ⇒ plain MHA)
  # @param head_dim [Integer, nil] per-head dim (default embed_dim/num_heads)
  # @param rope_base [Float] RoPE theta
  # @param rope_scaling [Hash, nil] forwarded to Transformer.compute_inv_freq
  # @param bias [Boolean]
  # @param device_id [Integer]
  # @raise [ArgumentError] if num_heads is not a multiple of num_kv_heads, or
  #   head_dim is odd or larger than 128
  def initialize(embed_dim, num_heads, num_kv_heads:, head_dim: nil,
                 rope_base: 10000.0, rope_scaling: nil, bias: false, device_id: 0)
    super()
    if num_heads % num_kv_heads != 0
      raise ArgumentError, "num_heads must be a multiple of num_kv_heads"
    end

    @embed_dim = embed_dim
    @num_heads = num_heads
    @num_kv_heads = num_kv_heads
    @head_dim = head_dim || (embed_dim / num_heads)

    # Reject unsupported head sizes at construction time: RoPE needs an even
    # head_dim, and the flash kernels cap head_dim at 128.
    unless @head_dim.even?
      raise ArgumentError, "head_dim must be even for RoPE (got #{@head_dim})"
    end
    if @head_dim > 128
      raise ArgumentError, "head_dim #{@head_dim} exceeds flash-attention HEAD_DIM_MAX (128)"
    end

    @rope_base = rope_base
    # The (optionally scaled) inv_freq table is built once here and reused by
    # every forward call.
    @inv_freq = Transformer.compute_inv_freq(@head_dim, rope_base, rope_scaling)

    query_width = num_heads * @head_dim
    kv_width = num_kv_heads * @head_dim
    linear = lambda do |in_dim, out_dim|
      NN::Linear.new(in_dim, out_dim, bias: bias, device_id: device_id)
    end
    @q_proj = register_module("q_proj", linear.call(embed_dim, query_width))
    @k_proj = register_module("k_proj", linear.call(embed_dim, kv_width))
    @v_proj = register_module("v_proj", linear.call(embed_dim, kv_width))
    @o_proj = register_module("o_proj", linear.call(query_width, embed_dim))
  end

  # @param x [Tensor] [seq, embed_dim]
  # @param pos_offset [Integer] absolute position of row 0 (for KV-cache decode)
  # @return [Tensor] [seq, embed_dim]
  def forward(x, pos_offset: 0)
    rope_opts = { pos_offset: pos_offset, inv_freq: @inv_freq }
    # Only Q and K are rotated; V is used as projected. Q carries num_heads
    # heads, K carries num_kv_heads.
    queries = @q_proj.call(x).rope(num_heads: @num_heads, **rope_opts)
    keys = @k_proj.call(x).rope(num_heads: @num_kv_heads, **rope_opts)
    values = @v_proj.call(x)
    attended = queries.sdpa(keys, values, num_heads: @num_heads,
                                          num_kv_heads: @num_kv_heads, causal: true)
    @o_proj.call(attended)
  end

  # @return [String] one-line summary of the layer's dimensions
  def to_s
    "RopeGqaAttention(embed=#{@embed_dim}, q_heads=#{@num_heads}, " \
      "kv_heads=#{@num_kv_heads}, head_dim=#{@head_dim})"
  end
end
|
|
96
|
+
|
|
97
|
+
# Pre-norm decoder block in the Llama/Qwen style. Two residual steps:
#   x = x + attn(rmsnorm(x))
#   x = x + swiglu(rmsnorm(x))
class ModernBlock < NN::Module
  # @param embed_dim [Integer]
  # @param num_heads [Integer]
  # @param num_kv_heads [Integer]
  # @param ff_dim [Integer]
  # @param rope_base [Float]
  # @param rope_scaling [Hash, nil] passed through to RopeGqaAttention
  # @param head_dim [Integer, nil] passed through to RopeGqaAttention
  # @param eps [Float] RMSNorm epsilon
  # @param device_id [Integer]
  def initialize(embed_dim, num_heads, num_kv_heads:, ff_dim:,
                 rope_base: 10000.0, rope_scaling: nil, head_dim: nil, eps: 1e-6, device_id: 0)
    super()
    attention = RopeGqaAttention.new(
      embed_dim, num_heads,
      num_kv_heads: num_kv_heads,
      head_dim: head_dim,
      rope_base: rope_base,
      rope_scaling: rope_scaling,
      device_id: device_id
    )
    @attn = register_module("attn", attention)
    @mlp = register_module("mlp", SwiGLU.new(embed_dim, ff_dim, device_id: device_id))

    norm_opts = { eps: eps, device_id: device_id }
    @norm1 = register_module("norm1", NN::RMSNorm.new(embed_dim, **norm_opts))
    @norm2 = register_module("norm2", NN::RMSNorm.new(embed_dim, **norm_opts))
  end

  # @param x [Tensor] [seq, embed]
  # @return [Tensor]
  def forward(x)
    # Residual 1: attention over the normalised input.
    after_attn = x + @attn.call(@norm1.call(x))
    # Residual 2: feed-forward over the normalised attention output.
    after_attn + @mlp.call(@norm2.call(after_attn))
  end

  # @return [String]
  def to_s
    "ModernBlock(#{@attn}, #{@mlp})"
  end
end
|
|
130
|
+
|
|
131
|
+
# Decoder-only language model in the Llama/Qwen/SmolLM style. The pipeline is
# token embedding → num_layers × ModernBlock → final RMSNorm → bias-free LM head.
# There is no learned position embedding; position comes from RoPE inside
# each block's attention.
class ModernModel < NN::Module
  # @return [Integer]
  attr_reader :vocab_size, :embed_dim, :num_heads, :num_kv_heads, :num_layers, :max_seq_len

  # @param vocab_size [Integer]
  # @param embed_dim [Integer]
  # @param num_heads [Integer]
  # @param num_kv_heads [Integer]
  # @param num_layers [Integer]
  # @param ff_dim [Integer]
  # @param max_seq_len [Integer] stored and exposed via the reader; not
  #   otherwise used in this class
  # @param rope_base [Float]
  # @param rope_scaling [Hash, nil] passed through to every block
  # @param head_dim [Integer, nil] per-head dim (default embed_dim/num_heads)
  # @param eps [Float]
  # @param device_id [Integer]
  def initialize(vocab_size:, embed_dim:, num_heads:, num_kv_heads:, num_layers:,
                 ff_dim:, max_seq_len:, rope_base: 10000.0, rope_scaling: nil,
                 head_dim: nil, eps: 1e-6, device_id: 0)
    super()
    @vocab_size = vocab_size
    @embed_dim = embed_dim
    @num_heads = num_heads
    @num_kv_heads = num_kv_heads
    @num_layers = num_layers
    @max_seq_len = max_seq_len
    @device_id = device_id
    @head_dim = head_dim || (embed_dim / num_heads)

    embedding = NN::Embedding.new(vocab_size, embed_dim, device_id: device_id)
    @token_embedding = register_module("token_embedding", embedding)

    # Every layer is built with the same hyper-parameters.
    block_opts = {
      num_kv_heads: num_kv_heads,
      ff_dim: ff_dim,
      rope_base: rope_base,
      rope_scaling: rope_scaling,
      head_dim: head_dim,
      eps: eps,
      device_id: device_id
    }
    @blocks = []
    num_layers.times do |layer|
      built = ModernBlock.new(embed_dim, num_heads, **block_opts)
      @blocks << register_module("blocks.#{layer}", built)
    end

    @norm = register_module("norm", NN::RMSNorm.new(embed_dim, eps: eps, device_id: device_id))
    @head = register_module("head", NN::Linear.new(embed_dim, vocab_size, bias: false, device_id: device_id))
  end

  # @param input_ids [Tensor] token indices [seq] (int32)
  # @param mask [Tensor, nil] unused (attention is causal)
  # @return [Tensor] logits [seq, vocab]
  def forward(input_ids, mask: nil)
    # [seq, embed]; no position term is added here — RoPE (in attn) supplies it.
    embedded = @token_embedding.call(input_ids)
    hidden = @blocks.reduce(embedded) { |state, block| block.call(state) }
    @head.call(@norm.call(hidden))
  end

  # @return [String]
  def to_s
    "ModernModel(vocab=#{@vocab_size}, embed=#{@embed_dim}, q_heads=#{@num_heads}, " \
      "kv_heads=#{@num_kv_heads}, layers=#{@num_layers})"
  end
end
|
|
189
|
+
end
|
|
190
|
+
end
|
|
191
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module AI
|
|
5
|
+
module Transformer
|
|
6
|
+
# SwiGLU feed-forward (Llama / Qwen / SmolLM / Mistral):
#   down( silu(gate(x)) ⊙ up(x) )
# `gate` and `up` both project embed → hidden; `down` projects hidden → embed.
# silu(z) = z·sigmoid(z). Llama-style uses no bias. The layer is composed
# only of Linear, silu and elementwise multiply, so autograd derives the
# backward pass without a hand-written gradient.
class SwiGLU < NN::Module
  # @param embed_dim [Integer]
  # @param ff_dim [Integer] hidden dim (Llama uses ~8/3·embed, rounded)
  # @param bias [Boolean] include projection biases (Llama/Qwen: false)
  # @param device_id [Integer]
  def initialize(embed_dim, ff_dim, bias: false, device_id: 0)
    super()
    @embed_dim = embed_dim
    @ff_dim = ff_dim

    linear_opts = { bias: bias, device_id: device_id }
    @gate = register_module("gate", NN::Linear.new(embed_dim, ff_dim, **linear_opts))
    @up = register_module("up", NN::Linear.new(embed_dim, ff_dim, **linear_opts))
    @down = register_module("down", NN::Linear.new(ff_dim, embed_dim, **linear_opts))
  end

  # @param x [Tensor] [*, embed_dim]
  # @return [Tensor] [*, embed_dim]
  def forward(x)
    activated_gate = @gate.call(x).silu
    lifted = @up.call(x)
    @down.call(activated_gate * lifted)
  end

  # @return [String]
  def to_s
    "SwiGLU(embed=#{@embed_dim}, ff=#{@ff_dim})"
  end
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ignis
|
|
4
|
+
module AI
|
|
5
|
+
# Weight mapping from HuggingFace model naming conventions to Ignis parameter names.
# Used by Safetensors.load_model to map HF checkpoint keys to Ignis module keys.
module WeightMap
  # Layer counts used when the caller does not pass `num_layers:`. These are
  # the values that used to be hard-coded in each map builder.
  DEFAULT_NUM_LAYERS = { gpt2: 12, llama: 32, bert: 12 }.freeze

  class << self
    # Get the weight map for a given model architecture.
    # @param architecture [Symbol] :gpt2, :llama, :bert
    # @param num_layers [Integer, nil] number of transformer layers to emit
    #   per-layer entries for; nil uses DEFAULT_NUM_LAYERS for the architecture.
    #   Pass the checkpoint's real depth for models deeper than the default,
    #   otherwise the upper layers get no mapping.
    # @return [Hash{String => String}] HF key → Ignis key
    # @raise [ArgumentError] for an unknown architecture, or a num_layers that
    #   is not a non-negative Integer
    def for(architecture, num_layers: nil)
      default_layers = DEFAULT_NUM_LAYERS[architecture]
      unless default_layers
        raise ArgumentError, "Unknown architecture: #{architecture}. Supported: :gpt2, :llama, :bert"
      end

      layers = num_layers.nil? ? default_layers : num_layers
      unless layers.is_a?(Integer) && !layers.negative?
        raise ArgumentError, "num_layers must be a non-negative Integer (got #{num_layers.inspect})"
      end

      case architecture
      when :gpt2 then gpt2_map(layers)
      when :llama then llama_map(layers)
      else bert_map(layers)
      end
    end

    private

    # GPT-2 weight mapping: HuggingFace → Ignis.
    # HF keys are emitted without a `transformer.` prefix (e.g.
    # h.{i}.attn.c_attn.weight). The fused QKV tensor c_attn is mapped to the
    # placeholder keys `_c_attn_weight` / `_c_attn_bias`.
    # NOTE(review): presumably the loader splits those into q/k/v projections —
    # confirm against the GPT-2 loader.
    # @param num_layers [Integer] number of `h.{i}` layers to map
    # @return [Hash{String => String}]
    def gpt2_map(num_layers = DEFAULT_NUM_LAYERS[:gpt2])
      map = {}

      # Embeddings
      map["wte.weight"] = "token_embedding.weight"
      map["wpe.weight"] = "position_embedding.weight"

      # Final layer norm
      map["ln_f.weight"] = "norm.weight"
      map["ln_f.bias"] = "norm.bias"

      # LM head (GPT-2 ties wte and lm_head)
      # map["lm_head.weight"] = "head.weight"  # Usually tied to wte

      # Per-layer mappings
      num_layers.times do |i|
        hf_prefix = "h.#{i}"
        nnw_prefix = "blocks.#{i}"

        # Attention
        map["#{hf_prefix}.attn.c_attn.weight"] = "#{nnw_prefix}.attention._c_attn_weight"
        map["#{hf_prefix}.attn.c_attn.bias"] = "#{nnw_prefix}.attention._c_attn_bias"
        map["#{hf_prefix}.attn.c_proj.weight"] = "#{nnw_prefix}.attention.out_proj.weight"
        map["#{hf_prefix}.attn.c_proj.bias"] = "#{nnw_prefix}.attention.out_proj.bias"

        # Layer norms
        map["#{hf_prefix}.ln_1.weight"] = "#{nnw_prefix}.norm1.weight"
        map["#{hf_prefix}.ln_1.bias"] = "#{nnw_prefix}.norm1.bias"
        map["#{hf_prefix}.ln_2.weight"] = "#{nnw_prefix}.norm2.weight"
        map["#{hf_prefix}.ln_2.bias"] = "#{nnw_prefix}.norm2.bias"

        # MLP / Feed-forward
        map["#{hf_prefix}.mlp.c_fc.weight"] = "#{nnw_prefix}.feed_forward.fc1.weight"
        map["#{hf_prefix}.mlp.c_fc.bias"] = "#{nnw_prefix}.feed_forward.fc1.bias"
        map["#{hf_prefix}.mlp.c_proj.weight"] = "#{nnw_prefix}.feed_forward.fc2.weight"
        map["#{hf_prefix}.mlp.c_proj.bias"] = "#{nnw_prefix}.feed_forward.fc2.bias"
      end

      map
    end

    # LLaMA weight mapping: HuggingFace → Ignis.
    # NOTE(review): the targets here use `attention.*` / `feed_forward.*` /
    # `out_proj`, while Transformer::ModernBlock registers its children as
    # `attn`, `mlp` (gate/up/down) and `o_proj` — confirm which model class
    # this map is meant to feed.
    # @param num_layers [Integer] number of `model.layers.{i}` layers to map
    # @return [Hash{String => String}]
    def llama_map(num_layers = DEFAULT_NUM_LAYERS[:llama])
      map = {}

      map["model.embed_tokens.weight"] = "token_embedding.weight"
      map["model.norm.weight"] = "norm.weight"
      map["lm_head.weight"] = "head.weight"

      num_layers.times do |i|
        hf_prefix = "model.layers.#{i}"
        nnw_prefix = "blocks.#{i}"

        # Self-attention
        map["#{hf_prefix}.self_attn.q_proj.weight"] = "#{nnw_prefix}.attention.q_proj.weight"
        map["#{hf_prefix}.self_attn.k_proj.weight"] = "#{nnw_prefix}.attention.k_proj.weight"
        map["#{hf_prefix}.self_attn.v_proj.weight"] = "#{nnw_prefix}.attention.v_proj.weight"
        map["#{hf_prefix}.self_attn.o_proj.weight"] = "#{nnw_prefix}.attention.out_proj.weight"

        # RMS norms
        map["#{hf_prefix}.input_layernorm.weight"] = "#{nnw_prefix}.norm1.weight"
        map["#{hf_prefix}.post_attention_layernorm.weight"] = "#{nnw_prefix}.norm2.weight"

        # MLP (SwiGLU)
        map["#{hf_prefix}.mlp.gate_proj.weight"] = "#{nnw_prefix}.feed_forward.fc1.weight"
        map["#{hf_prefix}.mlp.up_proj.weight"] = "#{nnw_prefix}.feed_forward._up_proj_weight"
        map["#{hf_prefix}.mlp.down_proj.weight"] = "#{nnw_prefix}.feed_forward.fc2.weight"
      end

      map
    end

    # BERT weight mapping: HuggingFace → Ignis
    # @param num_layers [Integer] number of `encoder.layer.{i}` layers to map
    # @return [Hash{String => String}]
    def bert_map(num_layers = DEFAULT_NUM_LAYERS[:bert])
      map = {}

      map["embeddings.word_embeddings.weight"] = "token_embedding.weight"
      map["embeddings.position_embeddings.weight"] = "position_embedding.weight"
      map["embeddings.LayerNorm.weight"] = "embedding_norm.weight"
      map["embeddings.LayerNorm.bias"] = "embedding_norm.bias"

      num_layers.times do |i|
        hf_prefix = "encoder.layer.#{i}"
        nnw_prefix = "blocks.#{i}"

        # Self-attention projections and the post-attention LayerNorm
        map["#{hf_prefix}.attention.self.query.weight"] = "#{nnw_prefix}.attention.q_proj.weight"
        map["#{hf_prefix}.attention.self.query.bias"] = "#{nnw_prefix}.attention.q_proj.bias"
        map["#{hf_prefix}.attention.self.key.weight"] = "#{nnw_prefix}.attention.k_proj.weight"
        map["#{hf_prefix}.attention.self.key.bias"] = "#{nnw_prefix}.attention.k_proj.bias"
        map["#{hf_prefix}.attention.self.value.weight"] = "#{nnw_prefix}.attention.v_proj.weight"
        map["#{hf_prefix}.attention.self.value.bias"] = "#{nnw_prefix}.attention.v_proj.bias"
        map["#{hf_prefix}.attention.output.dense.weight"] = "#{nnw_prefix}.attention.out_proj.weight"
        map["#{hf_prefix}.attention.output.dense.bias"] = "#{nnw_prefix}.attention.out_proj.bias"
        map["#{hf_prefix}.attention.output.LayerNorm.weight"] = "#{nnw_prefix}.norm1.weight"
        map["#{hf_prefix}.attention.output.LayerNorm.bias"] = "#{nnw_prefix}.norm1.bias"

        # Feed-forward and the output LayerNorm
        map["#{hf_prefix}.intermediate.dense.weight"] = "#{nnw_prefix}.feed_forward.fc1.weight"
        map["#{hf_prefix}.intermediate.dense.bias"] = "#{nnw_prefix}.feed_forward.fc1.bias"
        map["#{hf_prefix}.output.dense.weight"] = "#{nnw_prefix}.feed_forward.fc2.weight"
        map["#{hf_prefix}.output.dense.bias"] = "#{nnw_prefix}.feed_forward.fc2.bias"
        map["#{hf_prefix}.output.LayerNorm.weight"] = "#{nnw_prefix}.norm2.weight"
        map["#{hf_prefix}.output.LayerNorm.bias"] = "#{nnw_prefix}.norm2.bias"
      end

      map
    end
  end
end
|
|
138
|
+
end
|
|
139
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: ignis-dl
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- NNW / Ignis contributors
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: ignis-autograd
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - '='
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: 0.0.1
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - '='
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: 0.0.1
|
|
26
|
+
description: |
|
|
27
|
+
ignis-dl is the deep-learning layer of the Ignis ecosystem: NN modules
|
|
28
|
+
(Linear, Embedding, LayerNorm, RMSNorm, Dropout), optimizers (SGD/Adam/AdamW),
|
|
29
|
+
losses, and a transformer stack (multi-head + grouped-query attention, RoPE,
|
|
30
|
+
SwiGLU, KV cache) with HuggingFace weight loaders (GPT-2, Llama). Loads real
|
|
31
|
+
GPT-2 and Llama-3.2 checkpoints and matches HuggingFace logits, and trains
|
|
32
|
+
transformers from scratch — in Ruby, on native Windows. Installing this pulls
|
|
33
|
+
the whole stack (ignis + ignis-autograd), so it also serves as the meta-gem.
|
|
34
|
+
executables: []
|
|
35
|
+
extensions: []
|
|
36
|
+
extra_rdoc_files: []
|
|
37
|
+
files:
|
|
38
|
+
- README.md
|
|
39
|
+
- lib/ignis-dl.rb
|
|
40
|
+
- lib/nnw/ai/gpt2_loader.rb
|
|
41
|
+
- lib/nnw/ai/inference.rb
|
|
42
|
+
- lib/nnw/ai/kv_cache.rb
|
|
43
|
+
- lib/nnw/ai/llama_loader.rb
|
|
44
|
+
- lib/nnw/ai/loss.rb
|
|
45
|
+
- lib/nnw/ai/nn/dropout.rb
|
|
46
|
+
- lib/nnw/ai/nn/embedding.rb
|
|
47
|
+
- lib/nnw/ai/nn/layer_norm.rb
|
|
48
|
+
- lib/nnw/ai/nn/linear.rb
|
|
49
|
+
- lib/nnw/ai/nn/module.rb
|
|
50
|
+
- lib/nnw/ai/nn/rms_norm.rb
|
|
51
|
+
- lib/nnw/ai/nn/sequential.rb
|
|
52
|
+
- lib/nnw/ai/optim/adam.rb
|
|
53
|
+
- lib/nnw/ai/optim/adamw.rb
|
|
54
|
+
- lib/nnw/ai/optim/base.rb
|
|
55
|
+
- lib/nnw/ai/optim/lr_scheduler.rb
|
|
56
|
+
- lib/nnw/ai/optim/sgd.rb
|
|
57
|
+
- lib/nnw/ai/safetensors.rb
|
|
58
|
+
- lib/nnw/ai/server.rb
|
|
59
|
+
- lib/nnw/ai/tokenizer.rb
|
|
60
|
+
- lib/nnw/ai/trainer.rb
|
|
61
|
+
- lib/nnw/ai/transformer/attention.rb
|
|
62
|
+
- lib/nnw/ai/transformer/block.rb
|
|
63
|
+
- lib/nnw/ai/transformer/feed_forward.rb
|
|
64
|
+
- lib/nnw/ai/transformer/model.rb
|
|
65
|
+
- lib/nnw/ai/transformer/modern.rb
|
|
66
|
+
- lib/nnw/ai/transformer/swiglu.rb
|
|
67
|
+
- lib/nnw/ai/weight_map.rb
|
|
68
|
+
homepage: https://github.com/tigel-agm/Ignis
|
|
69
|
+
licenses:
|
|
70
|
+
- MIT
|
|
71
|
+
metadata:
|
|
72
|
+
source_code_uri: https://github.com/tigel-agm/Ignis
|
|
73
|
+
rubygems_mfa_required: 'true'
|
|
74
|
+
rdoc_options: []
|
|
75
|
+
require_paths:
|
|
76
|
+
- lib
|
|
77
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - ">="
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '3.1'
|
|
82
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
83
|
+
requirements:
|
|
84
|
+
- - ">="
|
|
85
|
+
- !ruby/object:Gem::Version
|
|
86
|
+
version: '0'
|
|
87
|
+
requirements: []
|
|
88
|
+
rubygems_version: 3.6.9
|
|
89
|
+
specification_version: 4
|
|
90
|
+
summary: Transformers + neural-net layers for Ruby, on the Ignis GPU/autograd stack
|
|
91
|
+
test_files: []
|