ignis-autograd 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,931 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "tape"
4
+
5
+ module Ignis
6
+ module AI
7
+ # Tensor — the user-facing GPU tensor type for AI operations.
8
+ #
9
+ # Wraps Ignis::Shared::NvArray and adds gradient tracking for autograd.
10
+ # All compute ops record backward functions on the tape when requires_grad is true.
11
+ #
12
+ # @example Forward + backward
13
+ # a = Ignis::AI::Tensor.from_host([1.0, 2.0, 3.0], shape: [3], requires_grad: true)
14
+ # b = a * a # b = a^2
15
+ # b.sum.backward!
16
+ # a.grad # => [2.0, 4.0, 6.0]
17
+ class Tensor
18
+ # @return [Ignis::Shared::NvArray] underlying GPU data
19
+ attr_reader :data
20
+
21
+ # @return [Ignis::Shared::NvArray, nil] gradient (same shape as data)
22
+ attr_accessor :grad
23
+
24
+ # @return [Boolean] whether this tensor participates in autograd
25
+ attr_reader :requires_grad
26
+
27
+ # @return [Proc, nil] backward function recorded by the tape
28
+ attr_accessor :grad_fn
29
+
30
+ # @return [Boolean] true if created by user (not computed)
31
+ attr_reader :is_leaf
32
+
33
+ # @return [Integer, nil] position in current tape
34
+ attr_accessor :_tape_id
35
+
36
# Build a tensor around an existing device buffer.
#
# The gradient slot and the tape position both start out as nil.
#
# @param data [Ignis::Shared::NvArray] buffer to wrap (stored as-is, not copied)
# @param requires_grad [Boolean] whether this tensor participates in autograd
# @param grad_fn [Proc, nil] backward function, if one is already known
# @param is_leaf [Boolean] true for user-created tensors, false for op results
def initialize(data:, requires_grad: false, grad_fn: nil, is_leaf: true)
  @data, @requires_grad = data, requires_grad
  @grad_fn, @is_leaf = grad_fn, is_leaf
  @grad = nil
  @_tape_id = nil
end
48
+
49
+ # -------------------------------------------------------------------
50
+ # Constructors
51
+ # -------------------------------------------------------------------
52
+
53
# Adopt an already-built NvArray as a tensor (the array is passed straight
# through to the constructor, not copied here).
# @param nv_array [Ignis::Shared::NvArray] buffer to wrap
# @param requires_grad [Boolean] enable gradient tracking on the new tensor
# @return [Tensor]
def self.from_nv_array(nv_array, requires_grad: false)
  options = { data: nv_array, requires_grad: requires_grad }
  new(**options)
end
60
+
61
# Build a tensor whose every element is 0.0.
#
# The values are generated on the host and uploaded with NvArray#from_host.
#
# @param shape [Array<Integer>]
# @param dtype [Symbol]
# @param device_id [Integer]
# @param requires_grad [Boolean]
# @return [Tensor]
def self.zeros(shape, dtype: :float32, device_id: 0, requires_grad: false)
  buffer = Ignis::Shared::NvArray.new(shape: shape, dtype: dtype, device_id: device_id)
  host_values = Array.new(buffer.numel) { 0.0 }
  buffer.from_host(host_values)
  new(data: buffer, requires_grad: requires_grad)
end
72
+
73
# Build a tensor whose every element is 1.0.
#
# The values are generated on the host and uploaded with NvArray#from_host.
#
# @param shape [Array<Integer>]
# @param dtype [Symbol]
# @param device_id [Integer]
# @param requires_grad [Boolean]
# @return [Tensor]
def self.ones(shape, dtype: :float32, device_id: 0, requires_grad: false)
  buffer = Ignis::Shared::NvArray.new(shape: shape, dtype: dtype, device_id: device_id)
  host_values = Array.new(buffer.numel) { 1.0 }
  buffer.from_host(host_values)
  new(data: buffer, requires_grad: requires_grad)
end
84
+
85
# Build a tensor of uniform random values in [0, 1).
#
# Samples are drawn on the host, one Kernel.rand call per element, then
# uploaded. Kernel.rand is named explicitly because a bare +rand+ inside this
# method would resolve to this class method itself.
#
# @param shape [Array<Integer>]
# @param dtype [Symbol]
# @param device_id [Integer]
# @param requires_grad [Boolean]
# @return [Tensor]
def self.rand(shape, dtype: :float32, device_id: 0, requires_grad: false)
  buffer = Ignis::Shared::NvArray.new(shape: shape, dtype: dtype, device_id: device_id)
  count = buffer.numel
  samples = Array.new(count) { Kernel.rand }
  buffer.from_host(samples)
  new(data: buffer, requires_grad: requires_grad)
end
96
+
97
# Upload a Ruby array into a freshly created NvArray and wrap it as a tensor.
#
# +ruby_array+ is handed to NvArray#from_host unchanged; this method does not
# itself compare its length against +shape+.
#
# @param ruby_array [Array<Numeric>]
# @param shape [Array<Integer>]
# @param dtype [Symbol]
# @param device_id [Integer]
# @param requires_grad [Boolean]
# @return [Tensor]
def self.from_host(ruby_array, shape:, dtype: :float32, device_id: 0, requires_grad: false)
  buffer = Ignis::Shared::NvArray
           .new(shape: shape, dtype: dtype, device_id: device_id)
           .tap { |nv| nv.from_host(ruby_array) }
  new(data: buffer, requires_grad: requires_grad)
end
109
+
110
+ # -------------------------------------------------------------------
111
+ # Shape / dtype delegation
112
+ # -------------------------------------------------------------------
113
+
114
# Shape as reported by the wrapped NvArray.
# @return [Array<Integer>]
def shape
  data.shape
end
118
+
119
# Element type as reported by the wrapped NvArray.
# @return [Symbol]
def dtype
  data.dtype
end
123
+
124
# Total element count as reported by the wrapped NvArray.
# @return [Integer]
def numel
  data.numel
end
128
+
129
# Device index as reported by the wrapped NvArray.
# @return [Integer]
def device_id
  data.device_id
end
133
+
134
+ # -------------------------------------------------------------------
135
+ # Compute ops — each records grad_fn when requires_grad is true
136
+ # -------------------------------------------------------------------
137
+
138
# Matrix multiplication: self @ other (or self @ other^T).
#
# With +transpose_b+ the flag is forwarded to Ignis::LinAlg::Matmul, so no
# transposed copy of +other+ is built here (original note: for the LM head,
# materializing other^T was a 765ms/forward transpose of a 38M-element
# weight). Used by Linear.
#
# @param other [Tensor] right-hand operand
# @param transpose_b [Boolean] compute self @ other^T instead of self @ other
# @return [Tensor] non-leaf result; tracked when should_track?(other) is true
def matmul(other, transpose_b: false)
  product = Ignis::LinAlg::Matmul.call(@data, other.data, transpose_b: transpose_b)
  out = Tensor.new(data: product, requires_grad: should_track?(other), is_leaf: false)
  return out unless out.requires_grad

  # Capture the raw buffers for the backward closure.
  lhs = @data
  rhs = other.data
  Tape.record(out, inputs: [self, other]) do |grad|
    if transpose_b
      # y = A @ Bᵀ  ⇒  dA = grad @ B,  dB = gradᵀ @ A
      d_lhs = Ignis::LinAlg::Matmul.call(grad, rhs)
      d_rhs = Ignis::LinAlg::Matmul.call(grad, lhs, transpose_a: true)
    else
      # y = A @ B   ⇒  dA = grad @ Bᵀ, dB = Aᵀ @ grad
      d_lhs = Ignis::LinAlg::Matmul.call(grad, rhs, transpose_b: true)
      d_rhs = Ignis::LinAlg::Matmul.call(lhs, grad, transpose_a: true)
    end
    [d_lhs, d_rhs]
  end

  out
end
168
+
169
# Row-broadcast bias add: self [rows, cols] + bias [cols] -> [rows, cols].
# (Linear layer bias; plain Tensor#+ requires equal element counts.)
#
# The bias length is checked up front: the kernels below are handed +rows+
# and +cols+ derived from this tensor only, so a bias of a different length
# would otherwise go unnoticed by this method.
#
# @param bias [Tensor] bias vector of length shape[-1]
# @return [Tensor] non-leaf result allocated via alloc_like(@data)
# @raise [ArgumentError] if bias does not hold exactly shape[-1] elements
def add_bias(bias)
  cols = shape[-1]
  unless bias.numel == cols
    raise ArgumentError,
          "add_bias: bias has #{bias.numel} elements, expected #{cols} (last dim of #{shape})"
  end

  rows = numel / cols
  result_nv = alloc_like(@data)

  forward = Ignis::JIT::Kernels::Elementwise.add_bias_rows
  forward.launch(grid: [(numel + 255) / 256], block: [256],
                 args: [@data, bias.data, result_nv, rows, cols])

  result = Tensor.new(data: result_nv, requires_grad: should_track?(bias), is_leaf: false)
  return result unless result.requires_grad

  Tape.record(result, inputs: [self, bias]) do |grad|
    # d/d(input) = grad (passthrough); d/d(bias) = sum over rows
    grad_bias = zeros_nv([cols])
    reduce = Ignis::JIT::Kernels::Elementwise.add_backward_broadcast
    reduce.launch(grid: [(cols + 255) / 256], block: [256], args: [grad, grad_bias, rows, cols])
    [grad, grad_bias]
  end

  result
end
196
+
197
# Multi-head / grouped-query scaled dot-product attention (causal optional),
# batch = 1. self = Q [seq, num_heads*head_dim]; +k+, +v+ = [seq,
# num_kv_heads*head_dim]. Returns context [seq, num_heads*head_dim].
#
# With num_kv_heads == num_heads this is standard multi-head attention. With
# num_kv_heads < num_heads it is Grouped-Query Attention: each KV head is
# shared by group_size = num_heads/num_kv_heads query heads. Each query head
# launches the flash_attention_forward kernel against its group's KV head. In
# the backward, the query heads that share a KV head ACCUMULATE into that
# head's dK/dV (scatter-add); dQ heads are disjoint.
#
# Tracking is decided solely through should_track?, so an active no_grad
# scope disables recording even when Q itself requires grad.
#
# @param k [Tensor]
# @param v [Tensor]
# @param num_heads [Integer] number of query heads
# @param num_kv_heads [Integer, nil] number of K/V heads (nil ⇒ num_heads = MHA)
# @param causal [Boolean]
# @return [Tensor] context [seq, num_heads*head_dim]
# @raise [ArgumentError] if num_heads is not a multiple of num_kv_heads, if
#   the embed width is not divisible by num_heads, or if head_dim > 128
def sdpa(k, v, num_heads:, num_kv_heads: nil, causal: true)
  num_kv_heads ||= num_heads
  unless (num_heads % num_kv_heads).zero?
    raise ArgumentError, "num_heads (#{num_heads}) must be a multiple of num_kv_heads (#{num_kv_heads})"
  end

  seq, embed = shape # embed = num_heads * head_dim
  # Integer division below would silently truncate head_dim and mis-slice
  # every head if embed were not an exact multiple of num_heads.
  unless (embed % num_heads).zero?
    raise ArgumentError, "embed (#{embed}) must be divisible by num_heads (#{num_heads})"
  end

  head_dim = embed / num_heads
  # The flash-attention kernels store per-head rows in fixed [HEAD_DIM_MAX=128]
  # register arrays and clamp every dim loop to d < 128. For head_dim > 128
  # they would silently drop dims 128.. from scores/output/gradients with no
  # error; fail loud above that rather than miscompute. (decode_sdpa uses
  # cuBLAS+softmax and has no cap.)
  if head_dim > 128
    raise ArgumentError,
          "head_dim #{head_dim} exceeds flash-attention HEAD_DIM_MAX (128); " \
          "larger heads are not yet supported by the flash kernels"
  end

  embed_kv = num_kv_heads * head_dim
  group_size = num_heads / num_kv_heads
  scale = (1.0 / Math.sqrt(head_dim)).to_f
  cmask = causal ? 1 : 0
  context_nv = zeros_nv([seq, embed])

  fwd = Ignis::JIT::Kernels::Attention.flash_attention_forward
  q_tiles = (seq + 63) / 64
  num_heads.times do |h|
    qoff = h * head_dim
    koff = (h / group_size) * head_dim # the KV head this query head attends to
    qh = slice_cols_nv(@data, qoff, head_dim, seq, embed)
    kh = slice_cols_nv(k.data, koff, head_dim, seq, embed_kv)
    vh = slice_cols_nv(v.data, koff, head_dim, seq, embed_kv)
    oh = zeros_nv([seq, head_dim])
    fwd.launch(grid: [q_tiles], block: [64],
               args: [qh, kh, vh, oh, seq, head_dim, scale, cmask])
    scatter_cols_nv!(oh, context_nv, qoff, head_dim, seq, embed)
  end

  # should_track? already covers @requires_grad and honours no_grad; do not
  # OR in the raw @requires_grad flag, which would bypass that guard.
  track = should_track?(k) || should_track?(v)
  result = Tensor.new(data: context_nv, requires_grad: track, is_leaf: false)
  return result unless result.requires_grad

  sq = @data
  sk = k.data
  sv = v.data
  so = context_nv
  Tape.record(result, inputs: [self, k, v]) do |grad|
    d_q = zeros_nv([seq, embed])
    d_k = zeros_nv([seq, embed_kv])
    d_v = zeros_nv([seq, embed_kv])
    bwd = Ignis::JIT::Kernels::Attention.flash_attention_backward
    blk = (seq + 255) / 256
    num_heads.times do |h|
      qoff = h * head_dim
      koff = (h / group_size) * head_dim
      qh = slice_cols_nv(sq, qoff, head_dim, seq, embed)
      kh = slice_cols_nv(sk, koff, head_dim, seq, embed_kv)
      vh = slice_cols_nv(sv, koff, head_dim, seq, embed_kv)
      oh = slice_cols_nv(so, qoff, head_dim, seq, embed)
      doh = slice_cols_nv(grad, qoff, head_dim, seq, embed)
      dqh = zeros_nv([seq, head_dim])
      dkh = zeros_nv([seq, head_dim])
      dvh = zeros_nv([seq, head_dim])
      bwd.launch(grid: [blk], block: [256],
                 args: [qh, kh, vh, oh, doh, dqh, dkh, dvh, seq, head_dim, scale, cmask])
      # dQ heads are disjoint → overwrite. dK/dV heads are SHARED across the
      # group → accumulate (add into a zero-initialized buffer). For MHA
      # (group_size==1) the KV columns are disjoint too, so add-into-zero is
      # numerically identical to an overwrite.
      scatter_cols_nv!(dqh, d_q, qoff, head_dim, seq, embed)
      scatter_cols_add_nv!(dkh, d_k, koff, head_dim, seq, embed_kv)
      scatter_cols_add_nv!(dvh, d_v, koff, head_dim, seq, embed_kv)
    end
    [d_q, d_k, d_v]
  end

  result
end
290
+
291
# Single-query attention for autoregressive decode with a KV cache.
#
# self = q [1, embed] (the new token's query); +k+, +v+ = cached keys/values
# [past+1, embed] (every position up to and including the current one). The
# new token is the LAST position, so it attends to ALL cached positions and
# no causal mask is applied. The result is always created with
# requires_grad: false and nothing is recorded on the tape.
#
# Per head: scores = scale * (q_h @ k_hᵀ) via Matmul (the 1/sqrt(head_dim)
# scale is passed as +alpha+), probs = softmax_forward(scores), and
# ctx_h = probs @ v_h, scattered back into the head's column range. Heads
# use the same column offsets (h * head_dim) for q, k and v.
#
# @param k [Tensor] cached keys [past+1, embed]
# @param v [Tensor] cached values [past+1, embed]
# @param num_heads [Integer]
# @return [Tensor] context [1, embed]
def decode_sdpa(k, v, num_heads:)
  _rows, embed = shape
  cached_len = k.shape[0]
  head_dim = embed / num_heads
  scale = (1.0 / Math.sqrt(head_dim)).to_f
  context_nv = zeros_nv([1, embed])
  softmax_kernel = Ignis::JIT::Kernels::Attention.softmax_forward

  num_heads.times do |head|
    col = head * head_dim
    q_head = slice_cols_nv(@data, col, head_dim, 1, embed)            # [1, hd]
    k_head = slice_cols_nv(k.data, col, head_dim, cached_len, embed)  # [tk, hd]
    v_head = slice_cols_nv(v.data, col, head_dim, cached_len, embed)  # [tk, hd]

    # [1, tk] attention logits, already scaled through alpha
    scores = Ignis::LinAlg::Matmul.call(q_head, k_head, transpose_b: true, alpha: scale)

    # [1, tk] probabilities along the cached-position axis
    probs = Ignis::Shared::NvArray.new(shape: [1, cached_len], dtype: :float32, device_id: device_id).to_device
    softmax_kernel.launch(grid: [1], block: [1], args: [scores, probs, 1, cached_len])

    # [1, hd] context for this head, written into its column slot
    head_ctx = Ignis::LinAlg::Matmul.call(probs, v_head)
    scatter_cols_nv!(head_ctx, context_nv, col, head_dim, 1, embed)
  end

  Tensor.new(data: context_nv, requires_grad: false, is_leaf: false)
end
331
+
332
# Elementwise addition: self + other.
#
# A Numeric is first expanded by ensure_tensor. The kernel is launched over
# this tensor's element count only, so the operand's element count is checked
# first; a mismatch raises instead of launching the kernel.
#
# @param other [Tensor, Numeric]
# @return [Tensor] non-leaf result allocated via alloc_like(@data)
# @raise [ArgumentError] if other holds a different number of elements
def +(other)
  other = ensure_tensor(other)
  n = numel
  unless other.numel == n
    raise ArgumentError, "Tensor#+ requires equal element counts (#{n} vs #{other.numel})"
  end

  result_nv = alloc_like(@data)
  kernel = Ignis::JIT::Kernels::Elementwise.add_forward
  kernel.launch(grid: [(n + 255) / 256], block: [256], args: [@data, other.data, result_nv, n])

  result = Tensor.new(data: result_nv, requires_grad: should_track?(other), is_leaf: false)
  return result unless result.requires_grad

  Tape.record(result, inputs: [self, other]) do |grad|
    [grad, grad] # d(a+b)/da = 1, d(a+b)/db = 1
  end

  result
end
354
+
355
# Elementwise subtraction: self - other.
#
# A Numeric is first expanded by ensure_tensor. The kernel is launched over
# this tensor's element count only, so the operand's element count is checked
# first; a mismatch raises instead of launching the kernel.
#
# @param other [Tensor, Numeric]
# @return [Tensor] non-leaf result allocated via alloc_like(@data)
# @raise [ArgumentError] if other holds a different number of elements
def -(other)
  other = ensure_tensor(other)
  n = numel
  unless other.numel == n
    raise ArgumentError, "Tensor#- requires equal element counts (#{n} vs #{other.numel})"
  end

  result_nv = alloc_like(@data)
  kernel = Ignis::JIT::Kernels::Elementwise.sub_forward
  kernel.launch(grid: [(n + 255) / 256], block: [256], args: [@data, other.data, result_nv, n])

  result = Tensor.new(data: result_nv, requires_grad: should_track?(other), is_leaf: false)
  return result unless result.requires_grad

  Tape.record(result, inputs: [self, other]) do |grad|
    # d(a-b)/da = 1 (grad passes through), d(a-b)/db = -1 (grad scaled by -1.0)
    neg_grad = alloc_like(grad)
    scale_k = Ignis::JIT::Kernels::Elementwise.scale_forward
    gn = grad.numel
    scale_k.launch(grid: [(gn + 255) / 256], block: [256], args: [grad, neg_grad, -1.0, gn])
    [grad, neg_grad]
  end

  result
end
381
+
382
# Elementwise multiplication (Hadamard): self * other.
#
# A Numeric operand is routed to scalar_mul. For tensor operands the kernel
# is launched over this tensor's element count only, so the operand's element
# count is checked first; a mismatch raises instead of launching the kernel.
#
# @param other [Tensor, Numeric]
# @return [Tensor] non-leaf result
# @raise [ArgumentError] if a tensor operand holds a different number of elements
def *(other)
  return scalar_mul(other) if other.is_a?(Numeric)

  other = ensure_tensor(other)
  n = numel
  unless other.numel == n
    raise ArgumentError, "Tensor#* requires equal element counts (#{n} vs #{other.numel})"
  end

  result_nv = alloc_like(@data)
  kernel = Ignis::JIT::Kernels::Elementwise.mul_forward
  kernel.launch(grid: [(n + 255) / 256], block: [256], args: [@data, other.data, result_nv, n])

  result = Tensor.new(data: result_nv, requires_grad: should_track?(other), is_leaf: false)
  return result unless result.requires_grad

  saved_self = @data
  saved_other = other.data
  Tape.record(result, inputs: [self, other]) do |grad|
    # Each side's gradient is grad combined with the OTHER operand's values.
    grad_a = alloc_like(grad)
    grad_b = alloc_like(grad)
    mk = Ignis::JIT::Kernels::Elementwise.mul_backward
    gn = grad.numel
    g = [(gn + 255) / 256]
    mk.launch(grid: g, block: [256], args: [grad, saved_other, grad_a, gn])
    mk.launch(grid: g, block: [256], args: [grad, saved_self, grad_b, gn])
    [grad_a, grad_b]
  end

  result
end
417
+
418
# ReLU activation, applied elementwise.
#
# The op is recorded on the tape only when this tensor requires grad AND no
# no_grad scope is active (Tape.no_grad_active?) — the same rule
# should_track? applies to the binary ops. Otherwise the result is untracked.
#
# @return [Tensor] non-leaf result allocated via alloc_like(@data)
def relu
  n = numel
  result_nv = alloc_like(@data)
  kernel = Ignis::JIT::Kernels::Activations.relu_forward(n)
  kernel.launch(grid: [(n + 255) / 256], block: [256], args: [@data, result_nv, n])

  track = @requires_grad && !Tape.no_grad_active?
  result = Tensor.new(data: result_nv, requires_grad: track, is_leaf: false)
  return result unless track

  saved_input = @data
  Tape.record(result, inputs: [self]) do |grad|
    grad_in = alloc_like(grad)
    bk = Ignis::JIT::Kernels::Activations.relu_backward
    gn = grad.numel
    bk.launch(grid: [(gn + 255) / 256], block: [256], args: [grad, saved_input, grad_in, gn])
    [grad_in]
  end

  result
end
441
+
442
# GELU activation (tanh approximation), applied elementwise.
#
# The op is recorded on the tape only when this tensor requires grad AND no
# no_grad scope is active (Tape.no_grad_active?) — the same rule
# should_track? applies to the binary ops. Otherwise the result is untracked.
#
# @return [Tensor] non-leaf result allocated via alloc_like(@data)
def gelu
  n = numel
  result_nv = alloc_like(@data)
  kernel = Ignis::JIT::Kernels::Activations.gelu_forward
  kernel.launch(grid: [(n + 255) / 256], block: [256], args: [@data, result_nv, n])

  track = @requires_grad && !Tape.no_grad_active?
  result = Tensor.new(data: result_nv, requires_grad: track, is_leaf: false)
  return result unless track

  saved_input = @data
  Tape.record(result, inputs: [self]) do |grad|
    grad_in = alloc_like(grad)
    bk = Ignis::JIT::Kernels::Activations.gelu_backward
    gn = grad.numel
    bk.launch(grid: [(gn + 255) / 256], block: [256], args: [grad, saved_input, grad_in, gn])
    [grad_in]
  end

  result
end
465
+
466
# SiLU activation: x * sigmoid(x), applied elementwise.
#
# The op is recorded on the tape only when this tensor requires grad AND no
# no_grad scope is active (Tape.no_grad_active?) — the same rule
# should_track? applies to the binary ops. Otherwise the result is untracked.
#
# @return [Tensor] non-leaf result allocated via alloc_like(@data)
def silu
  n = numel
  result_nv = alloc_like(@data)
  kernel = Ignis::JIT::Kernels::Activations.silu_forward
  kernel.launch(grid: [(n + 255) / 256], block: [256], args: [@data, result_nv, n])

  track = @requires_grad && !Tape.no_grad_active?
  result = Tensor.new(data: result_nv, requires_grad: track, is_leaf: false)
  return result unless track

  saved_input = @data
  Tape.record(result, inputs: [self]) do |grad|
    grad_in = alloc_like(grad)
    bk = Ignis::JIT::Kernels::Activations.silu_backward
    gn = grad.numel
    bk.launch(grid: [(gn + 255) / 256], block: [256], args: [grad, saved_input, grad_in, gn])
    [grad_in]
  end

  result
end
489
+
490
# Softmax along the last dimension.
#
# The tensor is treated as outer_size rows of shape[-1] elements. The forward
# OUTPUT (not the input) is what the backward closure keeps.
#
# The op is recorded on the tape only when this tensor requires grad AND no
# no_grad scope is active (Tape.no_grad_active?) — the same rule
# should_track? applies to the binary ops. Otherwise the result is untracked.
#
# @return [Tensor] non-leaf result allocated via alloc_like(@data)
def softmax
  last_dim = shape[-1]
  outer_size = numel / last_dim
  result_nv = alloc_like(@data)

  kernel = Ignis::JIT::Kernels::Attention.softmax_forward
  kernel.launch(grid: [(outer_size + 255) / 256], block: [256],
                args: [@data, result_nv, outer_size, last_dim])

  track = @requires_grad && !Tape.no_grad_active?
  result = Tensor.new(data: result_nv, requires_grad: track, is_leaf: false)
  return result unless track

  saved_output = result_nv
  Tape.record(result, inputs: [self]) do |grad|
    grad_in = alloc_like(grad)
    bk = Ignis::JIT::Kernels::Attention.softmax_backward
    bk.launch(grid: [(outer_size + 255) / 256], block: [256],
              args: [grad, saved_output, grad_in, outer_size, last_dim])
    [grad_in]
  end

  result
end
516
+
517
# Layer normalization over the last dimension.
#
# Per-row mean and rstd buffers are allocated here, passed to the forward
# kernel, and kept for the backward kernel.
#
# The op is recorded only when some input requires grad AND no no_grad scope
# is active (Tape.no_grad_active?) — the same rule should_track? applies to
# the binary ops. +eps+ is coerced with to_f before being handed to the
# kernel, as rms_norm does.
#
# @param weight [Tensor] gamma parameter
# @param bias [Tensor] beta parameter
# @param eps [Float] epsilon for numerical stability
# @return [Tensor] non-leaf result allocated via alloc_like(@data)
def layer_norm(weight, bias, eps: 1e-5)
  norm_size = shape[-1]
  outer_size = numel / norm_size
  result_nv = alloc_like(@data)

  # Allocate mean and rstd storage for backward pass
  mean_nv = Ignis::Shared::NvArray.new(shape: [outer_size], dtype: dtype, device_id: device_id)
  mean_nv.from_host(Array.new(outer_size, 0.0))
  rstd_nv = Ignis::Shared::NvArray.new(shape: [outer_size], dtype: dtype, device_id: device_id)
  rstd_nv.from_host(Array.new(outer_size, 0.0))

  kernel = Ignis::JIT::Kernels::Normalization.layer_norm_forward
  kernel.launch(grid: [(outer_size + 255) / 256], block: [256],
                args: [@data, weight.data, bias.data, result_nv, mean_nv, rstd_nv,
                       outer_size, norm_size, eps.to_f])

  track = !Tape.no_grad_active? &&
          (@requires_grad || weight.requires_grad || bias.requires_grad)
  result = Tensor.new(data: result_nv, requires_grad: track, is_leaf: false)
  return result unless track

  saved_input = @data
  saved_gamma = weight.data
  Tape.record(result, inputs: [self, weight, bias]) do |grad|
    grad_input = alloc_like(grad)
    grad_gamma = Ignis::Shared::NvArray.new(shape: [norm_size], dtype: dtype, device_id: device_id)
    grad_gamma.from_host(Array.new(norm_size, 0.0))
    grad_beta = Ignis::Shared::NvArray.new(shape: [norm_size], dtype: dtype, device_id: device_id)
    grad_beta.from_host(Array.new(norm_size, 0.0))

    bk = Ignis::JIT::Kernels::Normalization.layer_norm_backward
    bk.launch(grid: [(outer_size + 255) / 256], block: [256],
              args: [grad, saved_input, saved_gamma, mean_nv, rstd_nv,
                     grad_input, grad_gamma, grad_beta, outer_size, norm_size])
    [grad_input, grad_gamma, grad_beta]
  end

  result
end
562
+
563
# RMSNorm: y = gamma * x / sqrt(mean(x^2) + eps) (Llama/Qwen/Mistral style).
# No mean-subtraction and no bias (vs LayerNorm). Normalizes the last dim.
#
# A per-row rstd buffer is allocated here, passed to the forward kernel, and
# kept for the backward kernel.
#
# The op is recorded only when some input requires grad AND no no_grad scope
# is active (Tape.no_grad_active?) — the same rule should_track? applies to
# the binary ops.
#
# @param weight [Tensor] gamma scale [norm_size]
# @param eps [Float]
# @return [Tensor] non-leaf result allocated via alloc_like(@data)
def rms_norm(weight, eps: 1e-5)
  norm_size = shape[-1]
  outer_size = numel / norm_size
  result_nv = alloc_like(@data)

  # rstd per row, saved for backward
  rstd_nv = Ignis::Shared::NvArray.new(shape: [outer_size], dtype: dtype, device_id: device_id)
  rstd_nv.zero!

  fwd = Ignis::JIT::Kernels::Normalization.rms_norm_forward
  fwd.launch(grid: [(outer_size + 255) / 256], block: [256],
             args: [@data, weight.data, result_nv, rstd_nv, outer_size, norm_size, eps.to_f])

  track = !Tape.no_grad_active? && (@requires_grad || weight.requires_grad)
  result = Tensor.new(data: result_nv, requires_grad: track, is_leaf: false)
  return result unless track

  saved_input = @data
  saved_gamma = weight.data
  Tape.record(result, inputs: [self, weight]) do |grad|
    grad_input = alloc_like(grad)
    grad_gamma = zeros_nv([norm_size])
    bk = Ignis::JIT::Kernels::Normalization.rms_norm_backward
    bk.launch(grid: [(outer_size + 255) / 256], block: [256],
              args: [grad, saved_input, saved_gamma, rstd_nv,
                     grad_input, grad_gamma, outer_size, norm_size])
    [grad_input, grad_gamma]
  end

  result
end
601
+
602
# Rotary Position Embedding (RoPE), HF/Llama/Qwen "rotate_half" convention.
# self is [seq, num_heads*head_dim]; rotates each head's dims by its absolute
# position. No learned parameters — the backward launches the same kernel
# with the final argument negated (-1.0 instead of 1.0), i.e. the rotation
# with the sin sign flipped (orthogonal rotation ⇒ R^T = R(-θ)).
#
# The op is recorded only when this tensor requires grad AND no no_grad scope
# is active (Tape.no_grad_active?) — the same rule should_track? applies to
# the binary ops.
#
# @param num_heads [Integer]
# @param base [Float] rotary base θ (used only when +inv_freq+ is nil)
# @param pos_offset [Integer] absolute position of row 0 (for KV-cache decode)
# @param inv_freq [Ignis::Shared::NvArray, Array<Float>, nil] precomputed
#   [head_dim/2] inverse frequencies. nil ⇒ standard base^(-2i/head_dim). Pass
#   a remapped table for RoPE scaling (llama3/NTK/YaRN).
# @return [Tensor] non-leaf result allocated via alloc_like(@data)
# @raise [ArgumentError] if head_dim is odd
def rope(num_heads:, base: 10000.0, pos_offset: 0, inv_freq: nil)
  seq, embed = shape
  head_dim = embed / num_heads
  # rotate_half RoPE pairs dim i with i+head_dim/2, so it is only well-defined
  # for EVEN head_dim. With an odd head_dim the pairing collides, giving a
  # non-orthogonal map whose forward AND gradient are silently wrong — fail
  # loud rather than miscompute.
  unless head_dim.even?
    raise ArgumentError,
          "RoPE requires an even head_dim (got #{head_dim} = #{embed}/#{num_heads}); " \
          "rotate_half is only defined for paired dimensions"
  end

  half = head_dim / 2
  invf_nv = case inv_freq
            when Ignis::Shared::NvArray then inv_freq
            when Array then nv_from_floats(inv_freq)
            else nv_from_floats((0...half).map { |i| base.to_f**(-2.0 * i / head_dim) })
            end

  out_nv = alloc_like(@data)
  total = seq * embed
  k = Ignis::JIT::Kernels::Attention.rope_apply
  k.launch(grid: [(total + 255) / 256], block: [256],
           args: [@data, out_nv, seq, num_heads, head_dim, pos_offset, invf_nv, 1.0])

  track = @requires_grad && !Tape.no_grad_active?
  result = Tensor.new(data: out_nv, requires_grad: track, is_leaf: false)
  return result unless track

  Tape.record(result, inputs: [self]) do |grad|
    gin = alloc_like(grad)
    # backward = forward rotation with negated sin (transpose of an orthogonal rotation)
    k.launch(grid: [(total + 255) / 256], block: [256],
             args: [grad, gin, seq, num_heads, head_dim, pos_offset, invf_nv, -1.0])
    [gin]
  end

  result
end
655
+
656
# Transpose a 2D tensor (swap its two axes).
#
# +dim0+/+dim1+ must name the two distinct axes of the 2D tensor: 0 and 1 in
# either order, or their negative equivalents -2 and -1. Anything else raises
# rather than silently performing a 0/1 transpose regardless of the request.
#
# The op is recorded only when this tensor requires grad AND no no_grad scope
# is active (Tape.no_grad_active?) — the same rule should_track? applies to
# the binary ops.
#
# @param dim0 [Integer]
# @param dim1 [Integer]
# @return [Tensor] non-leaf result of shape [cols, rows]
# @raise [ArgumentError] if the tensor is not 2D or the dims are not the two
#   distinct axes
def transpose(dim0 = 0, dim1 = 1)
  raise ArgumentError, "transpose requires 2D tensor" unless shape.length == 2

  axes = [dim0, dim1].map { |d| d < 0 ? d + 2 : d }
  unless axes.sort == [0, 1]
    raise ArgumentError,
          "transpose dims must be the two distinct axes of a 2D tensor, got (#{dim0}, #{dim1})"
  end

  rows = shape[0]
  cols = shape[1]
  result_nv = Ignis::Shared::NvArray.new(shape: [cols, rows], dtype: dtype, device_id: device_id)
  result_nv.to_device # transpose_2d writes every element — alloc only, no host zeroing

  kernel = Ignis::JIT::Kernels::Elementwise.transpose_2d
  grid_x = (cols + 31) / 32
  grid_y = (rows + 31) / 32
  kernel.launch(grid: [grid_x, grid_y], block: [32, 8], args: [@data, result_nv, rows, cols])

  track = @requires_grad && !Tape.no_grad_active?
  result = Tensor.new(data: result_nv, requires_grad: track, is_leaf: false)
  return result unless track

  Tape.record(result, inputs: [self]) do |grad|
    # Backward of transpose is transpose: grad is [cols, rows], so the grid
    # and the rows/cols arguments are swapped relative to the forward launch.
    grad_t = alloc_like(@data)
    kernel_t = Ignis::JIT::Kernels::Elementwise.transpose_2d
    kernel_t.launch(grid: [grid_y, grid_x], block: [32, 8], args: [grad, grad_t, cols, rows])
    [grad_t]
  end

  result
end
687
+
688
# Reshape to +new_shape+ without copying: the result is a new NvArray built
# over @data's pointer.
#
# The op is recorded only when this tensor requires grad AND no no_grad scope
# is active (Tape.no_grad_active?) — the same rule should_track? applies to
# the binary ops.
#
# @param new_shape [Array<Integer>]
# @return [Tensor] non-leaf view with the requested shape
# @raise [ArgumentError] if new_shape does not describe numel elements
def reshape(new_shape)
  new_numel = new_shape.reduce(1, :*)
  raise ArgumentError, "Cannot reshape #{shape} to #{new_shape}" unless new_numel == numel

  # View over @data's buffer: non-owning, retains parent so it isn't freed
  # while the view is alive (and never double-frees the shared allocation).
  result_nv = Ignis::Shared::NvArray.new(shape: new_shape, dtype: dtype, device_id: device_id,
                                         ptr: @data.ptr, parent: @data)

  track = @requires_grad && !Tape.no_grad_active?
  result = Tensor.new(data: result_nv, requires_grad: track, is_leaf: false)
  return result unless track

  original_shape = shape
  Tape.record(result, inputs: [self]) do |grad|
    # Backward: reshape grad back to original shape (view over grad)
    grad_reshaped = Ignis::Shared::NvArray.new(shape: original_shape, dtype: dtype,
                                               device_id: device_id, ptr: grad.ptr, parent: grad)
    [grad_reshaped]
  end

  result
end
713
+
714
# Sum reduction (all elements → a single-element tensor of shape [1]).
#
# The op is recorded only when this tensor requires grad AND no no_grad scope
# is active (Tape.no_grad_active?) — the same rule should_track? applies to
# the binary ops.
#
# @return [Tensor] non-leaf result of shape [1]
def sum
  n = numel
  result_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: dtype, device_id: device_id)
  result_nv.from_host([0.0])

  kernel = Ignis::JIT::Kernels::Elementwise.sum_reduce
  kernel.launch(grid: [1], block: [1], args: [@data, result_nv, 1, n])

  track = @requires_grad && !Tape.no_grad_active?
  result = Tensor.new(data: result_nv, requires_grad: track, is_leaf: false)
  return result unless track

  orig_shape = shape
  Tape.record(result, inputs: [self]) do |grad|
    # Gradient of sum is broadcast of 1.0 to original shape
    grad_input = Ignis::Shared::NvArray.new(shape: orig_shape, dtype: dtype, device_id: device_id)
    grad_input.from_host(Array.new(n, 0.0))
    bk = Ignis::JIT::Kernels::Elementwise.broadcast_grad
    bk.launch(grid: [(n + 255) / 256], block: [256], args: [grad, grad_input, 1.0, n])
    [grad_input]
  end

  result
end
740
+
741
# Mean reduction (all elements → a single-element tensor of shape [1]).
#
# The total is computed by launching the sum_reduce kernel directly rather
# than by calling #sum: #sum would record its own tape entry for an
# intermediate tensor that nothing downstream consumes, whereas this method
# records a single entry that maps the incoming gradient straight back to
# the input (scaled by 1/n).
#
# The op is recorded only when this tensor requires grad AND no no_grad scope
# is active (Tape.no_grad_active?) — the same rule should_track? applies to
# the binary ops.
#
# @return [Tensor] non-leaf result of shape [1]
def mean
  n = numel

  total_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: dtype, device_id: device_id)
  total_nv.from_host([0.0])
  reduce = Ignis::JIT::Kernels::Elementwise.sum_reduce
  reduce.launch(grid: [1], block: [1], args: [@data, total_nv, 1, n])

  # Scale by 1/n
  result_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: dtype, device_id: device_id)
  result_nv.from_host([0.0])
  kernel = Ignis::JIT::Kernels::Elementwise.scale_forward
  kernel.launch(grid: [1], block: [1], args: [total_nv, result_nv, 1.0 / n, 1])

  track = @requires_grad && !Tape.no_grad_active?
  result = Tensor.new(data: result_nv, requires_grad: track, is_leaf: false)
  return result unless track

  orig_shape = shape
  Tape.record(result, inputs: [self]) do |grad|
    grad_input = Ignis::Shared::NvArray.new(shape: orig_shape, dtype: dtype, device_id: device_id)
    grad_input.from_host(Array.new(n, 0.0))
    bk = Ignis::JIT::Kernels::Elementwise.broadcast_grad
    bk.launch(grid: [(n + 255) / 256], block: [256], args: [grad, grad_input, 1.0 / n, n])
    [grad_input]
  end

  result
end
767
+
768
+ # -------------------------------------------------------------------
769
+ # Autograd
770
+ # -------------------------------------------------------------------
771
+
772
# Run reverse-mode automatic differentiation starting at this tensor by
# handing it, together with a seed gradient, to Tape.backward!.
#
# When no seed is supplied and the tensor holds exactly one element, a
# one-element buffer containing 1.0 is created as the seed.
#
# @param grad_output [Ignis::Shared::NvArray, nil] initial gradient
# @return [void]
# @raise [ArgumentError] if grad_output is nil and the tensor is not single-element
def backward!(grad_output = nil)
  seed = grad_output
  if seed.nil?
    raise ArgumentError, "backward! requires grad_output for non-scalar tensors" unless numel == 1

    # Scalar loss: start with 1.0
    seed = Ignis::Shared::NvArray.new(shape: [1], dtype: dtype, device_id: device_id)
    seed.from_host([1.0])
  end

  Tape.backward!(self, seed)
end
786
+
787
# Reset the gradient to all zeros (zeros, not nil).
#
# An existing gradient buffer is overwritten in place by the fill kernel, so
# no new buffer is allocated for it; when there is no gradient yet, a
# zero-filled buffer matching this tensor's shape/dtype/device is created.
#
# @return [void]
def zero_grad!
  unless @grad
    @grad = Ignis::Shared::NvArray.new(shape: shape, dtype: dtype, device_id: device_id)
    return @grad.from_host(Array.new(numel, 0.0))
  end

  count = @grad.numel
  filler = Ignis::JIT::Kernels::Elementwise.fill
  filler.launch(grid: [(count + 255) / 256], block: [256], args: [@grad, 0.0, count])
end
799
+
800
# Return a new leaf tensor that wraps the very same NvArray object but has
# gradient tracking switched off.
# @return [Tensor]
def detach
  untracked = { data: @data, requires_grad: false, is_leaf: true }
  Tensor.new(**untracked)
end
805
+
806
# Fetch the tensor's contents as a Ruby Array by delegating to
# NvArray#to_host on the wrapped buffer.
# @return [Array<Numeric>]
def to_host
  data.to_host
end
811
+
812
+ # Get scalar value (for single-element tensors).
813
+ # @return [Float, Integer]
814
def item
  # Only meaningful for tensors holding exactly one element.
  if numel != 1
    raise "item() requires a single-element tensor, got shape #{shape}"
  end

  # Pull the values to the host and return the only one.
  host_values = to_host
  host_values[0]
end
818
+
819
+ # -------------------------------------------------------------------
820
+ # Internal helpers
821
+ # -------------------------------------------------------------------
822
+
823
+ private
824
+
825
+ # Scalar multiplication
826
+ # @param scalar [Numeric]
827
+ # @return [Tensor]
828
def scalar_mul(scalar)
  # Multiplies every element by +scalar+ and returns a new non-leaf Tensor.
  #
  # Tracking is decided by should_track?, so this op follows the same rule as
  # the other ops that use that helper: nothing is recorded, and the result
  # does not require grad, while Tape.no_grad_active? is true.
  factor = scalar.to_f
  track = should_track?(scalar)

  # dst = src * factor, elementwise, written into a freshly allocated array.
  # Shared by the forward pass and the backward closure.
  scale = lambda do |src, count|
    dst = alloc_like(src)
    kernel = Ignis::JIT::Kernels::Elementwise.scale_forward
    kernel.launch(grid: [(count + 255) / 256], block: [256], args: [src, dst, factor, count])
    dst
  end

  result = Tensor.new(data: scale.call(@data, numel), requires_grad: track, is_leaf: false)

  if track
    # d(factor * x)/dx = factor, so the upstream gradient is scaled by the
    # same factor. The block returns one gradient per recorded input.
    Tape.record(result, inputs: [self]) { |grad| [scale.call(grad, grad.numel)] }
  end

  result
end
848
+
849
+ # Check if gradient tracking should be enabled for a binary op.
850
+ # @param other [Tensor]
851
+ # @return [Boolean]
852
def should_track?(other)
  if Tape.no_grad_active?
    # No-grad mode overrides everything else.
    false
  elsif @requires_grad
    @requires_grad
  else
    # Otherwise tracking depends on the other operand, which only counts
    # when it is itself a Tensor.
    other.is_a?(Tensor) && other.requires_grad
  end
end
856
+
857
+ # Ensure argument is a Tensor.
858
+ # @param other [Tensor, Numeric]
859
+ # @return [Tensor]
860
def ensure_tensor(other)
  case other
  when Tensor
    # Already a Tensor: hand it back untouched.
    other
  else
    # Anything else is treated as a scalar: build an array with this tensor's
    # shape/dtype/device where every element equals other.to_f.
    constant = Ignis::Shared::NvArray.new(shape: shape, dtype: dtype, device_id: device_id)
    constant.from_host(Array.new(numel) { other.to_f })
    Tensor.new(data: constant, requires_grad: false)
  end
end
867
+
868
+ # Allocate a new NvArray with same shape/dtype/device as source.
869
+ # @param source [Ignis::Shared::NvArray]
870
+ # @return [Ignis::Shared::NvArray]
871
def alloc_like(source)
  Ignis::Shared::NvArray.new(
    shape: source.shape,
    dtype: source.dtype,
    device_id: source.device_id
  ).tap do |buffer|
    buffer.to_device # allocate on the device
    buffer.zero!     # clear on the device instead of uploading a host array of zeros
  end
end
877
+
878
+ # Build a small fp32 NvArray from a Ruby Float array (e.g. a RoPE inv_freq table).
879
+ # @param floats [Array<Float>]
880
+ # @return [Ignis::Shared::NvArray]
881
def nv_from_floats(floats)
  # One-dimensional float32 array on this tensor's device, one slot per input value.
  table = Ignis::Shared::NvArray.new(shape: [floats.length], dtype: :float32, device_id: device_id)
  # Coerce every entry to Float before uploading.
  as_floats = floats.map { |value| value.to_f }
  table.from_host(as_floats)
  table
end
886
+
887
+ # Allocate a zeroed NvArray of the given shape (this tensor's dtype/device).
888
+ # @param shp [Array<Integer>]
889
+ # @return [Ignis::Shared::NvArray]
890
def zeros_nv(shp)
  Ignis::Shared::NvArray.new(shape: shp, dtype: dtype, device_id: device_id).tap do |buffer|
    buffer.to_device # allocate on the device
    buffer.zero!     # clear on the device
  end
end
896
+
897
+ # Copy columns [col_off, col_off+len) of every row into a fresh [rows, len] array.
898
+ # @return [Ignis::Shared::NvArray]
899
def slice_cols_nv(src, col_off, len, rows, total_cols)
  threads = 256

  # Destination is [rows, len], matching the source's dtype and device.
  dest = Ignis::Shared::NvArray.new(shape: [rows, len], dtype: src.dtype, device_id: src.device_id)
  dest.to_device

  # One thread per output element, rounded up to whole blocks.
  element_count = rows * len
  block_count = (element_count + threads - 1) / threads

  Ignis::JIT::Kernels::Elementwise.slice_cols.launch(
    grid: [block_count],
    block: [threads],
    args: [src, dest, rows, total_cols, col_off, len]
  )
  dest
end
908
+
909
+ # Write a [rows, len] array into columns [col_off, col_off+len) of dst [rows, total_cols].
910
+ # @return [Ignis::Shared::NvArray] dst
911
def scatter_cols_nv!(src, dst, col_off, len, rows, total_cols)
  threads = 256

  # One thread per source element, rounded up to whole blocks.
  element_count = rows * len
  block_count = (element_count + threads - 1) / threads

  Ignis::JIT::Kernels::Elementwise.scatter_cols.launch(
    grid: [block_count],
    block: [threads],
    args: [src, dst, rows, total_cols, col_off, len]
  )

  # dst is the array handed to the kernel above; return it for chaining.
  dst
end
918
+
919
+ # Accumulating scatter: dst[:, col_off...] += src. For GQA backward, where
920
+ # several query heads share one KV head and their dK/dV must SUM.
921
+ # @return [Ignis::Shared::NvArray] dst
922
def scatter_cols_add_nv!(src, dst, col_off, len, rows, total_cols)
  threads = 256

  # One thread per source element, rounded up to whole blocks.
  element_count = rows * len
  block_count = (element_count + threads - 1) / threads

  # Uses the accumulating scatter kernel rather than the plain scatter.
  Ignis::JIT::Kernels::Elementwise.scatter_cols_add.launch(
    grid: [block_count],
    block: [threads],
    args: [src, dst, rows, total_cols, col_off, len]
  )

  dst
end
929
+ end
930
+ end
931
+ end