grnexus 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1103 @@
1
+ require_relative 'grnexus_activations'
2
+
3
+ module GRNEXUSLayer
4
+ class Layer
5
+ def initialize
6
+ @weights = nil
7
+ @biases = nil
8
+ @trainable = true
9
+ @cache = {}
10
+ end
11
+
12
+ def forward(input)
13
+ raise NotImplementedError, "Debes implementar el método forward"
14
+ end
15
+
16
+ def backward(gradient, learning_rate)
17
+ raise NotImplementedError, "Debes implementar el método backward"
18
+ end
19
+
20
+ def trainable?
21
+ @trainable
22
+ end
23
+
24
+ def parameters
25
+ [@weights, @biases].compact
26
+ end
27
+
28
+ def zero_gradients!
29
+ @weight_gradient = nil if @weight_gradient
30
+ @bias_gradient = nil if @bias_gradient
31
+ end
32
+
33
+ def update_parameters(learning_rate)
34
+ if @weight_gradient
35
+ update_matrix!(@weights, @weight_gradient, learning_rate)
36
+ end
37
+ if @bias_gradient
38
+ update_vector!(@biases, @bias_gradient, learning_rate)
39
+ end
40
+ end
41
+
42
+ private
43
+
44
+ def update_matrix!(matrix, gradient, lr)
45
+ (0...matrix.length).each do |i|
46
+ (0...matrix[i].length).each do |j|
47
+ matrix[i][j] -= lr * gradient[i][j]
48
+ end
49
+ end
50
+ end
51
+
52
+ def update_vector!(vector, gradient, lr)
53
+ (0...vector.length).each do |i|
54
+ vector[i] -= lr * gradient[i]
55
+ end
56
+ end
57
+ end
58
+
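# Usage sketch (editorial, not part of the packaged source): how a custom layer
# is expected to plug into this base class, assuming the classes in this file
# are already loaded. ScaleLayer is a hypothetical name used only for illustration.
class ScaleLayer < GRNEXUSLayer::Layer
  def initialize(factor)
    super()
    @factor = factor
    @trainable = false
  end

  # Works on a plain vector of numerics
  def forward(input)
    input.map { |v| v * @factor }
  end

  # Chain rule for a constant scale: the gradient is scaled by the same factor
  def backward(gradient, learning_rate = nil)
    gradient.map { |g| g * @factor }
  end
end

ScaleLayer.new(2.0).forward([1.0, 2.0, 3.0])   # => [2.0, 4.0, 6.0]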
59
+ class DenseLayer < Layer
60
+ attr_accessor :units, :input_dim, :activation, :use_bias, :weights, :biases
61
+
62
+ def initialize(units:, input_dim:, activation: nil, use_bias: true, weight_init: :xavier)
63
+ super()
64
+ @units = units
65
+ @input_dim = input_dim
66
+ @activation = activation.is_a?(Symbol) ? GRNEXUSActivations.const_get(activation).new : activation
67
+ @use_bias = use_bias
68
+
69
+ @weights = initialize_weights(weight_init, @input_dim, @units)
70
+ @biases = @use_bias ? Array.new(@units, 0.0) : nil
71
+ end
72
+
73
+ def forward(input)
74
+ batch_case = input[0].is_a?(Array) && input[0].length > 0 && input[0][0].is_a?(Numeric)
75
+
76
+ if batch_case
77
+ @cache[:input] = input.dup
+ @cache[:pre_activation] = []
78
+ output_batch = []
79
+
80
+ input.each do |x|
81
+ # weights is (input_dim x units), x is (input_dim)
82
+ # result should be (units)
83
+ z = Array.new(@units, 0.0)
84
+ @units.times do |i|
85
+ sum = 0.0
86
+ @input_dim.times do |j|
87
+ sum += @weights[j][i] * x[j]
88
+ end
89
+ z[i] = sum
90
+ end
91
+
92
+ z = add_vectors(z, @biases) if @biases
93
+
94
+ if @activation
95
+ @cache[:pre_activation] ||= []
96
+ @cache[:pre_activation] << z.dup
97
+ z = @activation.call(z)
98
+ end
99
+
100
+ output_batch << z
101
+ end
102
+
103
+ output_batch
104
+ else
105
+ @cache[:input] = input.dup
106
+ # weights is (input_dim x units), input is (input_dim)
107
+ # result should be (units)
108
+ z = Array.new(@units, 0.0)
109
+ @units.times do |i|
110
+ sum = 0.0
111
+ @input_dim.times do |j|
112
+ sum += @weights[j][i] * input[j]
113
+ end
114
+ z[i] = sum
115
+ end
116
+
117
+ z = add_vectors(z, @biases) if @biases
118
+
119
+ if @activation
120
+ @cache[:pre_activation] = z.dup
121
+ z = @activation.call(z)
122
+ end
123
+
124
+ z
125
+ end
126
+ end
127
+
128
+ def backward(gradient, learning_rate)
129
+ batch_case = gradient[0].is_a?(Array) && gradient[0][0].is_a?(Numeric)
130
+ gradients = batch_case ? gradient : [gradient]
131
+ inputs = batch_case ? @cache[:input] : [@cache[:input]]
132
+
133
+ input_gradients = []
134
+ weight_gradients = []
135
+ bias_gradients = [] if @use_bias
136
+
137
+ gradients.each_with_index do |grad, idx|
138
+ current_input = inputs[idx]
139
+
140
+ if @activation
141
+ pre_act = batch_case ? @cache[:pre_activation][idx] : @cache[:pre_activation]
142
+ act_deriv = @activation.call(pre_act, derivative: true)
143
+ grad = multiply_elementwise(grad, act_deriv)
144
+ end
145
+
146
+ # Weight gradient: outer product of input and grad
147
+ # weights is (input_dim x units), so gradient should be same shape
148
+ weight_grad = Array.new(@input_dim) { Array.new(@units, 0.0) }
149
+ @input_dim.times do |i|
150
+ @units.times do |j|
151
+ weight_grad[i][j] = current_input[i] * grad[j]
152
+ end
153
+ end
154
+ weight_gradients << weight_grad
155
+
156
+ if @biases
157
+ bias_gradients << grad.dup
158
+ end
159
+
160
+ # Input gradient: weights^T * grad
161
+ input_grad = Array.new(@input_dim, 0.0)
162
+ @input_dim.times do |i|
163
+ sum = 0.0
164
+ @units.times do |j|
165
+ sum += @weights[i][j] * grad[j]
166
+ end
167
+ input_grad[i] = sum
168
+ end
169
+ input_gradients << input_grad
170
+ end
171
+
172
+ # Average gradients and update weights
173
+ avg_weight_grad = average_matrices(weight_gradients)
174
+ @input_dim.times do |i|
175
+ @units.times do |j|
176
+ @weights[i][j] -= learning_rate * avg_weight_grad[i][j]
177
+ end
178
+ end
179
+
180
+ if @biases
181
+ avg_bias_grad = average_arrays(bias_gradients)
182
+ @units.times do |i|
183
+ @biases[i] -= learning_rate * avg_bias_grad[i]
184
+ end
185
+ end
186
+
187
+ batch_case ? input_gradients : input_gradients.first
188
+ end
189
+
190
+ private
191
+
192
+ def initialize_weights(method, input_dim, units)
193
+ case method
194
+ when :xavier
195
+ std = Math.sqrt(2.0 / (input_dim + units))
196
+ Array.new(input_dim) { Array.new(units) { rand_normal(0, std) } }
197
+ when :he
198
+ std = Math.sqrt(2.0 / input_dim)
199
+ Array.new(input_dim) { Array.new(units) { rand_normal(0, std) } }
200
+ when :random
201
+ Array.new(input_dim) { Array.new(units) { rand * 2 - 1 } }
202
+ else
203
+ Array.new(input_dim) { Array.new(units, 0.0) }
204
+ end
205
+ end
206
+
207
+ def rand_normal(mean, std_dev)
208
+ u1 = rand
209
+ u2 = rand
210
+ z0 = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math::PI * u2)
211
+ z0 * std_dev + mean
212
+ end
213
+
214
+ def multiply_matrix_vector(matrix, vector)
215
+ result = Array.new(matrix.length, 0.0)
216
+ matrix.each_with_index do |row, i|
217
+ result[i] = dot_product(row, vector)
218
+ end
219
+ result
220
+ end
221
+
222
+ def dot_product(a, b)
223
+ sum = 0.0
224
+ (0...a.length).each { |i| sum += a[i] * b[i] }
225
+ sum
226
+ end
227
+
228
+ def add_vectors(a, b)
229
+ (0...a.length).map { |i| a[i] + b[i] }
230
+ end
231
+
232
+ def multiply_elementwise(a, b)
233
+ (0...a.length).map { |i| a[i] * b[i] }
234
+ end
235
+
236
+ def multiply_outer_product(vec1, vec2)
237
+ Array.new(vec1.length) do |i|
238
+ Array.new(vec2.length) { |j| vec1[i] * vec2[j] }
239
+ end
240
+ end
241
+
242
+ def average_matrices(matrices)
243
+ rows, cols = matrices[0].length, matrices[0][0].length
244
+ avg = Array.new(rows) { Array.new(cols, 0.0) }
245
+
246
+ matrices.each do |matrix|
247
+ matrix.each_with_index do |row, i|
248
+ row.each_with_index { |val, j| avg[i][j] += val }
249
+ end
250
+ end
251
+
252
+ avg.map! { |row| row.map! { |val| val / matrices.length } }
253
+ avg
254
+ end
255
+
256
+ def average_arrays(arrays)
257
+ length = arrays[0].length
258
+ avg = Array.new(length, 0.0)
259
+
260
+ arrays.each do |arr|
261
+ arr.each_with_index { |val, i| avg[i] += val }
262
+ end
263
+
264
+ avg.map! { |val| val / arrays.length }
265
+ end
266
+ end
267
+
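# Usage sketch (editorial, not part of the packaged source), assuming this file
# and grnexus_activations are loaded and that :Sigmoid resolves to
# GRNEXUSActivations::Sigmoid, which is referenced later in this file.
dense = GRNEXUSLayer::DenseLayer.new(units: 2, input_dim: 3, activation: :Sigmoid)
output = dense.forward([0.5, -1.0, 2.0])     # => activation vector of length 2
# backward takes dLoss/dOutput plus a learning rate, updates the weights and
# biases in place, and returns dLoss/dInput (length 3 here). An array of input
# vectors is treated as a batch by the same two methods.
input_grad = dense.backward([0.1, -0.2], 0.01)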
268
+ class ActivationLayer < Layer
269
+ attr_accessor :activation
270
+
271
+ def initialize(activation)
272
+ super()
273
+ @activation = activation.is_a?(Symbol) ? GRNEXUSActivations.const_get(activation).new : activation
274
+ @trainable = false
275
+ end
276
+
277
+ def forward(input)
278
+ @cache[:input] = input.dup
279
+ if input[0].is_a?(Array)
280
+ input.map { |x| @activation.call(x) }
281
+ else
282
+ @activation.call(input)
283
+ end
284
+ end
285
+
286
+ def backward(gradient, learning_rate = nil)
287
+ input_cache = @cache[:input]
288
+
289
+ if gradient[0].is_a?(Array)
290
+ gradient.zip(input_cache).map do |grad, cached_input|
291
+ activation_deriv = @activation.call(cached_input, derivative: true)
292
+ multiply_elementwise(grad, activation_deriv)
293
+ end
294
+ else
295
+ activation_deriv = @activation.call(input_cache, derivative: true)
296
+ multiply_elementwise(gradient, activation_deriv)
297
+ end
298
+ end
299
+
300
+ private
301
+
302
+ def multiply_elementwise(a, b)
303
+ (0...a.length).map { |i| a[i] * b[i] }
304
+ end
305
+ end
306
+
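# Usage sketch (editorial, not part of the packaged source): a standalone
# activation with no trainable parameters. :Tanh is assumed to resolve to
# GRNEXUSActivations::Tanh, which this file references in the LSTM/GRU layers.
act = GRNEXUSLayer::ActivationLayer.new(:Tanh)
y  = act.forward([-1.0, 0.0, 1.0])   # element-wise tanh
dx = act.backward([0.1, 0.1, 0.1])   # incoming gradient scaled by the activation derivative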
307
+ class DropoutLayer < Layer
308
+ attr_accessor :rate
309
+
310
+ def initialize(rate: 0.5)
311
+ super()
312
+ @rate = rate
313
+ @mask = nil
314
+ @trainable = false
315
+ end
316
+
317
+ def forward(input, training: true)
318
+ @cache[:training] = training
319
+ if training
320
+ if input[0].is_a?(Array)
321
+ @mask = input.map { |sample| sample.map { rand > @rate ? 1.0 / (1.0 - @rate) : 0.0 } }
322
+ multiply_batch_elementwise(input, @mask)
323
+ else
324
+ @mask = input.map { rand > @rate ? 1.0 / (1.0 - @rate) : 0.0 }
325
+ multiply_elementwise(input, @mask)
326
+ end
327
+ else
328
+ input
329
+ end
330
+ end
331
+
332
+ def backward(gradient, learning_rate = nil)
333
+ training = @cache[:training]
334
+ if training && @mask
335
+ if gradient[0].is_a?(Array)
336
+ multiply_batch_elementwise(gradient, @mask)
337
+ else
338
+ multiply_elementwise(gradient, @mask)
339
+ end
340
+ else
341
+ gradient
342
+ end
343
+ end
344
+
345
+ private
346
+
347
+ def multiply_elementwise(a, b)
348
+ (0...a.length).map { |i| a[i] * b[i] }
349
+ end
350
+
351
+ def multiply_batch_elementwise(batch_a, batch_b)
352
+ batch_a.zip(batch_b).map do |a, b|
353
+ multiply_elementwise(a, b)
354
+ end
355
+ end
356
+ end
357
+
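# Usage sketch (editorial, not part of the packaged source). This is inverted
# dropout: surviving values are scaled by 1/(1 - rate) during training so that
# inference (training: false) can pass the input through unchanged.
drop = GRNEXUSLayer::DropoutLayer.new(rate: 0.5)
x = [1.0, 2.0, 3.0, 4.0]
drop.forward(x, training: true)    # roughly half the values zeroed, the rest scaled by 2.0
drop.forward(x, training: false)   # => [1.0, 2.0, 3.0, 4.0]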
358
+ class BatchNormLayer < Layer
359
+ attr_accessor :epsilon, :momentum, :gamma, :beta
360
+
361
+ def initialize(epsilon: 1e-5, momentum: 0.1)
362
+ super()
363
+ @epsilon = epsilon
364
+ @momentum = momentum
365
+ @running_mean = nil
366
+ @running_var = nil
367
+ @gamma = 1.0
368
+ @beta = 0.0
369
+ @trainable = true
370
+ end
371
+
372
+ def forward(input, training: true)
373
+ batch_case = input[0].is_a?(Array) && input[0][0].is_a?(Numeric)
374
+ input_tensor = batch_case ? input : [input]
375
+
376
+ if training
377
+ batch_mean = calculate_mean(input_tensor)
378
+ batch_var = calculate_variance(input_tensor, batch_mean)
379
+ update_running_stats(batch_mean, batch_var)
380
+
381
+ @cache[:mean] = batch_mean
382
+ @cache[:var] = batch_var
383
+ @cache[:inv_std] = batch_var.map { |v| 1.0 / Math.sqrt(v + @epsilon) }
384
+ @cache[:x_norm] = normalize_batch(input_tensor, batch_mean, batch_var)
385
+ @cache[:x_centered] = center_batch(input_tensor, batch_mean)
386
+ else
387
+ # If there are no running stats yet, fall back to the current batch statistics
388
+ if @running_mean.nil? || @running_var.nil?
389
+ batch_mean = calculate_mean(input_tensor)
390
+ batch_var = calculate_variance(input_tensor, batch_mean)
391
+ update_running_stats(batch_mean, batch_var)
392
+ @cache[:x_norm] = normalize_batch(input_tensor, batch_mean, batch_var)
393
+ else
394
+ running_inv_std = @running_var.map { |v| 1.0 / Math.sqrt(v + @epsilon) }
395
+ @cache[:x_norm] = normalize_batch(input_tensor, @running_mean, @running_var, running_inv_std)
396
+ end
397
+ end
398
+
399
+ output_tensor = apply_affine_transform(batch_case ? @cache[:x_norm] : @cache[:x_norm].first)
400
+ batch_case ? output_tensor : output_tensor.first
401
+ end
402
+
403
+ def backward(gradient, learning_rate = nil)
404
+ batch_case = gradient[0].is_a?(Array) && gradient[0][0].is_a?(Numeric)
405
+ grad_tensor = batch_case ? gradient : [gradient]
406
+
407
+ @gamma_gradient = calculate_gamma_gradient(grad_tensor, @cache[:x_norm])
408
+ @beta_gradient = calculate_beta_gradient(grad_tensor)
409
+
410
+ # dx_norm = grad_tensor * gamma (element-wise)
411
+ dx_norm = grad_tensor.map { |grad_sample| grad_sample.map { |g| g * @gamma } }
412
+
413
+ n = grad_tensor.length
414
+ inv_std = @cache[:inv_std]
415
+
416
+ dx = []
417
+ (0...n).each do |i|
418
+ sum1 = multiply_elementwise(dx_norm[i], inv_std)
419
+ mean_dx_norm = dx_norm[i].map { |val| val / n }.reduce(:+)
420
+ sum2 = @cache[:x_centered][i].zip(inv_std).map { |c, s| c * s**3 }.map { |val| val / n }
421
+ sum2 = multiply_elementwise(sum2, Array.new(sum2.length, mean_dx_norm))
422
+
423
+ dx << subtract_vectors(sum1, sum2)
424
+ end
425
+
426
+ batch_case ? dx : dx.first
427
+ end
428
+
429
+ private
430
+
431
+ def calculate_mean(batch)
432
+ features = batch[0].length
433
+ means = Array.new(features, 0.0)
434
+
435
+ batch.each do |sample|
436
+ sample.each_with_index { |val, i| means[i] += val }
437
+ end
438
+
439
+ means.map! { |sum| sum / batch.length }
440
+ end
441
+
442
+ def calculate_variance(batch, means)
443
+ features = batch[0].length
444
+ vars = Array.new(features, 0.0)
445
+
446
+ batch.each do |sample|
447
+ sample.each_with_index { |val, i| vars[i] += (val - means[i])**2 }
448
+ end
449
+
450
+ vars.map! { |sum| sum / batch.length }
451
+ end
452
+
453
+ def update_running_stats(batch_mean, batch_var)
454
+ if @running_mean.nil?
455
+ @running_mean = batch_mean.dup
456
+ @running_var = batch_var.dup
457
+ else
458
+ @running_mean = multiply_scalar_add_vector((1 - @momentum), @running_mean, @momentum, batch_mean)
459
+ @running_var = multiply_scalar_add_vector((1 - @momentum), @running_var, @momentum, batch_var)
460
+ end
461
+ end
462
+
463
+ def normalize_batch(batch, means, vars, inv_std = nil)
464
+ inv_std ||= vars.map { |v| 1.0 / Math.sqrt(v + @epsilon) }
465
+
466
+ batch.map do |sample|
467
+ sample.zip(means, inv_std).map { |val, mean, std_inv| (val - mean) * std_inv }
468
+ end
469
+ end
470
+
471
+ def center_batch(batch, means)
472
+ batch.map do |sample|
473
+ sample.zip(means).map { |val, mean| val - mean }
474
+ end
475
+ end
476
+
477
+ def apply_affine_transform(normalized)
478
+ if normalized[0].is_a?(Array)
479
+ normalized.map do |sample|
480
+ sample.map { |val| @gamma * val + @beta }
481
+ end
482
+ else
483
+ normalized.map { |val| @gamma * val + @beta }
484
+ end
485
+ end
486
+
487
+ def calculate_gamma_gradient(gradient_batch, x_norm_batch)
488
+ grad_sum = Array.new(x_norm_batch[0].length, 0.0)
489
+
490
+ gradient_batch.each_with_index do |grad_sample, sample_idx|
491
+ grad_sample.each_with_index { |grad_val, i| grad_sum[i] += grad_val * x_norm_batch[sample_idx][i] }
492
+ end
493
+
494
+ grad_sum
495
+ end
496
+
497
+ def calculate_beta_gradient(gradient_batch)
498
+ grad_sum = Array.new(gradient_batch[0].length, 0.0)
499
+
500
+ gradient_batch.each do |grad_sample|
501
+ grad_sample.each_with_index { |grad_val, i| grad_sum[i] += grad_val }
502
+ end
503
+
504
+ grad_sum
505
+ end
506
+
507
+ def multiply_batch_elementwise(batch_a, scalar_or_vector)
508
+ if scalar_or_vector.is_a?(Array)
509
+ batch_a.map do |sample|
510
+ sample.zip(scalar_or_vector).map { |val, mult| val * mult }
511
+ end
512
+ else
513
+ batch_a.map do |sample|
514
+ sample.map { |val| val * scalar_or_vector }
515
+ end
516
+ end
517
+ end
518
+
519
+ def subtract_vectors(a, b)
520
+ (0...a.length).map { |i| a[i] - b[i] }
521
+ end
522
+
523
+ def multiply_elementwise(a, b)
524
+ (0...a.length).map { |i| a[i] * b[i] }
525
+ end
526
+
527
+ def multiply_scalar_add_vector(scalar1, vec1, scalar2, vec2)
528
+ (0...vec1.length).map { |i| scalar1 * vec1[i] + scalar2 * vec2[i] }
529
+ end
530
+ end
531
+
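# Usage sketch (editorial, not part of the packaged source). Each feature is
# normalized with the batch statistics during training, while running averages
# are accumulated for use at inference time.
bn = GRNEXUSLayer::BatchNormLayer.new
batch = [[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]]
bn.forward(batch, training: true)            # per-feature normalization, updates running stats
bn.forward([[1.5, 15.0]], training: false)   # normalizes with the stored running mean/variance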
532
+ class Conv2DLayer < Layer
533
+ def initialize(filters:, kernel_size:, stride: 1, padding: 0)
534
+ super()
535
+ @filters = filters
536
+ @kernel_size = kernel_size.is_a?(Array) ? kernel_size : [kernel_size, kernel_size]
537
+ @stride = stride
538
+ @padding = padding
539
+
540
+ kh, kw = @kernel_size
541
+ @kernels = initialize_conv_kernels(@filters, kh, kw)
542
+ @biases = Array.new(@filters, 0.0)
543
+ end
544
+
545
+ def forward(input)
546
+ batch_case = input[0].is_a?(Array) && input[0][0].is_a?(Array) && input[0][0][0].is_a?(Numeric)
547
+ input_tensor = batch_case ? input : [input]
548
+
549
+ output_batch = input_tensor.map do |single_input|
550
+ convolve_2d(single_input)
551
+ end
552
+
553
+ batch_case ? output_batch : output_batch.first
554
+ end
555
+
556
+ def backward(gradient, learning_rate = nil)
557
+ gradient
558
+ end
559
+
560
+ private
561
+
562
+ def initialize_conv_kernels(filters, kh, kw)
563
+ fan_in = kh * kw
564
+ std = Math.sqrt(2.0 / fan_in)
565
+
566
+ Array.new(filters) do
567
+ Array.new(kh) { Array.new(kw) { rand_normal(0, std) } }
568
+ end
569
+ end
570
+
571
+ def convolve_2d(input_image)
572
+ h, w = input_image.length, input_image[0].length
573
+ kh, kw = @kernels[0].length, @kernels[0][0].length
574
+
575
+ out_h = (h + 2 * @padding - kh) / @stride + 1
576
+ out_w = (w + 2 * @padding - kw) / @stride + 1
577
+
578
+ output = Array.new(out_h) { Array.new(out_w) { Array.new(@filters, 0.0) } }
579
+ padded = @padding > 0 ? pad_image(input_image, @padding) : input_image
580
+
581
+ (0...out_h).each do |oh|
582
+ (0...out_w).each do |ow|
583
+ roi_start_h = oh * @stride
584
+ roi_start_w = ow * @stride
585
+
586
+ (0...@filters).each do |f|
587
+ sum = 0.0
588
+ (0...kh).each do |kh_off|
589
+ (0...kw).each do |kw_off|
590
+ ih = roi_start_h + kh_off
591
+ iw = roi_start_w + kw_off
592
+ sum += padded[ih][iw] * @kernels[f][kh_off][kw_off]
593
+ end
594
+ end
595
+ output[oh][ow][f] = sum + @biases[f]
596
+ end
597
+ end
598
+ end
599
+
600
+ output
601
+ end
602
+
603
+ def pad_image(image, padding)
604
+ h, w = image.length, image[0].length
605
+ padded_h, padded_w = h + 2 * padding, w + 2 * padding
606
+
607
+ padded = Array.new(padded_h) { Array.new(padded_w, 0.0) }
608
+
609
+ (0...h).each do |ih|
610
+ (0...w).each do |iw|
611
+ padded[ih + padding][iw + padding] = image[ih][iw]
612
+ end
613
+ end
614
+
615
+ padded
616
+ end
617
+
618
+ def rand_normal(mean, std_dev)
619
+ u1 = rand
620
+ u2 = rand
621
+ z0 = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math::PI * u2)
622
+ z0 * std_dev + mean
623
+ end
624
+ end
625
+
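# Usage sketch (editorial, not part of the packaged source): a single-channel
# convolution over one 5x5 grid. With kernel_size 3, stride 1 and no padding
# the output grid is 3x3, holding one value per filter at each position.
conv = GRNEXUSLayer::Conv2DLayer.new(filters: 2, kernel_size: 3)
image = Array.new(5) { Array.new(5) { rand } }
feature_maps = conv.forward(image)   # => 3x3 grid of 2-element arrays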
626
+ class MaxPoolingLayer < Layer
627
+ def initialize(pool_size:, stride: nil)
628
+ super()
629
+ @pool_size = pool_size.is_a?(Array) ? pool_size : [pool_size, pool_size]
630
+ @stride = stride || @pool_size
631
+ @stride = @stride.is_a?(Array) ? @stride : [@stride, @stride]
632
+ @trainable = false
633
+ end
634
+
635
+ def forward(input)
636
+ batch_case = input[0].is_a?(Array) && input[0][0].is_a?(Array) && input[0][0][0].is_a?(Numeric)
637
+ input_tensor = batch_case ? input : [input]
638
+
639
+ output_batch = input_tensor.map do |single_input|
640
+ pool_2d(single_input)
641
+ end
642
+
643
+ batch_case ? output_batch : output_batch.first
644
+ end
645
+
646
+ def backward(gradient, learning_rate = nil)
647
+ gradient
648
+ end
649
+
650
+ private
651
+
652
+ def pool_2d(input_image)
653
+ h, w = input_image.length, input_image[0].length
654
+ ph, pw = @pool_size
655
+ sh, sw = @stride
656
+
657
+ out_h = (h - ph) / sh + 1
658
+ out_w = (w - pw) / sw + 1
659
+
660
+ output = Array.new(out_h) { Array.new(out_w, 0.0) }
661
+ @cache[:switch_indices] ||= []
662
+ switch_indices_map = []
663
+
664
+ (0...out_h).each do |oh|
665
+ (0...out_w).each do |ow|
666
+ pool_start_h = oh * sh
667
+ pool_start_w = ow * sw
668
+
669
+ max_val = -Float::INFINITY
670
+ max_h, max_w = 0, 0
671
+
672
+ (0...ph).each do |ph_off|
673
+ (0...pw).each do |pw_off|
674
+ ih = pool_start_h + ph_off
675
+ iw = pool_start_w + pw_off
676
+ if input_image[ih][iw] > max_val
677
+ max_val = input_image[ih][iw]
678
+ max_h, max_w = ih, iw
679
+ end
680
+ end
681
+ end
682
+
683
+ output[oh][ow] = max_val
684
+ switch_indices_map << [max_h, max_w]
685
+ end
686
+ end
687
+
688
+ @cache[:switch_indices] << switch_indices_map
689
+ output
690
+ end
691
+ end
692
+
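# Usage sketch (editorial, not part of the packaged source): 2x2 max pooling
# with the default stride equal to the pool size over a single 4x4 grid.
pool = GRNEXUSLayer::MaxPoolingLayer.new(pool_size: 2)
grid = [[1.0, 3.0, 2.0, 4.0],
        [5.0, 6.0, 1.0, 2.0],
        [7.0, 2.0, 9.0, 1.0],
        [3.0, 4.0, 5.0, 8.0]]
pool.forward(grid)   # => [[6.0, 4.0], [7.0, 9.0]]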
693
+ class LSTMLayer < Layer
694
+ def initialize(units:, input_size:)
695
+ super()
696
+ @units = units
697
+ @input_size = input_size
698
+ @hidden_size = units
699
+
700
+ @wf = initialize_weights(:xavier, @input_size, @units)
701
+ @uf = initialize_weights(:xavier, @hidden_size, @units)
702
+ @bf = Array.new(@units, 0.0)
703
+
704
+ @wi = initialize_weights(:xavier, @input_size, @units)
705
+ @ui = initialize_weights(:xavier, @hidden_size, @units)
706
+ @bi = Array.new(@units, 0.0)
707
+
708
+ @wo = initialize_weights(:xavier, @input_size, @units)
709
+ @uo = initialize_weights(:xavier, @hidden_size, @units)
710
+ @bo = Array.new(@units, 0.0)
711
+
712
+ @wc = initialize_weights(:xavier, @input_size, @units)
713
+ @uc = initialize_weights(:xavier, @hidden_size, @units)
714
+ @bc = Array.new(@units, 0.0)
715
+ end
716
+
717
+ def forward(input_sequence)
718
+ batch_case = input_sequence[0][0].is_a?(Array) && input_sequence[0][0][0].is_a?(Numeric)
719
+ sequences = batch_case ? transpose_batch_sequences(input_sequence) : [input_sequence]
720
+
721
+ outputs_batch = sequences.map do |single_sequence|
722
+ hidden_state = Array.new(@units, 0.0)
723
+ cell_state = Array.new(@units, 0.0)
724
+ outputs = []
725
+
726
+ single_sequence.each do |input_t|
727
+ hidden_state, cell_state = lstm_step(input_t, hidden_state, cell_state)
728
+ outputs << hidden_state.dup
729
+ end
730
+
731
+ outputs
732
+ end
733
+
734
+ if batch_case
735
+ transpose_batch_sequences(outputs_batch)
736
+ else
737
+ outputs_batch.first
738
+ end
739
+ end
740
+
741
+ def backward(gradient, learning_rate = nil)
742
+ gradient
743
+ end
744
+
745
+ private
746
+
747
+ def lstm_step(input_t, prev_hidden, prev_cell)
748
+ f_input = add_vectors(multiply_matrix_vector(@wf, input_t), multiply_matrix_vector(@uf, prev_hidden))
749
+ f_input = add_vectors(f_input, @bf)
750
+ f_gate = GRNEXUSActivations::Sigmoid.new.call(f_input)
751
+
752
+ i_input = add_vectors(multiply_matrix_vector(@wi, input_t), multiply_matrix_vector(@ui, prev_hidden))
753
+ i_input = add_vectors(i_input, @bi)
754
+ i_gate = GRNEXUSActivations::Sigmoid.new.call(i_input)
755
+
756
+ o_input = add_vectors(multiply_matrix_vector(@wo, input_t), multiply_matrix_vector(@uo, prev_hidden))
757
+ o_input = add_vectors(o_input, @bo)
758
+ o_gate = GRNEXUSActivations::Sigmoid.new.call(o_input)
759
+
760
+ c_input = add_vectors(multiply_matrix_vector(@wc, input_t), multiply_matrix_vector(@uc, prev_hidden))
761
+ c_input = add_vectors(c_input, @bc)
762
+ candidate = GRNEXUSActivations::Tanh.new.call(c_input)
763
+
764
+ new_cell = add_vectors(
765
+ multiply_elementwise(f_gate, prev_cell),
766
+ multiply_elementwise(i_gate, candidate)
767
+ )
768
+
769
+ tanh_cell = GRNEXUSActivations::Tanh.new.call(new_cell)
770
+ new_hidden = multiply_elementwise(o_gate, tanh_cell)
771
+
772
+ [new_hidden, new_cell]
773
+ end
774
+
775
+ def transpose_batch_sequences(sequences)
776
+ seq_len = sequences.length
777
+ batch_size = sequences[0].length
778
+ input_size = sequences[0][0].length
779
+
780
+ transposed = Array.new(batch_size) { Array.new(seq_len) { Array.new(input_size) } }
781
+
782
+ (0...seq_len).each do |t|
783
+ (0...batch_size).each do |b|
784
+ (0...input_size).each do |i|
785
+ transposed[b][t][i] = sequences[t][b][i]
786
+ end
787
+ end
788
+ end
789
+
790
+ transposed
791
+ end
792
+
793
+ def initialize_weights(method, input_dim, units)
794
+ case method
795
+ when :xavier
796
+ std = Math.sqrt(2.0 / (input_dim + units))
797
+ Array.new(units) { Array.new(input_dim) { rand_normal(0, std) } }
798
+ else
799
+ Array.new(units) { Array.new(input_dim, 0.0) }
800
+ end
801
+ end
802
+
803
+ def rand_normal(mean, std_dev)
804
+ u1 = rand
805
+ u2 = rand
806
+ z0 = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math::PI * u2)
807
+ z0 * std_dev + mean
808
+ end
809
+
810
+ def multiply_matrix_vector(matrix, vector)
811
+ result = Array.new(matrix.length, 0.0)
812
+ matrix.each_with_index do |row, i|
813
+ result[i] = dot_product(row, vector)
814
+ end
815
+ result
816
+ end
817
+
818
+ def dot_product(a, b)
819
+ sum = 0.0
820
+ (0...a.length).each { |i| sum += a[i] * b[i] }
821
+ sum
822
+ end
823
+
824
+ def add_vectors(a, b)
825
+ (0...a.length).map { |i| a[i] + b[i] }
826
+ end
827
+
828
+ def multiply_elementwise(a, b)
829
+ (0...a.length).map { |i| a[i] * b[i] }
830
+ end
831
+ end
832
+
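# Usage sketch (editorial, not part of the packaged source): forward pass over
# a single sequence of 2 timesteps with 3 features each. One hidden-state
# vector of length units is returned per timestep; backward is a pass-through stub.
lstm = GRNEXUSLayer::LSTMLayer.new(units: 4, input_size: 3)
sequence = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
hidden_states = lstm.forward(sequence)   # => 2 vectors of length 4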
833
+ class SoftmaxLayer < Layer
834
+ def initialize
835
+ super()
836
+ @trainable = false
837
+ end
838
+
839
+ def forward(input)
840
+ batch_case = input[0].is_a?(Array) && input[0][0].is_a?(Numeric)
841
+ input_tensor = batch_case ? input : [input]
842
+
843
+ output_batch = input_tensor.map { |x| compute_softmax(x) }
844
+ batch_case ? output_batch : output_batch.first
845
+ end
846
+
847
+ def backward(gradient, learning_rate = nil)
848
+ gradient
849
+ end
850
+
851
+ private
852
+
853
+ def compute_softmax(x)
854
+ max_val = x.max
855
+ exps = x.map { |val| Math.exp(val - max_val) }
856
+ sum_exps = exps.sum
857
+ exps.map { |exp| exp / sum_exps }
858
+ end
859
+ end
860
+
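# Usage sketch (editorial, not part of the packaged source). The maximum value
# is subtracted before exponentiation for numerical stability, and the result
# is a probability vector summing to 1.0.
softmax = GRNEXUSLayer::SoftmaxLayer.new
probs = softmax.forward([2.0, 1.0, 0.1])   # => approximately [0.66, 0.24, 0.10]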
861
+ class GRULayer < Layer
862
+ def initialize(units:, input_size:)
863
+ super()
864
+ @units = units
865
+ @input_size = input_size
866
+ @hidden_size = units
867
+
868
+ @wr = initialize_weights(:xavier, @input_size, @units)
869
+ @ur = initialize_weights(:xavier, @hidden_size, @units)
870
+ @br = Array.new(@units, 0.0)
871
+
872
+ @wz = initialize_weights(:xavier, @input_size, @units)
873
+ @uz = initialize_weights(:xavier, @hidden_size, @units)
874
+ @bz = Array.new(@units, 0.0)
875
+
876
+ @wh = initialize_weights(:xavier, @input_size, @units)
877
+ @uh = initialize_weights(:xavier, @hidden_size, @units)
878
+ @bh = Array.new(@units, 0.0)
879
+ end
880
+
881
+ def forward(input_sequence)
882
+ batch_case = input_sequence[0][0].is_a?(Array) && input_sequence[0][0][0].is_a?(Numeric)
883
+ sequences = batch_case ? transpose_batch_sequences(input_sequence) : [input_sequence]
884
+
885
+ outputs_batch = sequences.map do |single_sequence|
886
+ hidden_state = Array.new(@units, 0.0)
887
+ outputs = []
888
+
889
+ single_sequence.each do |input_t|
890
+ hidden_state = gru_step(input_t, hidden_state)
891
+ outputs << hidden_state.dup
892
+ end
893
+
894
+ outputs
895
+ end
896
+
897
+ if batch_case
898
+ transpose_batch_sequences(outputs_batch)
899
+ else
900
+ outputs_batch.first
901
+ end
902
+ end
903
+
904
+ def backward(gradient, learning_rate = nil)
905
+ gradient
906
+ end
907
+
908
+ private
909
+
910
+ def gru_step(input_t, prev_hidden)
911
+ r_input = add_vectors(multiply_matrix_vector(@wr, input_t), multiply_matrix_vector(@ur, prev_hidden))
912
+ r_input = add_vectors(r_input, @br)
913
+ r_gate = GRNEXUSActivations::Sigmoid.new.call(r_input)
914
+
915
+ z_input = add_vectors(multiply_matrix_vector(@wz, input_t), multiply_matrix_vector(@uz, prev_hidden))
916
+ z_input = add_vectors(z_input, @bz)
917
+ z_gate = GRNEXUSActivations::Sigmoid.new.call(z_input)
918
+
919
+ rh_hidden = multiply_elementwise(r_gate, prev_hidden)
920
+ h_input = add_vectors(multiply_matrix_vector(@wh, input_t), multiply_matrix_vector(@uh, rh_hidden))
921
+ h_input = add_vectors(h_input, @bh)
922
+ h_tilde = GRNEXUSActivations::Tanh.new.call(h_input)
923
+
924
+ one_minus_z = (0...@units).map { |i| 1.0 - z_gate[i] }
925
+ term1 = multiply_elementwise(one_minus_z, h_tilde)
926
+ term2 = multiply_elementwise(z_gate, prev_hidden)
927
+ new_hidden = add_vectors(term1, term2)
928
+
929
+ new_hidden
930
+ end
931
+
932
+ def transpose_batch_sequences(sequences)
933
+ seq_len = sequences.length
934
+ batch_size = sequences[0].length
935
+ input_size = sequences[0][0].length
936
+
937
+ transposed = Array.new(batch_size) { Array.new(seq_len) { Array.new(input_size) } }
938
+
939
+ (0...seq_len).each do |t|
940
+ (0...batch_size).each do |b|
941
+ (0...input_size).each do |i|
942
+ transposed[b][t][i] = sequences[t][b][i]
943
+ end
944
+ end
945
+ end
946
+
947
+ transposed
948
+ end
949
+
950
+ def initialize_weights(method, input_dim, units)
951
+ case method
952
+ when :xavier
953
+ std = Math.sqrt(2.0 / (input_dim + units))
954
+ Array.new(units) { Array.new(input_dim) { rand_normal(0, std) } }
955
+ else
956
+ Array.new(units) { Array.new(input_dim, 0.0) }
957
+ end
958
+ end
959
+
960
+ def rand_normal(mean, std_dev)
961
+ u1 = rand
962
+ u2 = rand
963
+ z0 = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math::PI * u2)
964
+ z0 * std_dev + mean
965
+ end
966
+
967
+ def multiply_matrix_vector(matrix, vector)
968
+ result = Array.new(matrix.length, 0.0)
969
+ matrix.each_with_index do |row, i|
970
+ result[i] = dot_product(row, vector)
971
+ end
972
+ result
973
+ end
974
+
975
+ def dot_product(a, b)
976
+ sum = 0.0
977
+ (0...a.length).each { |i| sum += a[i] * b[i] }
978
+ sum
979
+ end
980
+
981
+ def add_vectors(a, b)
982
+ (0...a.length).map { |i| a[i] + b[i] }
983
+ end
984
+
985
+ def multiply_elementwise(a, b)
986
+ (0...a.length).map { |i| a[i] * b[i] }
987
+ end
988
+ end
989
+
990
+ class EmbeddingLayer < Layer
991
+ attr_accessor :weights
992
+
993
+ def initialize(vocab_size:, embedding_dim:, padding_idx: nil)
994
+ super()
995
+ @vocab_size = vocab_size
996
+ @embedding_dim = embedding_dim
997
+ @padding_idx = padding_idx
998
+ @trainable = false # Set to false for now (no gradient update)
999
+
1000
+ # Xavier initialization
1001
+ limit = Math.sqrt(6.0 / (@vocab_size + @embedding_dim))
1002
+ @weights = Array.new(@vocab_size) { Array.new(@embedding_dim) { rand(-limit..limit) } }
1003
+ end
1004
+
1005
+ def forward(input)
1006
+ # Check if input is a batch of sequences
1007
+ batch_case = input[0].is_a?(Array)
1008
+ input_tensor = batch_case ? input : [input]
1009
+
1010
+ # Convert sequences to embeddings
1011
+ output_batch = input_tensor.map do |sequence|
1012
+ sequence.map do |idx|
1013
+ idx_int = idx.to_i
1014
+ # Clamp index to valid range
1015
+ idx_int = [[idx_int, 0].max, @vocab_size - 1].min
1016
+ @weights[idx_int].dup
1017
+ end
1018
+ end
1019
+
1020
+ batch_case ? output_batch : output_batch.first
1021
+ end
1022
+
1023
+ def backward(gradient, learning_rate = nil)
1024
+ # For now, pass gradient through
1025
+ # Full backprop through embeddings would require tracking indices
1026
+ gradient
1027
+ end
1028
+ end
1029
+
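# Usage sketch (editorial, not part of the packaged source): looking up
# embedding vectors for a sequence of token indices. Out-of-range indices are
# clamped into [0, vocab_size - 1] rather than raising.
embed = GRNEXUSLayer::EmbeddingLayer.new(vocab_size: 100, embedding_dim: 8)
vectors = embed.forward([4, 17, 3])   # => three vectors of length 8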
1030
+ class FlattenLayer < Layer
1031
+ def initialize
1032
+ super()
1033
+ @trainable = false
1034
+ @input_shape = nil
1035
+ end
1036
+
1037
+ def forward(input)
1038
+ # Handle batch of sequences (3D: batch x sequence x features)
1039
+ if input[0].is_a?(Array) && input[0][0].is_a?(Array) && input[0][0][0].is_a?(Numeric)
1040
+ @input_shape = [input.length, input[0].length, input[0][0].length]
1041
+ # Flatten each sample in the batch
1042
+ input.map { |sample| sample.flatten }
1043
+ # Handle batch of vectors (2D: batch x features)
1044
+ elsif input[0].is_a?(Array) && input[0][0].is_a?(Numeric)
1045
+ @input_shape = [input.length, input[0].length]
1046
+ input
1047
+ # Handle single sequence (2D: sequence x features)
1048
+ elsif input[0].is_a?(Array)
1049
+ @input_shape = [input.length, input[0].length]
1050
+ [input.flatten]
1051
+ # Handle single vector (1D: features)
1052
+ else
1053
+ @input_shape = [input.length]
1054
+ [input]
1055
+ end
1056
+ end
1057
+
1058
+ def backward(gradient, learning_rate = nil)
1059
+ gradient
1060
+ end
1061
+ end
1062
+
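# Usage sketch (editorial, not part of the packaged source): flattening a batch
# of 2x2 samples into a batch of length-4 vectors, e.g. to feed a DenseLayer.
flat = GRNEXUSLayer::FlattenLayer.new
flat.forward([[[1.0, 2.0], [3.0, 4.0]],
              [[5.0, 6.0], [7.0, 8.0]]])   # => [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]]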
1063
+ class ReshapeLayer < Layer
1064
+ def initialize(shape)
1065
+ super()
1066
+ @target_shape = shape
1067
+ @trainable = false
1068
+ end
1069
+
1070
+ def forward(input)
1071
+ @original_shape = infer_shape(input)
+ reshape_tensor(input, @target_shape)
1072
+ end
1073
+
1074
+ def backward(gradient, learning_rate = nil)
1075
+ reshape_tensor(gradient, @original_shape)
1076
+ end
1077
+
1078
+ private
+
+ # Recover the nested-array shape of the original input
+ def infer_shape(tensor)
+ shape = []
+ while tensor.is_a?(Array)
+ shape << tensor.length
+ tensor = tensor[0]
+ end
+ shape
+ end
1079
+
1080
+ def reshape_tensor(tensor, new_shape)
1081
+ flattened = tensor.flatten
1082
+ build_tensor(flattened, new_shape)
1083
+ end
1084
+
1085
+ def build_tensor(flat_array, shape)
1086
+ if shape.length == 1
1087
+ flat_array
1088
+ elsif shape.length == 2
1089
+ rows, cols = shape
1090
+ Array.new(rows) { |i| Array.new(cols) { |j| flat_array[i * cols + j] } }
1091
+ else
1092
+ size = shape[0]
1093
+ remaining_shape = shape[1..-1]
1094
+ remaining_size = remaining_shape.reduce(:*)
1095
+
1096
+ Array.new(size) do |i|
1097
+ sub_array = flat_array[i * remaining_size, remaining_size]
1098
+ build_tensor(sub_array, remaining_shape)
1099
+ end
1100
+ end
1101
+ end
1102
+ end
1103
+ end
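# Usage sketch (editorial, not part of the packaged source): wiring a few of
# the layers above into a small feed-forward stack by hand. Whether the gem
# also ships a higher-level model/sequential container is not shown in this
# file, so the layers are simply chained with reduce. :Tanh and :Sigmoid are
# assumed to resolve to the activation classes referenced earlier in this file.
layers = [
  GRNEXUSLayer::DenseLayer.new(units: 4, input_dim: 3, activation: :Tanh),
  GRNEXUSLayer::DropoutLayer.new(rate: 0.2),
  GRNEXUSLayer::DenseLayer.new(units: 2, input_dim: 4, activation: :Sigmoid)
]
output = layers.reduce([0.5, -1.0, 2.0]) { |x, layer| layer.forward(x) }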