ruby-dnn 0.10.4 → 0.12.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +1 -2
  3. data/README.md +33 -6
  4. data/examples/cifar100_example.rb +3 -3
  5. data/examples/cifar10_example.rb +3 -3
  6. data/examples/dcgan/dcgan.rb +112 -0
  7. data/examples/dcgan/imgen.rb +20 -0
  8. data/examples/dcgan/train.rb +41 -0
  9. data/examples/iris_example.rb +3 -6
  10. data/examples/mnist_conv2d_example.rb +5 -5
  11. data/examples/mnist_define_by_run.rb +52 -0
  12. data/examples/mnist_example.rb +3 -3
  13. data/examples/mnist_lstm_example.rb +3 -3
  14. data/examples/xor_example.rb +4 -5
  15. data/ext/rb_stb_image/rb_stb_image.c +103 -0
  16. data/lib/dnn.rb +10 -10
  17. data/lib/dnn/cifar10.rb +1 -1
  18. data/lib/dnn/cifar100.rb +1 -1
  19. data/lib/dnn/core/activations.rb +21 -22
  20. data/lib/dnn/core/cnn_layers.rb +94 -111
  21. data/lib/dnn/core/embedding.rb +30 -9
  22. data/lib/dnn/core/initializers.rb +31 -21
  23. data/lib/dnn/core/iterator.rb +52 -0
  24. data/lib/dnn/core/layers.rb +99 -66
  25. data/lib/dnn/core/link.rb +24 -0
  26. data/lib/dnn/core/losses.rb +69 -59
  27. data/lib/dnn/core/merge_layers.rb +71 -0
  28. data/lib/dnn/core/models.rb +393 -0
  29. data/lib/dnn/core/normalizations.rb +27 -14
  30. data/lib/dnn/core/optimizers.rb +212 -134
  31. data/lib/dnn/core/param.rb +8 -6
  32. data/lib/dnn/core/regularizers.rb +10 -7
  33. data/lib/dnn/core/rnn_layers.rb +78 -85
  34. data/lib/dnn/core/utils.rb +6 -3
  35. data/lib/dnn/downloader.rb +3 -3
  36. data/lib/dnn/fashion-mnist.rb +89 -0
  37. data/lib/dnn/image.rb +57 -18
  38. data/lib/dnn/iris.rb +1 -3
  39. data/lib/dnn/mnist.rb +38 -34
  40. data/lib/dnn/version.rb +1 -1
  41. data/third_party/stb_image.h +16 -4
  42. data/third_party/stb_image_resize.h +2630 -0
  43. data/third_party/stb_image_write.h +4 -7
  44. metadata +12 -4
  45. data/lib/dnn/core/dataset.rb +0 -34
  46. data/lib/dnn/core/model.rb +0 -440
data/lib/dnn/core/normalizations.rb
@@ -2,18 +2,19 @@ module DNN
   module Layers
 
     class BatchNormalization < HasParamLayer
-      # @return [Integer] The axis to normalization.
+      attr_reader :gamma
+      attr_reader :beta
+      attr_reader :running_mean
+      attr_reader :running_var
       attr_reader :axis
-      # @return [Float] Exponential moving average of mean and variance.
       attr_accessor :momentum
-      # @return [Float] Value to avoid division by zero.
       attr_accessor :eps
 
       def self.from_hash(hash)
         self.new(axis: hash[:axis], momentum: hash[:momentum])
       end
 
-      # @param [integer] axis The axis to normalization.
+      # @param [Integer] axis The axis to normalization.
       # @param [Float] momentum Exponential moving average of mean and variance.
       # @param [Float] eps Value to avoid division by zero.
       def initialize(axis: 0, momentum: 0.9, eps: 1e-7)
@@ -23,27 +24,35 @@ module DNN
         @eps = eps
       end
 
+      def call(input)
+        x, prev_link, learning_phase = *input
+        build(x.shape[1..-1]) unless built?
+        y = forward(x, learning_phase)
+        link = Link.new(prev_link, self)
+        [y, link, learning_phase]
+      end
+
       def build(input_shape)
         super
-        @params[:gamma] = @gamma = Param.new(Xumo::SFloat.ones(*output_shape), 0)
-        @params[:beta] = @beta = Param.new(Xumo::SFloat.zeros(*output_shape), 0)
-        @params[:running_mean] = @running_mean = Param.new(Xumo::SFloat.zeros(*output_shape))
-        @params[:running_var] = @running_var = Param.new(Xumo::SFloat.zeros(*output_shape))
+        @gamma = Param.new(Xumo::SFloat.ones(*output_shape), 0)
+        @beta = Param.new(Xumo::SFloat.zeros(*output_shape), 0)
+        @running_mean = Param.new(Xumo::SFloat.zeros(*output_shape))
+        @running_var = Param.new(Xumo::SFloat.zeros(*output_shape))
       end
 
-      def forward(x)
+      def forward(x, learning_phase)
         if learning_phase
           mean = x.mean(axis: @axis, keepdims: true)
           @xc = x - mean
-          var = (@xc**2).mean(axis: @axis, keepdims: true)
-          @std = NMath.sqrt(var + @eps)
+          var = (@xc ** 2).mean(axis: @axis, keepdims: true)
+          @std = Xumo::NMath.sqrt(var + @eps)
           xn = @xc / @std
           @xn = xn
           @running_mean.data = @momentum * @running_mean.data + (1 - @momentum) * mean
           @running_var.data = @momentum * @running_var.data + (1 - @momentum) * var
         else
           xc = x - @running_mean.data
-          xn = xc / NMath.sqrt(@running_var.data + @eps)
+          xn = xc / Xumo::NMath.sqrt(@running_var.data + @eps)
         end
         @gamma.data * xn + @beta.data
       end
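For orientation, the reworked forward pass above is standard batch normalization: in the learning phase it normalizes with the batch mean and variance and folds them into running_mean / running_var via the momentum term; at inference it reuses the running statistics. A minimal standalone sketch of the same training-phase math in plain Numo (ruby-dnn's Xumo is an alias over Numo/Cumo; the variable names here are illustrative, not part of the gem):

```ruby
require "numo/narray"

x     = Numo::SFloat.new(4, 3).rand   # hypothetical (batch, features) input
gamma = Numo::SFloat.ones(1, 3)
beta  = Numo::SFloat.zeros(1, 3)
eps   = 1e-7

# Learning phase: normalize with the batch statistics.
mean = x.mean(axis: 0, keepdims: true)
xc   = x - mean
var  = (xc ** 2).mean(axis: 0, keepdims: true)
std  = Numo::NMath.sqrt(var + eps)
y    = gamma * (xc / std) + beta

# Inference would instead use the running_mean / running_var accumulated
# with the momentum update shown in the diff.
```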
@@ -56,7 +65,7 @@ module DNN
         end
         dxn = @gamma.data * dy
         dxc = dxn / @std
-        dstd = -((dxn * @xc) / (@std**2)).sum(axis: @axis, keepdims: true)
+        dstd = -((dxn * @xc) / (@std ** 2)).sum(axis: @axis, keepdims: true)
         dvar = 0.5 * dstd / @std
         dxc += (2.0 / batch_size) * @xc * dvar
         dmean = dxc.sum(axis: @axis, keepdims: true)
@@ -64,7 +73,11 @@ module DNN
       end
 
       def to_hash
-        super({axis: @axis, momentum: @momentum, eps: @eps})
+        super(axis: @axis, momentum: @momentum, eps: @eps)
+      end
+
+      def get_params
+        { gamma: @gamma, beta: @beta, running_mean: @running_mean, running_var: @running_var }
       end
     end
 
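As a usage note, the serialization hooks and the new get_params accessor shown above fit together roughly as follows. This is a hedged sketch: it assumes the Layer#to_hash plumbing in data/lib/dnn/core/layers.rb merges in the class name, the same way Optimizer#to_hash does below.

```ruby
require "dnn"

bn = DNN::Layers::BatchNormalization.new(axis: 0, momentum: 0.9)

hash     = bn.to_hash   # plain Hash carrying :axis, :momentum, :eps
restored = DNN::Layers::BatchNormalization.from_hash(hash)

# Once the layer has been built inside a model, optimizers no longer read a
# @params registry; they ask the layer for its Param objects directly:
# bn.get_params  # => { gamma: ..., beta: ..., running_mean: ..., running_var: ... }
```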
data/lib/dnn/core/optimizers.rb
@@ -3,172 +3,191 @@ module DNN
 
     # Super class of all optimizer classes.
     class Optimizer
-      # @return [Float] Return the Learning rate.
-      attr_accessor :learning_rate
+      attr_accessor :clip_norm
 
-      def initialize(learning_rate)
-        @learning_rate = learning_rate
+      # @param [Float | NilClass] clip_norm Gradient clip norm.
+      def initialize(clip_norm: nil)
+        @clip_norm = clip_norm
       end
 
-      # Update layers has param.
+      # Update layers has params.
       def update(layers)
-        target_params = layers.select { |layer| layer.is_a?(HasParamLayer) && layer.trainable }
-                              .map { |layer| layer.params.values }.flatten
+        target_params = layers.select { |layer| layer.is_a?(Layers::HasParamLayer) && layer.trainable }
+                              .map { |layer| layer.get_params.values }.flatten.compact
                              .select { |param| param.grad }
+        clip_grads(target_params) if @clip_norm
+        update_params(target_params)
         target_params.each do |param|
-          update_param(param)
-          param.grad = 0
+          param.grad = Xumo::SFloat.zeros(*param.data.shape)
         end
       end
 
       def to_hash(merge_hash = nil)
-        hash = {class: self.class.name, learning_rate: @learning_rate}
+        hash = { class: self.class.name, clip_norm: @clip_norm }
         hash.merge!(merge_hash) if merge_hash
         hash
       end
 
-      # Update param.
-      # Classes that inherit from this class must implement this method.
-      private def update_param(param)
-        raise NotImplementedError.new("Class '#{self.class.name}' has implement method 'update_param'")
+      # Update params.
+      private def update_params(params)
+        raise NotImplementedError.new("Class '#{self.class.name}' has implement method 'update_params'")
+      end
+
+      private def clip_grads(params)
+        norm = Math.sqrt(params.reduce(0) { |sum, param| sum + (param.grad == 0 ? 0 : (param.grad ** 2).sum) })
+        return if norm <= @clip_norm
+        rate = @clip_norm / (norm + 1e-7)
+        params.each do |param|
+          param.grad *= rate
+        end
       end
     end
 
 
     class SGD < Optimizer
-      # @return [Float] Return the momentum coefficient.
+      attr_accessor :lr
       attr_accessor :momentum
 
       def self.from_hash(hash)
-        self.new(hash[:learning_rate], momentum: hash[:momentum])
+        self.new(hash[:lr], momentum: hash[:momentum], clip_norm: hash[:clip_norm])
       end
 
-      # @param [Float] learning_rate Learning rate.
-      # @param [Float] momentum momentum coefficient.
-      def initialize(learning_rate = 0.01, momentum: 0)
-        super(learning_rate)
+      # @param [Float] lr Learning rate.
+      # @param [Float] momentum Momentum coefficient.
+      def initialize(lr = 0.01, momentum: 0, clip_norm: nil)
+        super(clip_norm: clip_norm)
+        @lr = lr
         @momentum = momentum
         @v = {}
       end
 
       def to_hash
-        super(momentum: @momentum)
-      end
-
-      private def update_param(param)
-        amount = param.grad * @learning_rate
-        if @momentum > 0
-          @v[param] ||= 0
-          amount += @momentum * @v[param]
-          @v[param] = amount
+        super(lr: @lr, momentum: @momentum)
+      end
+
+      private def update_params(params)
+        params.each do |param|
+          amount = param.grad * @lr
+          if @momentum > 0
+            @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+            amount += @momentum * @v[param]
+            @v[param] = amount
+          end
+          param.data -= amount
         end
-        param.data -= amount
       end
     end
 
 
     class Nesterov < Optimizer
+      attr_accessor :lr
       attr_accessor :momentum
-
+
       def self.from_hash(hash)
-        self.new(hash[:learning_rate], momentum: hash[:momentum])
+        self.new(hash[:lr], momentum: hash[:momentum], clip_norm: hash[:clip_norm])
       end
 
-      # @param [Float] learning_rate Learning rate.
-      # @param [Float] momentum momentum coefficient.
-      def initialize(learning_rate = 0.01, momentum: 0.9)
-        super(learning_rate)
+      # @param [Float] lr Learning rate.
+      # @param [Float] momentum Momentum coefficient.
+      def initialize(lr = 0.01, momentum: 0.9, clip_norm: nil)
+        super(clip_norm: clip_norm)
+        @lr = lr
         @momentum = momentum
         @v = {}
       end
 
       def to_hash
-        super(momentum: @momentum)
+        super(lr: @lr, momentum: @momentum)
       end
-
-      private def update_param(param)
-        @v[param] ||= 0
-        amount = param.grad * @learning_rate
-        @v[param] = @v[param] * @momentum - amount
-        param.data = (param.data + @momentum**2 * @v[param]) - (1 + @momentum) * amount
+
+      private def update_params(params)
+        params.each do |param|
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          amount = param.grad * @lr
+          @v[param] = @v[param] * @momentum - amount
+          param.data = (param.data + @momentum ** 2 * @v[param]) - (1 + @momentum) * amount
+        end
       end
     end
-
-
+
+
     class AdaGrad < Optimizer
-      # @return [Float] Return the eps value.
+      attr_accessor :lr
       attr_accessor :eps
 
-      # @param [Float] learning_rate Learning rate.
+      def self.from_hash(hash)
+        self.new(hash[:lr], eps: hash[:eps], clip_norm: hash[:clip_norm])
+      end
+
+      # @param [Float] lr Learning rate.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(learning_rate = 0.01, eps: 1e-7)
-        super(learning_rate)
+      def initialize(lr = 0.01, eps: 1e-7, clip_norm: nil)
+        super(clip_norm: clip_norm)
+        @lr = lr
         @eps = eps
         @g = {}
       end
 
-      def self.from_hash(hash)
-        self.new(hash[:learning_rate], eps: hash[:eps])
-      end
-
-      private def update_param(param)
-        @g[param] ||= 0
-        @g[param] += param.grad**2
-        param.data -= (@learning_rate / NMath.sqrt(@g[param] + @eps)) * param.grad
+      private def update_params(params)
+        params.each do |param|
+          @g[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @g[param] += param.grad ** 2
+          param.data -= (@lr / Xumo::NMath.sqrt(@g[param] + @eps)) * param.grad
+        end
       end
 
       def to_hash
-        super(eps: @eps)
+        super(lr: @lr, eps: @eps)
       end
     end
-
+
 
     class RMSProp < Optimizer
-      # @return [Float] Return the alpha value.
+      attr_accessor :lr
       attr_accessor :alpha
-      # @return [Float] Return the eps value.
       attr_accessor :eps
 
       def self.from_hash(hash)
-        self.new(hash[:learning_rate], alpha: hash[:alpha], eps: hash[:eps])
+        self.new(hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
       end
 
-      # @param [Float] learning_rate Learning rate.
+      # @param [Float] lr Learning rate.
       # @param [Float] alpha Moving average index of past slopes.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(learning_rate = 0.001, alpha: 0.9, eps: 1e-7)
-        super(learning_rate)
+      def initialize(lr = 0.001, alpha: 0.9, eps: 1e-7, clip_norm: nil)
+        super(clip_norm: clip_norm)
+        @lr = lr
         @alpha = alpha
         @eps = eps
         @g = {}
       end
 
       def to_hash
-        super(alpha: @alpha, eps: @eps)
+        super(lr: @lr, alpha: @alpha, eps: @eps)
       end
 
-      private def update_param(param)
-        @g[param] ||= 0
-        @g[param] = @alpha * @g[param] + (1 - @alpha) * param.grad**2
-        param.data -= (@learning_rate / NMath.sqrt(@g[param] + @eps)) * param.grad
+      private def update_params(params)
+        params.each do |param|
+          @g[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @g[param] = @alpha * @g[param] + (1 - @alpha) * param.grad ** 2
+          param.data -= (@lr / Xumo::NMath.sqrt(@g[param] + @eps)) * param.grad
+        end
       end
     end
 
 
     class AdaDelta < Optimizer
-      # @return [Float] Return the rho value.
       attr_accessor :rho
-      # @return [Float] Return the eps value.
       attr_accessor :eps
 
       def self.from_hash(hash)
-        self.new(rho: hash[:rho], eps: hash[:eps])
+        self.new(rho: hash[:rho], eps: hash[:eps], clip_norm: hash[:clip_norm])
       end
 
       # @param [Float] rho Moving average index of past slopes.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(rho: 0.95, eps: 1e-6)
-        super(nil)
+      def initialize(rho: 0.95, eps: 1e-6, clip_norm: nil)
+        super(clip_norm: clip_norm)
         @rho = rho
         @eps = eps
         @h = {}
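Taken together, the changes above rename the learning rate to an `lr` accessor, add a `clip_norm:` option to every optimizer, and replace the per-parameter `update_param` hook with a batched `update_params`. A hedged construction sketch, assuming the DNN::Optimizers namespace used elsewhere in the gem:

```ruby
require "dnn"

# First positional argument is the learning rate; clip_norm is optional.
opt = DNN::Optimizers::SGD.new(0.01, momentum: 0.9, clip_norm: 5.0)

# With clip_norm set, Optimizer#update rescales all gradients before
# update_params runs, so their global L2 norm never exceeds clip_norm:
#   norm = Math.sqrt(sum over params of (param.grad ** 2).sum)
#   param.grad *= clip_norm / (norm + 1e-7)   if norm > clip_norm
```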
@@ -179,103 +198,162 @@ module DNN
         super(rho: @rho, eps: @eps)
       end
 
-      private def update_param(param)
-        @h[param] ||= Xumo::SFloat.zeros(*param.data.shape)
-        @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
-        @h[param] = @rho * @h[param] + (1 - @rho) * param.grad**2
-        v = (NMath.sqrt(@s[param] + @eps) / NMath.sqrt(@h[param] + @eps)) * param.grad
-        @s[param] = @rho * @s[param] + (1 - @rho) * v**2
-        param.data -= v
+      private def update_params(params)
+        params.each do |param|
+          @h[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @h[param] = @rho * @h[param] + (1 - @rho) * param.grad ** 2
+          v = (Xumo::NMath.sqrt(@s[param] + @eps) / Xumo::NMath.sqrt(@h[param] + @eps)) * param.grad
+          @s[param] = @rho * @s[param] + (1 - @rho) * v ** 2
+          param.data -= v
+        end
       end
     end
 
 
-    class Adam < Optimizer
-      # @return [Float] Return the alpha value.
+    class RMSPropGraves < Optimizer
+      attr_accessor :lr
       attr_accessor :alpha
-      # @return [Float] Return the beta1 value.
-      attr_accessor :beta1
-      # @return [Float] Return the beta2 value.
-      attr_accessor :beta2
-      # @return [Float] Return the eps value.
       attr_accessor :eps
-
+
       def self.from_hash(hash)
-        self.new(alpha: hash[:alpha], beta1: hash[:beta1], beta2: hash[:beta2], eps: hash[:eps])
+        self.new(hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
       end
 
-      # @param [Float] alpha Value used to calculate learning rate.
-      # @param [Float] beta1 Moving average index of beta1.
-      # @param [Float] beta2 Moving average index of beta2.
+      # @param [Float] lr Learning rate.
+      # @param [Float] alpha Moving average index of past slopes.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(alpha: 0.001, beta1: 0.9, beta2: 0.999, eps: 1e-7)
-        super(nil)
+      def initialize(lr = 0.0001, alpha: 0.95, eps: 0.0001, clip_norm: nil)
+        super(clip_norm: clip_norm)
+        @lr = lr
         @alpha = alpha
-        @beta1 = beta1
-        @beta2 = beta2
         @eps = eps
-        @iter = 0
         @m = {}
         @v = {}
       end
 
-      def update(layers)
-        @iter += 1
-        learning_rate = @alpha * Math.sqrt(1 - @beta2**@iter) / (1 - @beta1**@iter)
-        target_params = layers.select { |layer| layer.is_a?(HasParamLayer) && layer.trainable }
-                              .map { |layer| layer.params.values }.flatten
-                              .select { |param| param.grad }
-        target_params.each do |param|
-          update_param(param, learning_rate)
-          param.grad = 0
-        end
-      end
-
       def to_hash
-        super(alpha: @alpha, beta1: @beta1, beta2: @beta2, eps: @eps)
+        super(lr: @lr, alpha: @alpha, eps: @eps)
       end
 
-      private def update_param(param, learning_rate)
-        @m[param] ||= 0
-        @v[param] ||= 0
-        @m[param] += (1 - @beta1) * (param.grad - @m[param])
-        @v[param] += (1 - @beta2) * (param.grad**2 - @v[param])
-        param.data -= learning_rate * @m[param] / NMath.sqrt(@v[param] + @eps)
+      private def update_params(params)
+        params.each do |param|
+          @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @m[param] = @alpha * @m[param] + (1 - @alpha) * param.grad
+          @v[param] = @alpha * @v[param] + (1 - @alpha) * param.grad ** 2
+          param.data -= (@lr / Xumo::NMath.sqrt(@v[param] - @m[param] ** 2 + @eps)) * param.grad
+        end
       end
     end
 
 
-    class RMSPropGraves < Optimizer
-      # @return [Float] Return the alpha value.
+    class Adam < Optimizer
       attr_accessor :alpha
-      # @return [Float] Return the eps value.
+      attr_accessor :beta1
+      attr_accessor :beta2
       attr_accessor :eps
-
+      attr_reader :amsgrad
+
       def self.from_hash(hash)
-        self.new(hash[:learning_rate], alpha: hash[:alpha], eps: hash[:eps])
+        self.new(alpha: hash[:alpha], beta1: hash[:beta1], beta2: hash[:beta2],
+                 eps: hash[:eps], amsgrad: hash[:amsgrad], clip_norm: hash[:clip_norm])
       end
 
-      # @param [Float] learning_rate Learning rate.
-      # @param [Float] alpha Moving average index of past slopes.
+      # @param [Float] alpha Value used to calculate learning rate.
+      # @param [Float] beta1 Moving average index of beta1.
+      # @param [Float] beta2 Moving average index of beta2.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(learning_rate = 0.0001, alpha: 0.95, eps: 0.0001)
-        super(learning_rate)
+      # @param [Boolean] amsgrad Setting the true enable amsgrad.
+      def initialize(alpha: 0.001, beta1: 0.9, beta2: 0.999, eps: 1e-7, amsgrad: false, clip_norm: nil)
+        super(clip_norm: clip_norm)
        @alpha = alpha
+        @beta1 = beta1
+        @beta2 = beta2
        @eps = eps
+        @amsgrad = amsgrad
+        @t = 0
        @m = {}
        @v = {}
+        @s = {} if amsgrad
      end
 
      def to_hash
-        super(alpha: @alpha, eps: @eps)
+        {
+          class: self.class.name, alpha: @alpha, beta1: @beta1, beta2: @beta2,
+          eps: @eps, amsgrad: @amsgrad, clip_norm: @clip_norm
+        }
+      end
+
+      private def update_params(params)
+        @t += 1
+        lr = @alpha * Math.sqrt(1 - @beta2 ** @t) / (1 - @beta1 ** @t)
+        params.each do |param|
+          @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @m[param] += (1 - @beta1) * (param.grad - @m[param])
+          @v[param] += (1 - @beta2) * (param.grad ** 2 - @v[param])
+          if @amsgrad
+            @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+            @s[param] = Xumo::SFloat.maximum(@s[param], @v[param])
+            param.data -= lr * @m[param] / Xumo::NMath.sqrt(@s[param] + @eps)
+          else
+            param.data -= lr * @m[param] / Xumo::NMath.sqrt(@v[param] + @eps)
+          end
+        end
+      end
+    end
+
+
+    class AdaBound < Adam
+      attr_accessor :final_lr
+      attr_accessor :gamma
+
+      def self.from_hash(hash)
+        self.new(alpha: hash[:alpha], beta1: hash[:beta1], beta2: hash[:beta2],
+                 final_lr: hash[:final_lr], gamma: hash[:gamma], eps: hash[:eps], amsgrad: hash[:amsgrad], clip_norm: hash[:clip_norm])
+      end
+
+      # @param [Float] final_lr Final learning rate.
+      # @param [Float] gamma Lower and upper range value.
+      def initialize(alpha: 0.001, beta1: 0.9, beta2: 0.999, final_lr: 0.1, gamma: 0.001, eps: 1e-7, amsgrad: false, clip_norm: nil)
+        super(alpha: alpha, beta1: beta1, beta2: beta2, eps: eps, amsgrad: amsgrad, clip_norm: clip_norm)
+        @final_lr = final_lr
+        @gamma = gamma
+      end
+
+      def to_hash
+        {
+          class: self.class.name, alpha: @alpha, beta1: @beta1, beta2: @beta2,
+          final_lr: @final_lr, gamma: @gamma, eps: @eps, amsgrad: amsgrad, clip_norm: @clip_norm
+        }
+      end
+
+      private def update_params(params)
+        @t += 1
+        lr = @alpha * Math.sqrt(1 - @beta2 ** @t) / (1 - @beta1 ** @t)
+        final_lr = @final_lr * lr / @alpha
+        lower_bound = final_lr * (1 - 1 / (@gamma * @t + 1))
+        upper_bound = final_lr * (1 + 1 / (@gamma * @t))
+        params.each do |param|
+          @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @m[param] += (1 - @beta1) * (param.grad - @m[param])
+          @v[param] += (1 - @beta2) * (param.grad ** 2 - @v[param])
+          if @amsgrad
+            @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+            @s[param] = Xumo::SFloat.maximum(@s[param], @v[param])
+            param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@s[param]) + @eps), lower_bound, upper_bound) * @m[param]
+          else
+            param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@v[param]) + @eps), lower_bound, upper_bound) * @m[param]
+          end
+        end
       end
 
-      private def update_param(param)
-        @m[param] ||= 0
-        @v[param] ||= 0
-        @m[param] = @alpha * @m[param] + (1 - @alpha) * param.grad
-        @v[param] = @alpha * @v[param] + (1 - @alpha) * param.grad**2
-        param.data -= (@learning_rate / NMath.sqrt(@v[param] - @m[param]**2 + @eps)) * param.grad
+      private def clip_lr(lr, lower_bound, upper_bound)
+        lr[lr < lower_bound] = lower_bound
+        lr[lr > upper_bound] = upper_bound
+        lr
       end
     end
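Finally, this second hunk reorders the Graves RMSProp variant, rewrites Adam around the new `update_params` hook with optional AMSGrad, and adds AdaBound on top of it. A hedged sketch of constructing and round-tripping the new optimizers (again assuming the DNN::Optimizers namespace):

```ruby
require "dnn"

adam     = DNN::Optimizers::Adam.new(alpha: 0.001, beta1: 0.9, beta2: 0.999, amsgrad: true)
adabound = DNN::Optimizers::AdaBound.new(alpha: 0.001, final_lr: 0.1, gamma: 0.001, clip_norm: 2.0)

# Both serialize to a plain Hash and can be rebuilt from it:
restored = DNN::Optimizers::AdaBound.from_hash(adabound.to_hash)
```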