ruby-dnn 0.10.1 → 0.10.2
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/examples/cifar100_example.rb +71 -71
- data/examples/cifar10_example.rb +68 -68
- data/examples/iris_example.rb +34 -34
- data/examples/mnist_conv2d_example.rb +50 -50
- data/examples/mnist_example.rb +39 -39
- data/examples/mnist_lstm_example.rb +36 -36
- data/examples/xor_example.rb +24 -24
- data/lib/dnn.rb +27 -26
- data/lib/dnn/cifar10.rb +51 -51
- data/lib/dnn/cifar100.rb +49 -49
- data/lib/dnn/core/activations.rb +148 -148
- data/lib/dnn/core/cnn_layers.rb +464 -464
- data/lib/dnn/core/dataset.rb +34 -34
- data/lib/dnn/core/embedding.rb +56 -0
- data/lib/dnn/core/error.rb +5 -5
- data/lib/dnn/core/initializers.rb +126 -126
- data/lib/dnn/core/layers.rb +307 -307
- data/lib/dnn/core/losses.rb +175 -175
- data/lib/dnn/core/model.rb +461 -461
- data/lib/dnn/core/normalizations.rb +72 -72
- data/lib/dnn/core/optimizers.rb +283 -283
- data/lib/dnn/core/param.rb +9 -9
- data/lib/dnn/core/regularizers.rb +106 -106
- data/lib/dnn/core/rnn_layers.rb +464 -464
- data/lib/dnn/core/utils.rb +34 -34
- data/lib/dnn/downloader.rb +50 -50
- data/lib/dnn/image.rb +41 -41
- data/lib/dnn/iris.rb +60 -60
- data/lib/dnn/mnist.rb +84 -84
- data/lib/dnn/version.rb +3 -3
- metadata +2 -1
@@ -1,72 +1,72 @@

module DNN
  module Layers

    class BatchNormalization < HasParamLayer
      # @return [Integer] The axis to normalization.
      attr_reader :axis
      # @return [Float] Exponential moving average of mean and variance.
      attr_accessor :momentum
      # @return [Float] Value to avoid division by zero.
      attr_accessor :eps

      def self.from_hash(hash)
        self.new(axis: hash[:axis], momentum: hash[:momentum])
      end

      # @param [integer] axis The axis to normalization.
      # @param [Float] momentum Exponential moving average of mean and variance.
      # @param [Float] eps Value to avoid division by zero.
      def initialize(axis: 0, momentum: 0.9, eps: 1e-7)
        super()
        @axis = axis
        @momentum = momentum
        @eps = eps
      end

      def build(input_shape)
        super
        @params[:gamma] = @gamma = Param.new(Xumo::SFloat.ones(*output_shape), 0)
        @params[:beta] = @beta = Param.new(Xumo::SFloat.zeros(*output_shape), 0)
        @params[:running_mean] = @running_mean = Param.new(Xumo::SFloat.zeros(*output_shape))
        @params[:running_var] = @running_var = Param.new(Xumo::SFloat.zeros(*output_shape))
      end

      def forward(x)
        if learning_phase
          mean = x.mean(axis: @axis, keepdims: true)
          @xc = x - mean
          var = (@xc**2).mean(axis: @axis, keepdims: true)
          @std = NMath.sqrt(var + @eps)
          xn = @xc / @std
          @xn = xn
          @running_mean.data = @momentum * @running_mean.data + (1 - @momentum) * mean
          @running_var.data = @momentum * @running_var.data + (1 - @momentum) * var
        else
          xc = x - @running_mean.data
          xn = xc / NMath.sqrt(@running_var.data + @eps)
        end
        @gamma.data * xn + @beta.data
      end

      def backward(dy)
        batch_size = dy.shape[@axis]
        if @trainable
          @beta.grad = dy.sum(axis: @axis, keepdims: true)
          @gamma.grad = (@xn * dy).sum(axis: @axis, keepdims: true)
        end
        dxn = @gamma.data * dy
        dxc = dxn / @std
        dstd = -((dxn * @xc) / (@std**2)).sum(axis: @axis, keepdims: true)
        dvar = 0.5 * dstd / @std
        dxc += (2.0 / batch_size) * @xc * dvar
        dmean = dxc.sum(axis: @axis, keepdims: true)
        dxc - dmean / batch_size
      end

      def to_hash
        super({axis: @axis, momentum: @momentum, eps: @eps})
      end
    end

  end
end
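BatchNormalization#forward above standardizes the batch along @axis during training (subtract the batch mean, divide by the batch standard deviation, then scale by gamma and shift by beta) and keeps exponential running averages of the statistics for inference. The sketch below reproduces only that train-time arithmetic on plain Numo arrays; it assumes the numo-narray gem, and the sample data and variable names are illustrative rather than part of ruby-dnn's API.

    require "numo/narray"

    # Illustrative batch of 4 samples with 3 features (not ruby-dnn API).
    x        = Numo::SFloat[[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]
    gamma    = Numo::SFloat.ones(3)   # scale, like @gamma.data above
    beta     = Numo::SFloat.zeros(3)  # shift, like @beta.data above
    eps      = 1e-7
    momentum = 0.9

    # Train-time branch of forward: standardize over the batch axis (axis 0 by default).
    mean = x.mean(axis: 0, keepdims: true)
    var  = ((x - mean)**2).mean(axis: 0, keepdims: true)
    xn   = (x - mean) / Numo::NMath.sqrt(var + eps)
    out  = gamma * xn + beta

    # Running statistics, updated the same way as @running_mean / @running_var.
    running_mean = Numo::SFloat.zeros(1, 3)
    running_var  = Numo::SFloat.zeros(1, 3)
    running_mean = momentum * running_mean + (1 - momentum) * mean
    running_var  = momentum * running_var  + (1 - momentum) * var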
data/lib/dnn/core/optimizers.rb
CHANGED
@@ -1,283 +1,283 @@

module DNN
  module Optimizers

    # Super class of all optimizer classes.
    class Optimizer
      # @return [Float] Return the Learning rate.
      attr_accessor :learning_rate

      def initialize(learning_rate)
        @learning_rate = learning_rate
      end

      # Update layers has param.
      def update(layers)
        target_params = layers.select { |layer| layer.is_a?(HasParamLayer) && layer.trainable }
                              .map { |layer| layer.params.values }.flatten
                              .select { |param| param.grad }
        target_params.each do |param|
          update_param(param)
          param.grad = 0
        end
      end

      def to_hash(merge_hash = nil)
        hash = {class: self.class.name, learning_rate: @learning_rate}
        hash.merge!(merge_hash) if merge_hash
        hash
      end

      # Update param.
      # Classes that inherit from this class must implement this method.
      private def update_param(param)
        raise NotImplementedError.new("Class '#{self.class.name}' has implement method 'update_param'")
      end
    end


    class SGD < Optimizer
      # @return [Float] Return the momentum coefficient.
      attr_accessor :momentum

      def self.from_hash(hash)
        self.new(hash[:learning_rate], momentum: hash[:momentum])
      end

      # @param [Float] learning_rate Learning rate.
      # @param [Float] momentum momentum coefficient.
      def initialize(learning_rate = 0.01, momentum: 0)
        super(learning_rate)
        @momentum = momentum
        @v = {}
      end

      def to_hash
        super(momentum: @momentum)
      end

      private def update_param(param)
        amount = param.grad * @learning_rate
        if @momentum > 0
          @v[param] ||= 0
          amount += @momentum * @v[param]
          @v[param] = amount
        end
        param.data -= amount
      end
    end


    class Nesterov < Optimizer
      attr_accessor :momentum

      def self.from_hash(hash)
        self.new(hash[:learning_rate], momentum: hash[:momentum])
      end

      # @param [Float] learning_rate Learning rate.
      # @param [Float] momentum momentum coefficient.
      def initialize(learning_rate = 0.01, momentum: 0.9)
        super(learning_rate)
        @momentum = momentum
        @v = {}
      end

      def to_hash
        super(momentum: @momentum)
      end

      private def update_param(param)
        @v[param] ||= 0
        amount = param.grad * @learning_rate
        @v[param] = @v[param] * @momentum - amount
        param.data = (param.data + @momentum**2 * @v[param]) - (1 + @momentum) * amount
      end
    end


    class AdaGrad < Optimizer
      # @return [Float] Return the eps value.
      attr_accessor :eps

      # @param [Float] learning_rate Learning rate.
      # @param [Float] eps Value to avoid division by zero.
      def initialize(learning_rate = 0.01, eps: 1e-7)
        super(learning_rate)
        @eps = eps
        @g = {}
      end

      def self.from_hash(hash)
        self.new(hash[:learning_rate], eps: hash[:eps])
      end

      private def update_param(param)
        @g[param] ||= 0
        @g[param] += param.grad**2
        param.data -= (@learning_rate / NMath.sqrt(@g[param] + @eps)) * param.grad
      end

      def to_hash
        super(eps: @eps)
      end
    end


    class RMSProp < Optimizer
      # @return [Float] Return the alpha value.
      attr_accessor :alpha
      # @return [Float] Return the eps value.
      attr_accessor :eps

      def self.from_hash(hash)
        self.new(hash[:learning_rate], alpha: hash[:alpha], eps: hash[:eps])
      end

      # @param [Float] learning_rate Learning rate.
      # @param [Float] alpha Moving average index of past slopes.
      # @param [Float] eps Value to avoid division by zero.
      def initialize(learning_rate = 0.001, alpha: 0.9, eps: 1e-7)
        super(learning_rate)
        @alpha = alpha
        @eps = eps
        @g = {}
      end

      def to_hash
        super(alpha: @alpha, eps: @eps)
      end

      private def update_param(param)
        @g[param] ||= 0
        @g[param] = @alpha * @g[param] + (1 - @alpha) * param.grad**2
        param.data -= (@learning_rate / NMath.sqrt(@g[param] + @eps)) * param.grad
      end
    end


    class AdaDelta < Optimizer
      # @return [Float] Return the rho value.
      attr_accessor :rho
      # @return [Float] Return the eps value.
      attr_accessor :eps

      def self.from_hash(hash)
        self.new(rho: hash[:rho], eps: hash[:eps])
      end

      # @param [Float] rho Moving average index of past slopes.
      # @param [Float] eps Value to avoid division by zero.
      def initialize(rho: 0.95, eps: 1e-6)
        super(nil)
        @rho = rho
        @eps = eps
        @h = {}
        @s = {}
      end

      def to_hash
        super(rho: @rho, eps: @eps)
      end

      private def update_param(param)
        @h[param] ||= Xumo::SFloat.zeros(*param.data.shape)
        @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
        @h[param] = @rho * @h[param] + (1 - @rho) * param.grad**2
        v = (NMath.sqrt(@s[param] + @eps) / NMath.sqrt(@h[param] + @eps)) * param.grad
        @s[param] = @rho * @s[param] + (1 - @rho) * v**2
        param.data -= v
      end
    end


    class Adam < Optimizer
      # @return [Float] Return the alpha value.
      attr_accessor :alpha
      # @return [Float] Return the beta1 value.
      attr_accessor :beta1
      # @return [Float] Return the beta2 value.
      attr_accessor :beta2
      # @return [Float] Return the eps value.
      attr_accessor :eps

      def self.from_hash(hash)
        self.new(alpha: hash[:alpha], beta1: hash[:beta1], beta2: hash[:beta2], eps: hash[:eps])
      end

      # @param [Float] alpha Value used to calculate learning rate.
      # @param [Float] beta1 Moving average index of beta1.
      # @param [Float] beta2 Moving average index of beta2.
      # @param [Float] eps Value to avoid division by zero.
      def initialize(alpha: 0.001, beta1: 0.9, beta2: 0.999, eps: 1e-7)
        super(nil)
        @alpha = alpha
        @beta1 = beta1
        @beta2 = beta2
        @eps = eps
        @iter = 0
        @m = {}
        @v = {}
      end

      def update(layers)
        @iter += 1
        learning_rate = @alpha * Math.sqrt(1 - @beta2**@iter) / (1 - @beta1**@iter)
        target_params = layers.select { |layer| layer.is_a?(HasParamLayer) && layer.trainable }
                              .map { |layer| layer.params.values }.flatten
                              .select { |param| param.grad }
        target_params.each do |param|
          update_param(param, learning_rate)
          param.grad = 0
        end
      end

      def to_hash
        super(alpha: @alpha, beta1: @beta1, beta2: @beta2, eps: @eps)
      end

      private def update_param(param, learning_rate)
        @m[param] ||= 0
        @v[param] ||= 0
        @m[param] += (1 - @beta1) * (param.grad - @m[param])
        @v[param] += (1 - @beta2) * (param.grad**2 - @v[param])
        param.data -= learning_rate * @m[param] / NMath.sqrt(@v[param] + @eps)
      end
    end


    class RMSPropGraves < Optimizer
      # @return [Float] Return the alpha value.
      attr_accessor :alpha
      # @return [Float] Return the eps value.
      attr_accessor :eps

      def self.from_hash(hash)
        self.new(hash[:learning_rate], alpha: hash[:alpha], eps: hash[:eps])
      end

      # @param [Float] learning_rate Learning rate.
      # @param [Float] alpha Moving average index of past slopes.
      # @param [Float] eps Value to avoid division by zero.
      def initialize(learning_rate = 0.0001, alpha: 0.95, eps: 0.0001)
        super(learning_rate)
        @alpha = alpha
        @eps = eps
        @m = {}
        @v = {}
      end

      def to_hash
        super(alpha: @alpha, eps: @eps)
      end

      private def update_param(param)
        @m[param] ||= 0
        @v[param] ||= 0
        @m[param] = @alpha * @m[param] + (1 - @alpha) * param.grad
        @v[param] = @alpha * @v[param] + (1 - @alpha) * param.grad**2
        param.data -= (@learning_rate / NMath.sqrt(@v[param] - @m[param]**2 + @eps)) * param.grad
      end
    end

  end
end
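Every optimizer in this file can be dumped to a plain Hash with to_hash and rebuilt with from_hash, as the code above shows. A minimal round-trip sketch, assuming the library is loaded (for example with require "dnn") and using illustrative values:

    require "dnn"

    opt  = DNN::Optimizers::SGD.new(0.01, momentum: 0.9)
    hash = opt.to_hash
    # {class: "DNN::Optimizers::SGD", learning_rate: 0.01, momentum: 0.9}

    restored = DNN::Optimizers::SGD.from_hash(hash)
    restored.learning_rate  # => 0.01
    restored.momentum       # => 0.9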
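Adam is the only class here that overrides update rather than just update_param: each call increments @iter and derives a bias-corrected step size from @alpha, @beta1 and @beta2 before touching any parameter. The snippet below evaluates that scalar formula in plain Ruby with the defaults from initialize; the number of iterations shown is arbitrary.

    # learning_rate = alpha * sqrt(1 - beta2**t) / (1 - beta1**t), as in Adam#update.
    alpha, beta1, beta2 = 0.001, 0.9, 0.999

    (1..3).each do |t|
      lr = alpha * Math.sqrt(1 - beta2**t) / (1 - beta1**t)
      puts format("iter %d: learning_rate = %.6f", t, lr)
    end
    # iter 1: learning_rate = 0.000316
    # iter 2: learning_rate = 0.000235
    # iter 3: learning_rate = 0.000202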