ruby-dnn 0.10.4 → 0.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -2
- data/README.md +33 -6
- data/examples/cifar100_example.rb +3 -3
- data/examples/cifar10_example.rb +3 -3
- data/examples/dcgan/dcgan.rb +112 -0
- data/examples/dcgan/imgen.rb +20 -0
- data/examples/dcgan/train.rb +41 -0
- data/examples/iris_example.rb +3 -6
- data/examples/mnist_conv2d_example.rb +5 -5
- data/examples/mnist_define_by_run.rb +52 -0
- data/examples/mnist_example.rb +3 -3
- data/examples/mnist_lstm_example.rb +3 -3
- data/examples/xor_example.rb +4 -5
- data/ext/rb_stb_image/rb_stb_image.c +103 -0
- data/lib/dnn.rb +10 -10
- data/lib/dnn/cifar10.rb +1 -1
- data/lib/dnn/cifar100.rb +1 -1
- data/lib/dnn/core/activations.rb +21 -22
- data/lib/dnn/core/cnn_layers.rb +94 -111
- data/lib/dnn/core/embedding.rb +30 -9
- data/lib/dnn/core/initializers.rb +31 -21
- data/lib/dnn/core/iterator.rb +52 -0
- data/lib/dnn/core/layers.rb +99 -66
- data/lib/dnn/core/link.rb +24 -0
- data/lib/dnn/core/losses.rb +69 -59
- data/lib/dnn/core/merge_layers.rb +71 -0
- data/lib/dnn/core/models.rb +393 -0
- data/lib/dnn/core/normalizations.rb +27 -14
- data/lib/dnn/core/optimizers.rb +212 -134
- data/lib/dnn/core/param.rb +8 -6
- data/lib/dnn/core/regularizers.rb +10 -7
- data/lib/dnn/core/rnn_layers.rb +78 -85
- data/lib/dnn/core/utils.rb +6 -3
- data/lib/dnn/downloader.rb +3 -3
- data/lib/dnn/fashion-mnist.rb +89 -0
- data/lib/dnn/image.rb +57 -18
- data/lib/dnn/iris.rb +1 -3
- data/lib/dnn/mnist.rb +38 -34
- data/lib/dnn/version.rb +1 -1
- data/third_party/stb_image.h +16 -4
- data/third_party/stb_image_resize.h +2630 -0
- data/third_party/stb_image_write.h +4 -7
- metadata +12 -4
- data/lib/dnn/core/dataset.rb +0 -34
- data/lib/dnn/core/model.rb +0 -440
data/lib/dnn/core/normalizations.rb
CHANGED

```diff
@@ -2,18 +2,19 @@ module DNN
   module Layers

     class BatchNormalization < HasParamLayer
-
+      attr_reader :gamma
+      attr_reader :beta
+      attr_reader :running_mean
+      attr_reader :running_var
       attr_reader :axis
-      # @return [Float] Exponential moving average of mean and variance.
       attr_accessor :momentum
-      # @return [Float] Value to avoid division by zero.
       attr_accessor :eps

       def self.from_hash(hash)
         self.new(axis: hash[:axis], momentum: hash[:momentum])
       end

-      # @param [
+      # @param [Integer] axis The axis to normalization.
       # @param [Float] momentum Exponential moving average of mean and variance.
       # @param [Float] eps Value to avoid division by zero.
       def initialize(axis: 0, momentum: 0.9, eps: 1e-7)
@@ -23,27 +24,35 @@ module DNN
         @eps = eps
       end

+      def call(input)
+        x, prev_link, learning_phase = *input
+        build(x.shape[1..-1]) unless built?
+        y = forward(x, learning_phase)
+        link = Link.new(prev_link, self)
+        [y, link, learning_phase]
+      end
+
       def build(input_shape)
         super
-        @
-        @
-        @
-        @
+        @gamma = Param.new(Xumo::SFloat.ones(*output_shape), 0)
+        @beta = Param.new(Xumo::SFloat.zeros(*output_shape), 0)
+        @running_mean = Param.new(Xumo::SFloat.zeros(*output_shape))
+        @running_var = Param.new(Xumo::SFloat.zeros(*output_shape))
       end

-      def forward(x)
+      def forward(x, learning_phase)
         if learning_phase
           mean = x.mean(axis: @axis, keepdims: true)
           @xc = x - mean
-          var = (@xc**2).mean(axis: @axis, keepdims: true)
-          @std = NMath.sqrt(var + @eps)
+          var = (@xc ** 2).mean(axis: @axis, keepdims: true)
+          @std = Xumo::NMath.sqrt(var + @eps)
           xn = @xc / @std
           @xn = xn
           @running_mean.data = @momentum * @running_mean.data + (1 - @momentum) * mean
           @running_var.data = @momentum * @running_var.data + (1 - @momentum) * var
         else
           xc = x - @running_mean.data
-          xn = xc / NMath.sqrt(@running_var.data + @eps)
+          xn = xc / Xumo::NMath.sqrt(@running_var.data + @eps)
         end
         @gamma.data * xn + @beta.data
       end
```
```diff
@@ -56,7 +65,7 @@ module DNN
         end
         dxn = @gamma.data * dy
         dxc = dxn / @std
-        dstd = -((dxn * @xc) / (@std**2)).sum(axis: @axis, keepdims: true)
+        dstd = -((dxn * @xc) / (@std ** 2)).sum(axis: @axis, keepdims: true)
         dvar = 0.5 * dstd / @std
         dxc += (2.0 / batch_size) * @xc * dvar
         dmean = dxc.sum(axis: @axis, keepdims: true)
@@ -64,7 +73,11 @@ module DNN
       end

       def to_hash
-        super(
+        super(axis: @axis, momentum: @momentum, eps: @eps)
+      end
+
+      def get_params
+        { gamma: @gamma, beta: @beta, running_mean: @running_mean, running_var: @running_var }
       end
     end

```
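The new `call` method is the hook for the define-by-run API introduced in this release (see data/examples/mnist_define_by_run.rb and data/lib/dnn/core/link.rb in the file list): each layer consumes and returns an `[output, link, learning_phase]` triple, and the `Link` chain records the graph for backpropagation. A hedged sketch of driving the layer directly under that convention; real code would route through a model from data/lib/dnn/core/models.rb rather than call a bare layer:

```ruby
require "dnn"

# Illustrative only: invoke BatchNormalization by hand through its call interface.
bn = DNN::Layers::BatchNormalization.new(momentum: 0.9)

x = Numo::SFloat.new(32, 16).rand          # batch of 32 samples, 16 features
y_train, link, = bn.call([x, nil, true])   # learning_phase = true: batch statistics
y_infer, = bn.call([x, nil, false])        # learning_phase = false: running statistics
```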
data/lib/dnn/core/optimizers.rb
CHANGED
```diff
@@ -3,172 +3,191 @@ module DNN

     # Super class of all optimizer classes.
     class Optimizer
-
-      attr_accessor :learning_rate
+      attr_accessor :clip_norm

-
-
+      # @param [Float | NilClass] clip_norm Gradient clip norm.
+      def initialize(clip_norm: nil)
+        @clip_norm = clip_norm
       end

-      # Update layers has
+      # Update layers has params.
       def update(layers)
-        target_params = layers.select { |layer| layer.is_a?(HasParamLayer) && layer.trainable }
-                              .map { |layer| layer.
+        target_params = layers.select { |layer| layer.is_a?(Layers::HasParamLayer) && layer.trainable }
+                              .map { |layer| layer.get_params.values }.flatten.compact
                               .select { |param| param.grad }
+        clip_grads(target_params) if @clip_norm
+        update_params(target_params)
         target_params.each do |param|
-
-          param.grad = 0
+          param.grad = Xumo::SFloat.zeros(*param.data.shape)
         end
       end

       def to_hash(merge_hash = nil)
-        hash = {class: self.class.name,
+        hash = { class: self.class.name, clip_norm: @clip_norm }
         hash.merge!(merge_hash) if merge_hash
         hash
       end

-      # Update
-
-
-
+      # Update params.
+      private def update_params(params)
+        raise NotImplementedError.new("Class '#{self.class.name}' has implement method 'update_params'")
+      end
+
+      private def clip_grads(params)
+        norm = Math.sqrt(params.reduce(0) { |sum, param| sum + (param.grad == 0 ? 0 : (param.grad ** 2).sum) })
+        return if norm <= @clip_norm
+        rate = @clip_norm / (norm + 1e-7)
+        params.each do |param|
+          param.grad *= rate
+        end
       end
     end


     class SGD < Optimizer
-
+      attr_accessor :lr
       attr_accessor :momentum

       def self.from_hash(hash)
-        self.new(hash[:
+        self.new(hash[:lr], momentum: hash[:momentum], clip_norm: hash[:clip_norm])
       end

-      # @param [Float]
-      # @param [Float] momentum
-      def initialize(
-        super(
+      # @param [Float] lr Learning rate.
+      # @param [Float] momentum Momentum coefficient.
+      def initialize(lr = 0.01, momentum: 0, clip_norm: nil)
+        super(clip_norm: clip_norm)
+        @lr = lr
         @momentum = momentum
         @v = {}
       end

       def to_hash
-        super(momentum: @momentum)
-      end
-
-      private def
-
-
-        @
-
-
+        super(lr: @lr, momentum: @momentum)
+      end
+
+      private def update_params(params)
+        params.each do |param|
+          amount = param.grad * @lr
+          if @momentum > 0
+            @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+            amount += @momentum * @v[param]
+            @v[param] = amount
+          end
+          param.data -= amount
         end
-        param.data -= amount
       end
     end


     class Nesterov < Optimizer
+      attr_accessor :lr
       attr_accessor :momentum
-
+
       def self.from_hash(hash)
-        self.new(hash[:
+        self.new(hash[:lr], momentum: hash[:momentum], clip_norm: hash[:clip_norm])
       end

-      # @param [Float]
-      # @param [Float] momentum
-      def initialize(
-        super(
+      # @param [Float] lr Learning rate.
+      # @param [Float] momentum Momentum coefficient.
+      def initialize(lr = 0.01, momentum: 0.9, clip_norm: nil)
+        super(clip_norm: clip_norm)
+        @lr = lr
         @momentum = momentum
         @v = {}
       end

       def to_hash
-        super(momentum: @momentum)
+        super(lr: @lr, momentum: @momentum)
       end
-
-      private def
-
-
-
-
+
+      private def update_params(params)
+        params.each do |param|
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          amount = param.grad * @lr
+          @v[param] = @v[param] * @momentum - amount
+          param.data = (param.data + @momentum ** 2 * @v[param]) - (1 + @momentum) * amount
+        end
       end
     end
-
-
+
+
     class AdaGrad < Optimizer
-
+      attr_accessor :lr
       attr_accessor :eps

-
+      def self.from_hash(hash)
+        self.new(hash[:lr], eps: hash[:eps], clip_norm: hash[:clip_norm])
+      end
+
+      # @param [Float] lr Learning rate.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(
-        super(
+      def initialize(lr = 0.01, eps: 1e-7, clip_norm: nil)
+        super(clip_norm: clip_norm)
+        @lr = lr
         @eps = eps
         @g = {}
       end

-      def
-
-
-
-
-
-        @g[param] += param.grad**2
-        param.data -= (@learning_rate / NMath.sqrt(@g[param] + @eps)) * param.grad
+      private def update_params(params)
+        params.each do |param|
+          @g[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @g[param] += param.grad ** 2
+          param.data -= (@lr / Xumo::NMath.sqrt(@g[param] + @eps)) * param.grad
+        end
       end

       def to_hash
-        super(eps: @eps)
+        super(lr: @lr, eps: @eps)
       end
     end
-
+

     class RMSProp < Optimizer
-
+      attr_accessor :lr
       attr_accessor :alpha
-      # @return [Float] Return the eps value.
       attr_accessor :eps

       def self.from_hash(hash)
-        self.new(hash[:
+        self.new(hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
       end

-      # @param [Float]
+      # @param [Float] lr Learning rate.
       # @param [Float] alpha Moving average index of past slopes.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(
-        super(
+      def initialize(lr = 0.001, alpha: 0.9, eps: 1e-7, clip_norm: nil)
+        super(clip_norm: clip_norm)
+        @lr = lr
         @alpha = alpha
         @eps = eps
         @g = {}
       end

       def to_hash
-        super(alpha: @alpha, eps: @eps)
+        super(lr: @lr, alpha: @alpha, eps: @eps)
       end

-      private def
-
-
-
+      private def update_params(params)
+        params.each do |param|
+          @g[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @g[param] = @alpha * @g[param] + (1 - @alpha) * param.grad ** 2
+          param.data -= (@lr / Xumo::NMath.sqrt(@g[param] + @eps)) * param.grad
+        end
       end
     end


     class AdaDelta < Optimizer
-      # @return [Float] Return the rho value.
       attr_accessor :rho
-      # @return [Float] Return the eps value.
       attr_accessor :eps

       def self.from_hash(hash)
-        self.new(rho: hash[:rho], eps: hash[:eps])
+        self.new(rho: hash[:rho], eps: hash[:eps], clip_norm: hash[:clip_norm])
       end

       # @param [Float] rho Moving average index of past slopes.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(rho: 0.95, eps: 1e-6)
-        super(
+      def initialize(rho: 0.95, eps: 1e-6, clip_norm: nil)
+        super(clip_norm: clip_norm)
         @rho = rho
         @eps = eps
         @h = {}
```
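`clip_norm` moves gradient clipping into the `Optimizer` base class: before `update_params` runs, `clip_grads` computes the global L2 norm over every parameter gradient and rescales them all when that norm exceeds the threshold. A standalone restatement of the same arithmetic, assuming plain Numo gradient arrays instead of `Param` objects:

```ruby
require "numo/narray"

# Same logic as Optimizer#clip_grads above, minus the Param wrapper.
def clip_grads(grads, clip_norm)
  norm = Math.sqrt(grads.sum { |g| (g ** 2).sum })
  return grads if norm <= clip_norm
  rate = clip_norm / (norm + 1e-7)
  grads.map { |g| g * rate }
end

grads = [Numo::SFloat[3.0, 4.0], Numo::SFloat[12.0]]  # global L2 norm = 13
clipped = clip_grads(grads, 5.0)                      # rescaled so the norm is ~5
```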
```diff
@@ -179,103 +198,162 @@ module DNN
         super(rho: @rho, eps: @eps)
       end

-      private def
-
-
-
-
-
-
+      private def update_params(params)
+        params.each do |param|
+          @h[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @h[param] = @rho * @h[param] + (1 - @rho) * param.grad ** 2
+          v = (Xumo::NMath.sqrt(@s[param] + @eps) / Xumo::NMath.sqrt(@h[param] + @eps)) * param.grad
+          @s[param] = @rho * @s[param] + (1 - @rho) * v ** 2
+          param.data -= v
+        end
       end
     end


-    class
-
+    class RMSPropGraves < Optimizer
+      attr_accessor :lr
       attr_accessor :alpha
-      # @return [Float] Return the beta1 value.
-      attr_accessor :beta1
-      # @return [Float] Return the beta2 value.
-      attr_accessor :beta2
-      # @return [Float] Return the eps value.
       attr_accessor :eps
-
+
       def self.from_hash(hash)
-        self.new(
+        self.new(hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
       end

-      # @param [Float]
-      # @param [Float]
-      # @param [Float] beta2 Moving average index of beta2.
+      # @param [Float] lr Learning rate.
+      # @param [Float] alpha Moving average index of past slopes.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(
-        super(
+      def initialize(lr = 0.0001, alpha: 0.95, eps: 0.0001, clip_norm: nil)
+        super(clip_norm: clip_norm)
+        @lr = lr
         @alpha = alpha
-        @beta1 = beta1
-        @beta2 = beta2
         @eps = eps
-        @iter = 0
         @m = {}
         @v = {}
       end

-      def update(layers)
-        @iter += 1
-        learning_rate = @alpha * Math.sqrt(1 - @beta2**@iter) / (1 - @beta1**@iter)
-        target_params = layers.select { |layer| layer.is_a?(HasParamLayer) && layer.trainable }
-                              .map { |layer| layer.params.values }.flatten
-                              .select { |param| param.grad }
-        target_params.each do |param|
-          update_param(param, learning_rate)
-          param.grad = 0
-        end
-      end
-
       def to_hash
-        super(
+        super(lr: @lr, alpha: @alpha, eps: @eps)
       end

-      private def
-
-
-
-
-
+      private def update_params(params)
+        params.each do |param|
+          @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @m[param] = @alpha * @m[param] + (1 - @alpha) * param.grad
+          @v[param] = @alpha * @v[param] + (1 - @alpha) * param.grad ** 2
+          param.data -= (@lr / Xumo::NMath.sqrt(@v[param] - @m[param] ** 2 + @eps)) * param.grad
+        end
       end
     end


-    class
-      # @return [Float] Return the alpha value.
+    class Adam < Optimizer
       attr_accessor :alpha
-
+      attr_accessor :beta1
+      attr_accessor :beta2
       attr_accessor :eps
-
+      attr_reader :amsgrad
+
       def self.from_hash(hash)
-        self.new(hash[:
+        self.new(alpha: hash[:alpha], beta1: hash[:beta1], beta2: hash[:beta2],
+                 eps: hash[:eps], amsgrad: hash[:amsgrad], clip_norm: hash[:clip_norm])
       end

-      # @param [Float]
-      # @param [Float]
+      # @param [Float] alpha Value used to calculate learning rate.
+      # @param [Float] beta1 Moving average index of beta1.
+      # @param [Float] beta2 Moving average index of beta2.
       # @param [Float] eps Value to avoid division by zero.
-
-
+      # @param [Boolean] amsgrad Setting the true enable amsgrad.
+      def initialize(alpha: 0.001, beta1: 0.9, beta2: 0.999, eps: 1e-7, amsgrad: false, clip_norm: nil)
+        super(clip_norm: clip_norm)
         @alpha = alpha
+        @beta1 = beta1
+        @beta2 = beta2
         @eps = eps
+        @amsgrad = amsgrad
+        @t = 0
         @m = {}
         @v = {}
+        @s = {} if amsgrad
       end

       def to_hash
-
+        {
+          class: self.class.name, alpha: @alpha, beta1: @beta1, beta2: @beta2,
+          eps: @eps, amsgrad: @amsgrad, clip_norm: @clip_norm
+        }
+      end
+
+      private def update_params(params)
+        @t += 1
+        lr = @alpha * Math.sqrt(1 - @beta2 ** @t) / (1 - @beta1 ** @t)
+        params.each do |param|
+          @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @m[param] += (1 - @beta1) * (param.grad - @m[param])
+          @v[param] += (1 - @beta2) * (param.grad ** 2 - @v[param])
+          if @amsgrad
+            @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+            @s[param] = Xumo::SFloat.maximum(@s[param], @v[param])
+            param.data -= lr * @m[param] / Xumo::NMath.sqrt(@s[param] + @eps)
+          else
+            param.data -= lr * @m[param] / Xumo::NMath.sqrt(@v[param] + @eps)
+          end
+        end
+      end
+    end
+
+
+    class AdaBound < Adam
+      attr_accessor :final_lr
+      attr_accessor :gamma
+
+      def self.from_hash(hash)
+        self.new(alpha: hash[:alpha], beta1: hash[:beta1], beta2: hash[:beta2],
+                 final_lr: hash[:final_lr], gamma: hash[:gamma], eps: hash[:eps], amsgrad: hash[:amsgrad], clip_norm: hash[:clip_norm])
+      end
+
+      # @param [Float] final_lr Final learning rate.
+      # @param [Float] gamma Lower and upper range value.
+      def initialize(alpha: 0.001, beta1: 0.9, beta2: 0.999, final_lr: 0.1, gamma: 0.001, eps: 1e-7, amsgrad: false, clip_norm: nil)
+        super(alpha: alpha, beta1: beta1, beta2: beta2, eps: eps, amsgrad: amsgrad, clip_norm: clip_norm)
+        @final_lr = final_lr
+        @gamma = gamma
+      end
+
+      def to_hash
+        {
+          class: self.class.name, alpha: @alpha, beta1: @beta1, beta2: @beta2,
+          final_lr: @final_lr, gamma: @gamma, eps: @eps, amsgrad: amsgrad, clip_norm: @clip_norm
+        }
+      end
+
+      private def update_params(params)
+        @t += 1
+        lr = @alpha * Math.sqrt(1 - @beta2 ** @t) / (1 - @beta1 ** @t)
+        final_lr = @final_lr * lr / @alpha
+        lower_bound = final_lr * (1 - 1 / (@gamma * @t + 1))
+        upper_bound = final_lr * (1 + 1 / (@gamma * @t))
+        params.each do |param|
+          @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @m[param] += (1 - @beta1) * (param.grad - @m[param])
+          @v[param] += (1 - @beta2) * (param.grad ** 2 - @v[param])
+          if @amsgrad
+            @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+            @s[param] = Xumo::SFloat.maximum(@s[param], @v[param])
+            param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@s[param]) + @eps), lower_bound, upper_bound) * @m[param]
+          else
+            param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@v[param]) + @eps), lower_bound, upper_bound) * @m[param]
+          end
+        end
       end

-      private def
-
-
-
-        @v[param] = @alpha * @v[param] + (1 - @alpha) * param.grad**2
-        param.data -= (@learning_rate / NMath.sqrt(@v[param] - @m[param]**2 + @eps)) * param.grad
+      private def clip_lr(lr, lower_bound, upper_bound)
+        lr[lr < lower_bound] = lower_bound
+        lr[lr > upper_bound] = upper_bound
+        lr
       end
     end

```
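With `learning_rate` renamed to `lr` (or derived from `alpha` for the Adam family) and `clip_norm` accepted everywhere, construction and serialization follow the signatures above. A sketch based on those signatures; wiring the optimizer into a model (`Model#setup` from data/lib/dnn/core/models.rb) is assumed and not shown:

```ruby
require "dnn"
include DNN::Optimizers

sgd   = SGD.new(0.01, momentum: 0.9, clip_norm: 5.0)   # lr is now positional
adam  = Adam.new(alpha: 0.001, beta1: 0.9, beta2: 0.999, amsgrad: true)
bound = AdaBound.new(final_lr: 0.1, gamma: 0.001)

# Every optimizer round-trips through the to_hash / from_hash pair shown above.
restored = Adam.from_hash(adam.to_hash)
```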