red-chainer 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -2
- data/.travis.yml +8 -3
- data/.yardopts +1 -0
- data/Gemfile +6 -1
- data/README.md +34 -3
- data/examples/cifar/train_cifar.rb +13 -2
- data/examples/iris/iris.rb +9 -5
- data/examples/mnist/mnist.rb +16 -4
- data/lib/chainer.rb +17 -1
- data/lib/chainer/backend.rb +27 -0
- data/lib/chainer/cuda.rb +37 -15
- data/lib/chainer/dataset/convert.rb +20 -16
- data/lib/chainer/datasets/cifar.rb +8 -6
- data/lib/chainer/datasets/mnist.rb +14 -55
- data/lib/chainer/device.rb +88 -0
- data/lib/chainer/function.rb +103 -41
- data/lib/chainer/function_node.rb +454 -0
- data/lib/chainer/functions/activation/leaky_relu.rb +38 -13
- data/lib/chainer/functions/activation/log_softmax.rb +46 -9
- data/lib/chainer/functions/activation/relu.rb +8 -8
- data/lib/chainer/functions/activation/relu_grad2.rb +34 -0
- data/lib/chainer/functions/activation/sigmoid.rb +13 -11
- data/lib/chainer/functions/activation/sigmoid_grad.rb +25 -0
- data/lib/chainer/functions/activation/tanh.rb +48 -11
- data/lib/chainer/functions/array/broadcast_to.rb +56 -0
- data/lib/chainer/functions/array/cast.rb +41 -0
- data/lib/chainer/functions/array/reshape.rb +28 -0
- data/lib/chainer/functions/array/rollaxis.rb +57 -0
- data/lib/chainer/functions/array/select_item.rb +72 -0
- data/lib/chainer/functions/array/squeeze.rb +78 -0
- data/lib/chainer/functions/array/transpose.rb +44 -0
- data/lib/chainer/functions/connection/convolution_2d.rb +43 -26
- data/lib/chainer/functions/connection/convolution_2d_grad_w.rb +48 -0
- data/lib/chainer/functions/connection/deconvolution_2d.rb +159 -0
- data/lib/chainer/functions/connection/linear.rb +29 -22
- data/lib/chainer/functions/evaluation/accuracy.rb +5 -5
- data/lib/chainer/functions/loss/mean_squared_error.rb +21 -12
- data/lib/chainer/functions/loss/softmax_cross_entropy.rb +98 -71
- data/lib/chainer/functions/math/basic_math.rb +36 -30
- data/lib/chainer/functions/math/exp.rb +28 -0
- data/lib/chainer/functions/math/identity.rb +4 -3
- data/lib/chainer/functions/math/sum.rb +52 -0
- data/lib/chainer/functions/noise/dropout.rb +20 -4
- data/lib/chainer/functions/normalization/batch_normalization.rb +257 -104
- data/lib/chainer/functions/pooling/average_pooling_2d.rb +29 -6
- data/lib/chainer/functions/pooling/max_pooling_2d.rb +67 -12
- data/lib/chainer/functions/pooling/pooling_2d.rb +6 -4
- data/lib/chainer/gradient_check.rb +157 -73
- data/lib/chainer/gradient_method.rb +3 -2
- data/lib/chainer/initializers/init.rb +5 -5
- data/lib/chainer/initializers/normal.rb +4 -2
- data/lib/chainer/initializers/uniform.rb +15 -0
- data/lib/chainer/iterators/serial_iterator.rb +5 -3
- data/lib/chainer/link.rb +4 -2
- data/lib/chainer/links/connection/convolution_2d.rb +2 -2
- data/lib/chainer/links/model/classifier.rb +24 -5
- data/lib/chainer/links/normalization/batch_normalization.rb +7 -10
- data/lib/chainer/optimizer.rb +42 -11
- data/lib/chainer/optimizers/adam.rb +3 -2
- data/lib/chainer/optimizers/momentum_sgd.rb +1 -1
- data/lib/chainer/parameter.rb +7 -6
- data/lib/chainer/serializer.rb +4 -4
- data/lib/chainer/serializers/marshal.rb +10 -8
- data/lib/chainer/testing/array.rb +1 -1
- data/lib/chainer/training/extensions/evaluator.rb +2 -3
- data/lib/chainer/training/extensions/exponential_shift.rb +1 -1
- data/lib/chainer/training/extensions/progress_bar.rb +1 -0
- data/lib/chainer/training/trainer.rb +4 -9
- data/lib/chainer/training/triggers/interval.rb +7 -2
- data/lib/chainer/utils/array.rb +80 -1
- data/lib/chainer/utils/conv.rb +10 -2
- data/lib/chainer/utils/initializer.rb +2 -2
- data/lib/chainer/variable.rb +159 -69
- data/lib/chainer/variable_node.rb +64 -10
- data/lib/chainer/version.rb +1 -1
- data/red-chainer.gemspec +4 -3
- data/templates/default/layout/html/layout.erb +40 -0
- data/templates/default/onefile/html/layout.erb +33 -0
- metadata +44 -11
- data/lib/chainer/dataset/download.rb +0 -56

data/lib/chainer/functions/pooling/average_pooling_2d.rb

@@ -2,6 +2,8 @@ module Chainer
   module Functions
     module Pooling
       class AveragePooling2D < Pooling2D
+        attr_reader :in_shape, :in_dtype
+
         # Spatial average pooling function.
         #
         # This function acts similarly to :class:`Convolution2D`,
@@ -14,31 +16,52 @@ module Chainer
         # @param [integer] pad Spatial padding width for the input array. `pad=p` and `pad=[p, p]` are equivalent.
         # @return [Chainer::Variable] Output variable
         def self.average_pooling_2d(x, ksize, stride: nil, pad: 0)
-          self.new(ksize, stride: stride, pad: pad, cover_all: false).(x)
+          self.new(ksize, stride: stride, pad: pad, cover_all: false).apply([x])[0]
         end

         # Average pooling over a set of 2d planes.
-        def
-          retain_inputs([])
+        def forward(x)
           @in_shape = x[0].shape
           @in_dtype = x[0].class

-          col = Chainer::Utils::Conv.
+          col = Chainer::Utils::Conv.im2col(x[0], @kh, @kw, @sy, @sx, @ph, @pw)
           y = col.mean(axis: [2, 3])

           [y]
         end

-        def
+        def backward(indexes, gy)
+          AveragePooling2DGrad.new(self).apply(gy)
+        end
+      end
+
+      class AveragePooling2DGrad < FunctionNode
+        def initialize(apool2d)
+          @kh = apool2d.kh
+          @kw = apool2d.kw
+          @sy = apool2d.sy
+          @sx = apool2d.sx
+          @ph = apool2d.ph
+          @pw = apool2d.pw
+          @in_shape = apool2d.in_shape
+          @in_dtype = apool2d.in_dtype
+          @apool2d = apool2d
+        end
+
+        def forward(gy)
           h, w = @in_shape[2..-1]
           shape = gy[0].shape
           shape.insert(2, 1, 1)
           gcol = gy[0].reshape(*shape).tile(1, 1, @kh, @kw, 1, 1)

-          gx = Chainer::Utils::Conv.
+          gx = Chainer::Utils::Conv.col2im(gcol, @sy, @sx, @ph, @pw, h, w)
           gx /= @kh * @kw
           [gx]
         end
+
+        def backward(indexes, grad_outputs)
+          AveragePooling2D.new([@kh, @kw], stride: [@sy, @sx], pad: [@ph, @pw], cover_all: false).apply(grad_outputs)
+        end
       end
     end
   end

data/lib/chainer/functions/pooling/max_pooling_2d.rb

@@ -2,24 +2,24 @@ module Chainer
   module Functions
     module Pooling
       class MaxPooling2D < Pooling2D
+        attr_reader :in_shape, :in_dtype, :indexes
         # Spatial max pooling function
         #
         # @param [Chainer::Variable] x Input variable
-        # @param [integer || 2D integer array] Size of pooling window
-        # @param [integer || 2D integer array] Stride of pooling applications
-        # @param [integer || 2D integer array] Spatial padding width for the input array
-        # @param [boolean] If `true`, all spatial locations are pooled int some output pixels
+        # @param [integer || 2D integer array] ksize Size of pooling window
+        # @param [integer || 2D integer array] stride Stride of pooling applications
+        # @param [integer || 2D integer array] pad Spatial padding width for the input array
+        # @param [boolean] cover_all If `true`, all spatial locations are pooled int some output pixels
        # @return [Chainer::Variable] Output variable
         def self.max_pooling_2d(x, ksize, stride: nil, pad: 0, cover_all: true)
-          self.new(ksize, stride: stride, pad: pad, cover_all: cover_all).(x)
+          self.new(ksize, stride: stride, pad: pad, cover_all: cover_all).apply([x]).first
         end

-        def
-          retain_inputs([])
+        def forward(x)
           @in_shape = x[0].shape
           @in_dtype = x[0].class

-          col = Chainer::Utils::Conv.
+          col = Chainer::Utils::Conv.im2col(x[0], @kh, @kw, @sy, @sx, @ph, @pw, pval: -Float::INFINITY, cover_all: @cover_all)
           n, c, kh, kw, out_h, out_w = col.shape
           col = col.reshape(n , c, kh * kw, out_h, out_w)

@@ -33,7 +33,27 @@ module Chainer
           [y]
         end

-        def
+        def backward(indexes, gy)
+          MaxPooling2DGrad.new(self).apply(gy)
+        end
+      end
+
+      class MaxPooling2DGrad < FunctionNode
+        def initialize(mpool2d)
+          @kh = mpool2d.kh
+          @kw = mpool2d.kw
+          @sy = mpool2d.sy
+          @sx = mpool2d.sx
+          @ph = mpool2d.ph
+          @pw = mpool2d.pw
+          @cover_all = mpool2d.cover_all
+          @indexes = mpool2d.indexes
+          @in_shape = mpool2d.in_shape
+          @in_dtype = mpool2d.in_dtype
+          @mpool2d = mpool2d
+        end
+
+        def forward(gy)
           n, c, out_h, out_w = gy[0].shape
           h, w = @in_shape[2..-1]
           kh, kw = @kh, @kw
@@ -41,16 +61,51 @@ module Chainer
           gcol = @in_dtype.zeros(n * c * out_h * out_w * kh * kw)

           indexes = @indexes.flatten
-          indexes +=
-
+          indexes += indexes.class.new((indexes.size * kh * kw) / (kh * kw)).seq(0, kh * kw)
+
           gcol[indexes] = gy[0].flatten.dup
           gcol = gcol.reshape(n, c, out_h, out_w, kh, kw)
           gcol = gcol.swapaxes(2, 4)
           gcol = gcol.swapaxes(3, 5)

-          gx = Chainer::Utils::Conv.
+          gx = Chainer::Utils::Conv.col2im(gcol, @sy, @sx, @ph, @pw, h, w)
           [gx]
         end
+
+        def backward(indexes, ggx)
+          MaxPooling2DWithIndexes.new(@mpool2d).apply(ggx)
+        end
+      end
+
+      class MaxPooling2DWithIndexes < FunctionNode
+        def initialize(mpool2d)
+          @kh = mpool2d.kh
+          @kw = mpool2d.kw
+          @sy = mpool2d.sy
+          @sx = mpool2d.sx
+          @ph = mpool2d.ph
+          @pw = mpool2d.pw
+          @cover_all = mpool2d.cover_all
+          @indexes = mpool2d.indexes
+        end
+
+        def forward(x)
+          col = Chainer::Utils::Conv.im2col(x[0], @kh, @kw, @sy, @sx, @ph, @pw, pval: -Float::INFINITY, cover_all: @cover_all)
+          n, c, kh, kw, out_h, out_w = col.shape
+          col = col.reshape(n, c, kh * kw, out_h, out_w)
+          col = col.transpose(0, 1, 3, 4, 2).reshape(nil, kh * kw)
+
+          indexes = @indexes.flatten.dup
+
+          # TODO: col = col[numpy.arange(len(indexes)), indexes]
+          new_col = col.class.zeros(indexes.size)
+          x[0].class.new(indexes.size).seq.each_with_index do |v, i|
+            new_col[i] = col[v, indexes[i]]
+          end
+          col = new_col
+
+          [col.reshape(n, c, out_h, out_w)]
+        end
       end
     end
   end

data/lib/chainer/functions/pooling/pooling_2d.rb

@@ -2,15 +2,17 @@ module Chainer
   module Functions
     module Pooling
       # Base class of pooling function over a set of 2d planes
-      class Pooling2D < Chainer::
+      class Pooling2D < Chainer::FunctionNode
+        attr_reader :kh, :kw, :sy, :sx, :ph, :pw, :cover_all
+
         def initialize(ksize, stride: nil, pad: 0, cover_all: true)
           if stride.nil?
             stride = ksize
           end

-          @kh, @kw = ksize.is_a?(Array) ? ksize : [ksize, ksize]
-          @sy, @sx = stride.is_a?(Array) ? stride : [stride, stride]
-          @ph, @pw = pad.is_a?(Array) ? pad: [pad, pad]
+          @kh, @kw = ksize.is_a?(::Array) ? ksize : [ksize, ksize]
+          @sy, @sx = stride.is_a?(::Array) ? stride : [stride, stride]
+          @ph, @pw = pad.is_a?(::Array) ? pad: [pad, pad]

           @cover_all = cover_all
         end

data/lib/chainer/gradient_check.rb

@@ -1,7 +1,6 @@
 module Chainer
   def _copy_arrays(xs)
-
-    xs.map{|x| (x.is_a? Numo::NArray) ? x.dup : x}
+    xs.map{|x| Chainer.array?(x) ? x.dup : x}
   end

   # Computes numerical gradient by finite differences.
@@ -19,37 +18,31 @@ module Chainer
   # @param [Float] eps Epsilon value of finite differences.
   # @return [Array] Numerical gradient arrays corresponding to +inputs+.
   #
-  def numerical_grad(f, inputs, grad_outputs, eps=
+  def numerical_grad(f, inputs, grad_outputs, eps=1e-3)
     raise unless eps > 0
     inputs = inputs.to_a
     grad_outputs = grad_outputs.to_a
-    xp = Numo::NArray
     grads = inputs.map{|x| x.new_zeros()}

-
-
-
-
-    end
-
-    tmp.each do |x, gx|
-      x.each_with_index{|xx, *i|
-        orig = x[*i] # hold original value
+    inputs.zip(grads).each do |x, gx|
+      orig_x = x.dup # hold original value
+      x.each_with_index{|_, *i|
+        orig = orig_x[*i]
         x[*i] = orig + eps
-        ys1 = _copy_arrays(f.
+        ys1 = _copy_arrays(f.())
         x[*i] = orig - eps
-        ys2 = _copy_arrays(f.
+        ys2 = _copy_arrays(f.())
         x[*i] = orig

         ys1.zip(ys2, grad_outputs).each do |y1, y2, gy|
-          if
-
-
-
-
-
-          gx[*i] += dot / (2*eps).to_f
+          next if gy.nil?
+          diff = y1 - y2
+          if Chainer.array?(diff) && diff.empty?
+            dot = 0
+          else
+            dot = (diff * gy).sum
           end
+          gx[*i] += dot / (2 * eps)
         end
       }
     end
@@ -153,6 +146,7 @@ module Chainer
   #
   def check_backward(func, x_data, y_grad, params=[], eps: 0.001, atol: 1e-5, rtol: 1e-4, no_grads: nil, dtype: nil)
     x_data = _as_tuple(x_data)
+    xm = Chainer.get_array_module(*x_data)
     if !y_grad.nil?
       y_grad = _as_tuple(y_grad)
     end
@@ -161,80 +155,170 @@ module Chainer
     xs = x_data.map{|x| Chainer::Variable.new(x)}
     y = func.(*xs)
     y = _as_tuple(y)
-    y = Chainer::Functions::Math::Identity.
-    y = _as_tuple(y)
+    y = Chainer::Functions::Math::Identity.new.apply(y)

-
-
-
-
+    y_grad = set_y_grad(y, y_grad)
+
+    # Clear gradients which may exist if func calls backward inside of itself.
+    clear_grads(xs)
+    clear_grads(params)

-      y.zip(y_grad).each do |iy, igy|
-        iy.grad = igy
-      end
-    else
-      if (y).size != 1
-        raise TypeError, "When `y_grad` is `nil`, the function must return azero-dimentional array"
-      end
-      y_grad = [1]
-    end
     # We only need to call `backward` for one result `Chainer::Variable`.
     # `Chainer::Variable.backward` method calls `Chainer::Function.backward` of its creator.
     y[0].backward()

+    param_data = params.map { |p| p.data }
     if dtype.nil?
-      casted_xs = x_data.map{|x| Chainer::Variable.new(x)}
+      casted_xs = x_data.map { |x| Chainer::Variable.new(x) }
     else
-      if
-
-
-
-
+      raise '`dtype` is allowed only float type' if dtype != xm::DFloat && dtype != xm::SFloat
+      casted_xs = x_data.map { |x| x.is_a?(Numo::NArray) ? Chainer::Variable.new(x.cast_to(dtype)) : x }
+    end
+
+    if no_grads.nil?
+      no_grads = xs.map { |x| x.dtype != Numo::SFloat && x.dtype != Numo::DFloat }
+    else
+      raise "Length of no_grads param and xs should be same." if no_grads.size != xs.size
+    end
+
+    casted_data = casted_xs.map { |x| x.data.dup }
+
+    no_grads.zip(xs).each do |skip, x|
+      if skip
+        raise "x.grad is not nil" if x.grad != nil
+      else
+        raise 'gradients of some arguments are not calculated' if x.grad.nil?
       end
-      casted_xs = x_data.map{|x|
-        if x.class == Numo::DFloat or x.class == Numo::SFloat
-          Chainer::Variable.new(dtype.cast(x))
-        else
-          Chainer::Variable.new(x)
-        end
-      }
     end

-
+    # Keep the gradient arrays of params which may be overwritten by func
+    params_grad = params.map(&:grad)
+
+    if dtype.nil?
+      one = Numo::DFloat.new().fill(1.0)
+    else
+      one = dtype.new().fill(1.0)
+    end
+
+    g = lambda do
+      # This functions is called twice in `numerical_grad`.
+      # `one` is `1 + epsilon` or `1 - epsilon` in these calls.
+      # See the document of `numerical_grad`.
+      no_grads.zip(casted_xs, casted_data).each do |skip, cx, data|
+        next if skip || cx.data.empty?
+        # astype is require to store data with the given type
+        data = (one * data).cast_to(data.class)
+        cx.data = data
+      end
+
+      params.zip(param_data).each do |param, data|
+        if !dtype.nil?
+          param_dtype = dtype
+        else
+          param_dtype = param.dtype
+        end
+        # The inner astype is required to calculates __mul__ in
+        # `param_type` when data is low accuracy float.
+        # The outer one is require to store data with the given type.
+        param.data = (one * data.cast_to(param_dtype)).cast_to(param_dtype)
+      end
+
+      # Clear gradients to support func that calls backward inside of itself.
+      clear_grads(casted_xs)
+      clear_grads(params)
+
       ys = func.(*casted_xs)
       ys = _as_tuple(ys)
-
+      ys_data = ys.map { |y| y.data }
+      no_grads.zip(casted_xs, casted_data).each do |skip, cx, data|
+        next if skip
+        cx.data = data
+      end
+      params.zip(param_data).each do |param, data|
+        param.data = data
+      end
+      ys_data
     end

-
-
-
-
-
+    gx, = numerical_grad(g, [one], y_grad, eps)
+    gx_accum = 0
+
+    no_grads.zip(xs, casted_xs).each do |skip, x, cx|
+      next if skip
+      gxi = x.grad.flatten.dup
+      cxi = cx.data.flatten.dup
+      unless dtype.nil?
+        gxi = gxi.cast_to(dtype)
+        cxi = cxi.cast_to(dtype)
       end
+      gx_accum += gxi.empty? ? 0 : gxi.dot(cxi)
     end

-
-
-
-
+    params.zip(params_grad).each do |p, gpi|
+      gpi =gpi.flatten.dup
+      pi = p.data.flatten.dup
+      unless dtype.nil?
+        gpi = gpi.cast_to(dtype)
+        pi = pi.cast_to(dtype)
       end
-
-
-
-
-
-
-
+      gx_accum += gpi.dot(pi)
+    end
+
+    Chainer::Testing.assert_allclose(gx, gx_accum, atol: atol, rtol: rtol)
+  end
+
+  def check_double_backward(func, x_data, y_grad, x_grad_grad, params=[], params_grad_grad=[], eps: 1e-3, atol: 1e-4, rtol: 1e-3, no_grads: nil, dtype: nil)
+    x_data = _as_tuple(x_data)
+    params = _as_tuple(params)
+    n_x = x_data.size
+
+    first_order_grad = -> *inputs do
+      xs = inputs[0...n_x]
+      gys = inputs[n_x..-1]
+
+      y = _as_tuple(func.(*xs))
+      # Let all elements of y share the same creator.
+      # See the comment in check_backward.
+      y = Chainer::Functions::Math::Identity.new.apply(y)
+      set_y_grad(y, gys)
+      y[0].backward(enable_double_backprop: true)
+
+      xs.map(&:grad_var) + params.map(&:grad_var)
+    end
+
+    inputs = x_data + _as_tuple(y_grad)
+    grad_grad = _as_tuple(x_grad_grad) + _as_tuple(params_grad_grad)
+    check_backward(first_order_grad, inputs, grad_grad, params=params, eps: eps, atol: atol, rtol: rtol, no_grads: no_grads, dtype: dtype)
+  end
+
+  def set_y_grad(y, y_grad)
+    if y_grad.nil?
+      if y.size != 1
+        raise TypeError, 'When `y_grad` is `None`, the function must return a zero-dimentional array'
+      end
+      y_grad = [1]
+    else
+      if y.size != y_grad.size
+        raise TypeError, '`y_grad` must have the same length of output values'
+      end
+      y.zip(y_grad).each do |iy, igy|
+        if igy.is_a?(Chainer::Variable)
+          iy.grad_var = igy
+        else
+          iy.grad = igy
         end
       end
     end

-
-
-
-
+    y_grad
+  end
+
+  def clear_grads(xs)
+    xs.each do |x|
+      x.grad_var = nil
     end
   end
-
+
+  module_function :_copy_arrays, :numerical_grad, :_as_tuple, :check_backward, :check_double_backward, :set_y_grad, :clear_grads
+  private_class_method :set_y_grad, :clear_grads
 end