red-chainer 0.3.2 → 0.4.0
This diff shows the published contents of the two package versions as they appear in their public registry; it is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.gitignore +2 -2
- data/.travis.yml +8 -3
- data/.yardopts +1 -0
- data/Gemfile +6 -1
- data/README.md +34 -3
- data/examples/cifar/train_cifar.rb +13 -2
- data/examples/iris/iris.rb +9 -5
- data/examples/mnist/mnist.rb +16 -4
- data/lib/chainer.rb +17 -1
- data/lib/chainer/backend.rb +27 -0
- data/lib/chainer/cuda.rb +37 -15
- data/lib/chainer/dataset/convert.rb +20 -16
- data/lib/chainer/datasets/cifar.rb +8 -6
- data/lib/chainer/datasets/mnist.rb +14 -55
- data/lib/chainer/device.rb +88 -0
- data/lib/chainer/function.rb +103 -41
- data/lib/chainer/function_node.rb +454 -0
- data/lib/chainer/functions/activation/leaky_relu.rb +38 -13
- data/lib/chainer/functions/activation/log_softmax.rb +46 -9
- data/lib/chainer/functions/activation/relu.rb +8 -8
- data/lib/chainer/functions/activation/relu_grad2.rb +34 -0
- data/lib/chainer/functions/activation/sigmoid.rb +13 -11
- data/lib/chainer/functions/activation/sigmoid_grad.rb +25 -0
- data/lib/chainer/functions/activation/tanh.rb +48 -11
- data/lib/chainer/functions/array/broadcast_to.rb +56 -0
- data/lib/chainer/functions/array/cast.rb +41 -0
- data/lib/chainer/functions/array/reshape.rb +28 -0
- data/lib/chainer/functions/array/rollaxis.rb +57 -0
- data/lib/chainer/functions/array/select_item.rb +72 -0
- data/lib/chainer/functions/array/squeeze.rb +78 -0
- data/lib/chainer/functions/array/transpose.rb +44 -0
- data/lib/chainer/functions/connection/convolution_2d.rb +43 -26
- data/lib/chainer/functions/connection/convolution_2d_grad_w.rb +48 -0
- data/lib/chainer/functions/connection/deconvolution_2d.rb +159 -0
- data/lib/chainer/functions/connection/linear.rb +29 -22
- data/lib/chainer/functions/evaluation/accuracy.rb +5 -5
- data/lib/chainer/functions/loss/mean_squared_error.rb +21 -12
- data/lib/chainer/functions/loss/softmax_cross_entropy.rb +98 -71
- data/lib/chainer/functions/math/basic_math.rb +36 -30
- data/lib/chainer/functions/math/exp.rb +28 -0
- data/lib/chainer/functions/math/identity.rb +4 -3
- data/lib/chainer/functions/math/sum.rb +52 -0
- data/lib/chainer/functions/noise/dropout.rb +20 -4
- data/lib/chainer/functions/normalization/batch_normalization.rb +257 -104
- data/lib/chainer/functions/pooling/average_pooling_2d.rb +29 -6
- data/lib/chainer/functions/pooling/max_pooling_2d.rb +67 -12
- data/lib/chainer/functions/pooling/pooling_2d.rb +6 -4
- data/lib/chainer/gradient_check.rb +157 -73
- data/lib/chainer/gradient_method.rb +3 -2
- data/lib/chainer/initializers/init.rb +5 -5
- data/lib/chainer/initializers/normal.rb +4 -2
- data/lib/chainer/initializers/uniform.rb +15 -0
- data/lib/chainer/iterators/serial_iterator.rb +5 -3
- data/lib/chainer/link.rb +4 -2
- data/lib/chainer/links/connection/convolution_2d.rb +2 -2
- data/lib/chainer/links/model/classifier.rb +24 -5
- data/lib/chainer/links/normalization/batch_normalization.rb +7 -10
- data/lib/chainer/optimizer.rb +42 -11
- data/lib/chainer/optimizers/adam.rb +3 -2
- data/lib/chainer/optimizers/momentum_sgd.rb +1 -1
- data/lib/chainer/parameter.rb +7 -6
- data/lib/chainer/serializer.rb +4 -4
- data/lib/chainer/serializers/marshal.rb +10 -8
- data/lib/chainer/testing/array.rb +1 -1
- data/lib/chainer/training/extensions/evaluator.rb +2 -3
- data/lib/chainer/training/extensions/exponential_shift.rb +1 -1
- data/lib/chainer/training/extensions/progress_bar.rb +1 -0
- data/lib/chainer/training/trainer.rb +4 -9
- data/lib/chainer/training/triggers/interval.rb +7 -2
- data/lib/chainer/utils/array.rb +80 -1
- data/lib/chainer/utils/conv.rb +10 -2
- data/lib/chainer/utils/initializer.rb +2 -2
- data/lib/chainer/variable.rb +159 -69
- data/lib/chainer/variable_node.rb +64 -10
- data/lib/chainer/version.rb +1 -1
- data/red-chainer.gemspec +4 -3
- data/templates/default/layout/html/layout.erb +40 -0
- data/templates/default/onefile/html/layout.erb +33 -0
- metadata +44 -11
- data/lib/chainer/dataset/download.rb +0 -56
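The headline change in 0.4.0 is the new `Chainer::FunctionNode` base class (`data/lib/chainer/function_node.rb`, +454 lines). Functions converted to it are invoked through `apply`, explicitly retain the inputs they need, and implement `backward(indexes, grad_outputs)` in terms of other differentiable functions, which is the pattern running through the hunks below. A minimal sketch of that calling convention, assuming only what those hunks show (the `SquareSketch` class is illustrative and not part of the gem):

```ruby
require 'chainer'

# Hypothetical FunctionNode following the 0.4.0 pattern: forward works on
# NArrays, backward is written in terms of Variables so it stays differentiable.
class SquareSketch < Chainer::FunctionNode
  def self.square(x)
    self.new.apply([x]).first          # new-style entry point: apply([...]) instead of self.new.(...)
  end

  def forward(inputs)
    retain_inputs([0])                 # keep x around for backward
    x, = inputs
    [x * x]
  end

  def backward(indexes, grad_outputs)
    x, = get_retained_inputs           # retained inputs come back as Chainer::Variable
    gy = grad_outputs.first
    [gy * x * 2.0]                     # gradient built from Variable ops, not raw NArrays
  end
end
```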
data/lib/chainer/functions/connection/deconvolution_2d.rb (new file)
@@ -0,0 +1,159 @@
+module Chainer
+  module Functions
+    module Connection
+      class Deconvolution2DFunction < Chainer::FunctionNode
+        attr_reader :sy, :sx, :ph, :pw, :cover_all
+
+        # Two dimensional deconvolution function.
+        #
+        # This is an implementation of two-dimensional deconvolution.
+        # In most of deep learning frameworks and papers,
+        # this function is called <b>transposed convolution</b>.
+        # But because of historical reasons (e.g. paper by Ziller Deconvolutional Networks) and backward compatibility,
+        # this function is called +deconvolution+ in Chainer.
+        #
+        # It takes three variables: input image +x+,
+        # the filter weight +W+, and the bias vector +b+.
+        #
+        # - $n$ is the batch size.
+        # - $c_I$ and $c_O$ are the number of the input and output channels, respectively.
+        # - $h_I$ and $w_I$ are the height and width of the input image, respectively.
+        # - $h_K$ and $w_K$ are the height and width of the filters, respectively.
+        # - $h_P$ and $w_P$ are the height and width of the spatial padding size, respectively.
+        #
+        # Let $(s_Y, s_X)$ be the stride of filter application.
+        # Then, the output size $(h_O, w_O)$ is estimated by the following equations:
+        #
+        # $
+        # h_O &= s_Y (h_I - 1) + h_K - 2h_P,
+        # w_O &= s_X (w_I - 1) + w_K - 2w_P.
+        # $
+        #
+        # @param [Chainer::Variable or Numo::NArray] x Input variable of shape $(n, c_I, h_I, w_I)$.
+        # @param [Chainer::Variable or Numo::NArray] w Weight variable of shape $(c_I, c_O, h_K, w_K)$.
+        # @param [Chainer::Variable or Numo::NArray] b Bias variable of length $c_O$ (optional).
+        # @param [integer or Array<integer>] stride Stride of filter applications. +stride=s+ and +stride=[s, s]+ are equivalent.
+        # @param [integer or Array<integer>] pad Spatial padding width for input arrays. +pad=p+ and +pad=[p, p]+ are equivalent.
+        # @param [integer or Arrat<integer>] outsize Expected output size of deconvolutional operation.
+        # It should be pair of height and width $(h_O, w_O)$.
+        # Default value is +nil+ and the outsize is estimated by input size, stride and pad.
+        # @return [Chainer::Variable] Output variable of shape $(n, c_O, h_O, w_O)$.
+        #
+        # Example
+        # > n = 10
+        # > c_i, c_o = 1, 3
+        # > h_i, w_i = 5, 10
+        # > h_k, w_k = 10, 10
+        # > h_p, w_p = 5, 5
+        # > x = Numo::DFloat.new(n, c_i, h_i, w_i).rand
+        # > x.shape
+        # => [10, 1, 5, 10]
+        # > w = Numo::DFloat.new(c_i, c_o, h_k, w_k).rand
+        # > w.shape
+        # => [1, 3, 10, 10]
+        # > b = Numo::DFloat.new(c_o).rand
+        # > b.shape
+        # => [3]
+        # > s_y, s_x = 5, 5
+        # > y = Chainer::Functions::Connection::Deconvolution2DFunction.deconvolution_2d(x, w, b: b, stride: [s_y, s_x], pad: [h_p, w_p])
+        # > y.shape
+        # => [10, 3, 20, 45]
+        # > h_o = s_y * (h_i - 1) + h_k - 2 * h_p
+        # > w_o = s_x * (w_i - 1) + w_k - 2 * w_p
+        # > y.shape == [n, c_o, h_o, w_o]
+        # => true
+        def self.deconvolution_2d(x, w, b: nil, stride: 1, pad: 0, outsize: nil)
+          func = Deconvolution2DFunction.new(stride: stride, pad: pad, outsize: outsize)
+          if b.nil?
+            args = x, w
+          else
+            args = x, w, b
+          end
+          func.apply(args).first
+        end
+
+        def initialize(stride: 1, pad: 0, outsize: nil)
+          @cover_all = nil
+
+          @sy, @sx = stride.is_a?(::Array) ? stride : [stride, stride]
+          @ph, @pw = pad.is_a?(::Array) ? pad : [pad, pad]
+          @outh, @outw = outsize.nil? ? [nil, nil] : outsize
+        end
+
+        def forward(inputs)
+          retain_inputs([0, 1])
+          x, w = inputs[0...2]
+          b = inputs.size == 3 ? inputs[2] : nil
+
+          unless inputs.all? { |i| i.is_a?(Numo::NArray) }
+            if b.nil?
+              raise TypeError, "Numo::NArray must not be used together w: #{w.class}, x: #{x.class}"
+            else
+              raise TypeError, "Numo::NArray must not be used together w: #{w.class}, x: #{x.class}, b: #{b.class}"
+            end
+          end
+
+          kh, kw = w.shape[2..-1]
+          _, _, x_h, x_w = x.shape
+
+          gcol = Chainer::Utils::Math.tensordot(w, x, [0, 1]).cast_to(x.class)
+          # - k, m, n: shape of out_channel
+          # - b: number of inputs
+          # - h, w: height and width of kernels
+          # k, m, n, b, h, w -> b, k, m, n, h, w
+          gcol = gcol.transpose(3, 0, 1, 2, 4, 5)
+
+          if @outh.nil?
+            @outh = Chainer::Utils::Conv.get_deconv_outsize(x_h, kh, @sy, @ph)
+            raise TypeError, 'Height in the output should be positive.' if @outh <= 0
+          end
+          if @outw.nil?
+            @outw = Chainer::Utils::Conv.get_deconv_outsize(x_w, kw, @sx, @pw)
+            raise TypeError, 'Width in the output should be positive.' if @outw <= 0
+          end
+
+          y = Chainer::Utils::Conv.col2im(gcol, @sy, @sx, @ph, @pw, @outh, @outw)
+          if !b.nil?
+            y += b.reshape(1, b.size, 1, 1)
+          end
+          [y]
+        end
+
+        def backward(indexes, grad_outputs)
+          x, w = get_retained_inputs
+          gy = grad_outputs.first
+
+          ret = []
+
+          if indexes.include?(0)
+            set_cover_all(x, w) if @cover_all.nil?
+            gw = Chainer::Functions::Connection::Convolution2DFunction.convolution_2d(gy, w, stride: [@sy, @sx], pad: [@ph, @pw], cover_all: @cover_all)
+            ret << gw
+          end
+
+          if indexes.include?(1)
+            set_cover_all(x, w) if @cover_all.nil?
+            gw = Chainer::Functions::Connection::Convolution2DGradW.new(self).apply([gy, x]).first
+            ret << gw
+          end
+
+          if indexes.include?(2)
+            gb = Chainer::Functions::Math::Sum.sum(gy, axis: [0, 2, 3])
+            ret << gb
+          end
+
+          ret
+        end
+
+        private
+
+        def set_cover_all(x, w)
+          in_h, in_w = x.shape[2..-1]
+          kh, kw = w.shape[2..-1]
+
+          @cover_all = in_h != Chainer::Utils::Conv.get_conv_outsize(@outh, kh, @sy, @ph) || in_w != Chainer::Utils::Conv.get_conv_outsize(@outw, kw, @sx, @pw)
+        end
+      end
+    end
+  end
+end
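The output-size formula in the docstring above can be checked against the example's own numbers with plain Ruby (nothing here comes from the gem):

```ruby
# h_O = s_Y * (h_I - 1) + h_K - 2 * h_P, and likewise for the width.
s_y, s_x = 5, 5
h_i, w_i = 5, 10
h_k, w_k = 10, 10
h_p, w_p = 5, 5

h_o = s_y * (h_i - 1) + h_k - 2 * h_p   # => 20
w_o = s_x * (w_i - 1) + w_k - 2 * w_p   # => 45
p [h_o, w_o]                            # matches the [10, 3, 20, 45] output shape in the example
```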
data/lib/chainer/functions/connection/linear.rb
@@ -1,17 +1,23 @@
 module Chainer
   module Functions
     module Connection
-      class LinearFunction < Chainer::
+      class LinearFunction < Chainer::FunctionNode
         def self.linear(x, w, b=nil)
+          if x.ndim > 2
+            x = x.reshape(x.shape.first, -1)
+          end
+
           if b.nil?
-
+            args = x, w
           else
-
+            args = x, w, b
           end
+
+          self.new.apply(args).first
         end

         def forward(inputs)
-          x =
+          x = inputs[0]
           w = inputs[1]

           y = x.dot(w.transpose).cast_to(x.class)
@@ -19,28 +25,29 @@ module Chainer
             b = inputs[2]
             y += b
           end
-          return [y]
-        end

-
-
-          w = inputs[1]
-          gy = grad_outputs[0]
-          gx = gy.dot(w).cast_to(x.class).reshape(*inputs[0].shape)
-          gw = gy.transpose.dot(x).cast_to(w.class)
-          if inputs.size == 3
-            gb = gy.sum(0)
-            [gx, gw, gb]
-          else
-            [gx, gw]
-          end
+          retain_inputs([0, 1])
+          return [y]
         end

-
+        def backward(indexes, grad_outputs)
+          x, w = get_retained_inputs
+          gy = grad_outputs.first

-
-
-
+          ret = []
+          if indexes.include?(0)
+            gx = LinearFunction.linear(gy, w.transpose)
+            ret << Chainer::Functions::Array::Cast.cast(gx, x.dtype)
+          end
+          if indexes.include?(1)
+            gw = LinearFunction.linear(gy.transpose, x.transpose)
+            ret << Chainer::Functions::Array::Cast.cast(gw, w.dtype)
+          end
+          if indexes.include?(2)
+            gb = Chainer::Functions::Math::Sum.sum(gy, axis: 0)
+            ret << gb
+          end
+          ret
         end
       end
     end
data/lib/chainer/functions/evaluation/accuracy.rb
@@ -12,13 +12,13 @@ module Chainer

         def forward(inputs)
           y, t = inputs
+          xm = Chainer.get_array_module(*inputs)
           if @ignore_label
             mask = t.eq(@ignore_label)
             ignore_cnt = mask.count

-
-            pred =
-            pred = y.class[*pred].reshape(*t.shape)
+            pred = y.max_index(axis: 1) - xm::Int32.new(y.shape[0]).seq(0, y.shape[1])
+            pred = pred.reshape(*t.shape)
             pred[mask] = @ignore_label
             count = pred.eq(t).count - ignore_cnt

@@ -30,8 +30,8 @@ module Chainer
               [y.class.cast(count.to_f / total)]
             end
           else
-            pred = y.max_index(axis: 1).
-            pred =
+            pred = y.max_index(axis: 1) - xm::Int32.new(y.shape[0]).seq(0, y.shape[1])
+            pred = pred.reshape(*t.shape)

             [y.class.cast(y.class[pred.eq(t)].mean)]
           end
data/lib/chainer/functions/loss/mean_squared_error.rb
@@ -2,31 +2,40 @@ module Chainer
   module Functions
     module Loss
       # Mean squared error (a.k.a. Euclidean loss) function.
-      class MeanSquaredError <
+      class MeanSquaredError < FunctionNode
        # Mean squared error function.
        #
        # This function computes mean squared error between two variables. The mean
        # is taken over the minibatch. Note that the error is not scaled by 1/2.
        #
-        # @param [Chainer::Variable or Numo::NArray] x0 Input variable.
-        # @param [Chainer::Variable or Numo::NArray] x1 Input variable.
+        # @param [Chainer::Variable or Numo::NArray or Cumo::NArray] x0 Input variable.
+        # @param [Chainer::Variable or Numo::NArray or Cumo::NArray] x1 Input variable.
        # @return [Chainer::Variable] A variable holding an array representing the mean squared error of two inputs.
        #
        def self.mean_squared_error(x0, x1)
-          self.new.(x0, x1)
+          self.new.apply([x0, x1]).first
        end

-        def
-
-
-          diff = @diff.flatten.dup()
+        def forward(inputs)
+          retain_inputs([0, 1])
+          diff = (inputs[0] - inputs[1]).flatten.dup
          [diff.class.cast(diff.dot(diff) / diff.size)]
        end

-        def backward(
-
-
-          [
+        def backward(indexes, gy)
+          x0, x1 = get_retained_inputs
+          diff = x0 - x1
+          gy0 = Chainer::Functions::Array::BroadcastTo.broadcast_to(gy[0], diff.shape)
+          gx0 = gy0 * diff * (2.0 / diff.size)
+
+          ret = []
+          if indexes.include?(0)
+            ret << gx0
+          end
+          if indexes.include?(1)
+            ret << -gx0
+          end
+          ret
        end
      end
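A quick worked check of the rewritten `MeanSquaredError` (values chosen here; it assumes `Variable#backward` seeds a size-1 loss with a gradient of 1):

```ruby
require 'chainer'

x0 = Chainer::Variable.new(Numo::DFloat[[1.0, 2.0], [3.0, 4.0]])
x1 = Chainer::Variable.new(Numo::DFloat[[1.0, 0.0], [0.0, 0.0]])

loss = Chainer::Functions::Loss::MeanSquaredError.mean_squared_error(x0, x1)
# forward: mean of (x0 - x1)^2 over all 4 elements = (0 + 4 + 9 + 16) / 4 = 7.25

loss.backward
# backward: gx0 = 2 * (x0 - x1) / 4, so x0.grad == [[0.0, 1.0], [1.5, 2.0]] and x1.grad == -x0.grad
```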
data/lib/chainer/functions/loss/softmax_cross_entropy.rb
@@ -2,68 +2,101 @@ module Chainer
   module Functions
     module Loss
       class SoftmaxCrossEntropy < Function
-        def self.softmax_cross_entropy(x, t, normalize: true, cache_score: true, class_weight: nil, ignore_label: -1, reduce: 'mean')
-
+        def self.softmax_cross_entropy(x, t, normalize: true, cache_score: true, class_weight: nil, ignore_label: -1, reduce: 'mean', enable_double_backprop: false)
+          if enable_double_backprop
+            self.double_backward_softmax_cross_entropy(x, t, normalize, class_weight, ignore_label, reduce)
+          else
+            self.new(normalize: normalize, cache_score: cache_score, class_weight: class_weight, ignore_label: ignore_label, reduce: reduce).(x, t)
+          end
+        end
+
+        def self.double_backward_softmax_cross_entropy(x, t, normalize, class_weight, ignore_label, reduce)
+          if t.is_a?(Chainer::Variable)
+            t = t.data
+          end
+
+          self.check_class_weight_option(class_weight)
+          self.check_reduce_option(reduce)
+
+          loss = -Activation::LogSoftmax.log_softmax(x)
+
+          if class_weight
+            shape = x.ndim.times.map { |d| d != 1 ? 1 : class_weight.shape[-1] }
+            class_weight = Chainer::Functions::Array::BroadcastTo.broadcast_to(class_weight.reshape(*shape), x.shape)
+            loss = loss * class_weight
+          end
+
+          dtype = x.is_a?(Chainer::Variable) ? x.dtype : x.class
+          in_use = t.ne(ignore_label).cast_to(dtype)
+
+          loss = Chainer::Functions::Array::Rollaxis.rollaxis(loss, 1, start: loss.ndim)
+
+          # TODO: loss = chainer.functions.reshape(loss, (-1, loss.shape[-1]))
+          shape = loss.shape
+          last_shape = shape.pop
+          loss = Chainer::Functions::Array::Reshape.reshape(loss, [shape.inject(:*), last_shape])
+
+          # Replace ignore_label value with one valid for F.select_item below.
+          t = t.clip(0, loss.shape[1] - 1)
+
+          loss = Chainer::Functions::Array::SelectItem.select_item(loss, t.flatten.dup)
+          loss = Chainer::Functions::Array::Reshape.reshape(loss, t.shape)
+
+          loss = loss * in_use
+
+          if reduce == "mean"
+            count = normalize ? in_use.sum : x.shape.first
+            count = [count, 1.0].max
+            loss = loss * (1.0 / count)
+            return Chainer::Functions::Math::Sum.sum(loss)
+          else
+            return loss
+          end
         end

         def initialize(normalize: true, cache_score: true, class_weight: nil, ignore_label: -1, reduce: 'mean')
           @normalize = normalize
           @cache_score = cache_score
+          self.class.check_class_weight_option(class_weight)
           @class_weight = class_weight

-          unless class_weight.nil?
-            if @class_weight.ndim != 1
-              raise ArgumentError, 'class_weight.ndim should be 1'
-            elsif (@class_weight.class != Numo::DFloat) and (@class_weight.class != Numo::SFloat)
-              raise ArgumentError, "The dtype of class_weight should be 'Numo::DFloat' or 'Numo::SFloat'"
-            elsif @class_weight.kind_of?(Chainer::Variable)
-              raise ArgumentError, 'class_weight should be a Numo::NArray, not a chainer.Variable'
-            end
-          end
-
           @ignore_label = ignore_label
-          unless ['mean', 'no'].include?(reduce)
-            raise ArgumentError, "only 'mean' and 'no' are valid for 'reduce', but #{reduce} is given"
-          end

+          self.class.check_reduce_option(reduce)
           @reduce = reduce
         end

-        def
+        def forward(inputs)
+          xm = Chainer.get_array_module(*inputs)
           x, t = inputs
           log_y = Activation._log_softmax(x)

           if @cache_score
-            @y =
+            @y = xm::NMath.exp(log_y)
           end
           if @class_weight
             shape = x.ndim.times.map { |e| e == 1 ? true : 1 }
-            log_y *= Chainer::
+            log_y *= Chainer::Utils::Array.broadcast_to(@class_weight.reshape(*shape), x.shape)
           end
-          log_yd = Chainer::
+          log_yd = Chainer::Utils::Array.rollaxis(log_y, 1)
           begin
             log_yd = log_yd.reshape(log_yd.shape[0], true)
           rescue ArgumentError
           end
-
-
-
-
-
-          log_p = []
-          ravel_arr.each_with_index do |r, i|
-            log_p << log_yd[r, i]
+
+          log_p = log_yd[t.class.maximum(t.flatten, 0), t.class.new(t.size).seq].diagonal
+          if @ignore_label
+            t_valid= t.ne(@ignore_label)
+            log_p *= t_valid.flatten
           end
-          log_p = log_yd.class.[](*log_p)
-          log_p[t.flatten.dup.eq(@ignore_label)] = 0

           if @reduce == 'mean'
-            if @normalize
-
+            if @normalize and t_valid
+              @coeff = 1.0 / log_p.class.maximum(Chainer::Utils::Array.force_array(t_valid.count), 1)
             else
               count = x.shape[0]
+              @coeff = 1.0 / [count, 1].max
             end
-            @coeff = 1.0 / [count, 1].max
             y = log_p.sum(keepdims: true) * (-@coeff)
             [y.class.cast(y[0])]
           else
@@ -71,7 +104,8 @@ module Chainer
           end
         end

-        def
+        def backward(inputs, grad_outputs)
+          xm = Chainer.get_array_module(*(inputs + grad_outputs))
           x, t = inputs
           gloss = grad_outputs[0]

@@ -79,24 +113,24 @@ module Chainer
             y = @y.dup
           else
             y = Activation._log_softmax(x)
-            y =
+            y = xm::NMath.exp(y)
           end

           if y.ndim == 2
             gx = y
+            # TODO(sonots): Avoid to_a especially in Cumo to improve performance
             t.class.new(t.shape[0]).seq(0).to_a.zip(t.class.maximum(t, 0).to_a).each{|v| gx[*v] -= 1}

             if @class_weight
               shape = x.ndim.times.map { |d| d == 1 ? true : 1 }
-              c = Chainer::
-              c = c
-              gx *= Chainer::
+              c = Chainer::Utils::Array.broadcast_to(@class_weight.reshape(*shape), x.shape)
+              c = c[t.class.new(t.shape[0]).seq, t.class.maximum(t, 0)].diagonal.dup
+              gx *= Chainer::Utils::Array.broadcast_to(c.expand_dims(1), gx.shape)
             end

-
-
-
-            gx *= bit.reshape(t.shape[0], 1)
+            if @ignore_label
+              gx *= (t.ne @ignore_label).reshape(t.shape[0], 1)
+            end
           else
             # in the case where y.ndim is higher than 2,
             # we think that a current implementation is inefficient
@@ -104,18 +138,21 @@ module Chainer

             n_unit = t.size / t.shape[0]
             gx = y.reshape(y.shape[0], y.shape[1], true)
-            fst_index =
-            trd_index =
+            fst_index = xm::Int32.new(t.size).seq(0) / n_unit
+            trd_index = xm::Int32.new(t.size).seq(0) % n_unit
+            # TODO(sonots): Avoid to_a especially in Cumo to improve performance
             fst_index.to_a.zip(t.class.maximum(t.flatten.dup, 0).to_a, trd_index.to_a).each{|v| gx[*v] -= 1}
             if @class_weight
               shape = x.ndim.times.map{|d| d == 1 ? true : 1}
-              c = Chainer::
+              c = Chainer::Utils::Array.broadcast_to(@class_weight.reshape(*shape), x.shape)
               c = c.reshape(*gx.shape)
-              c = c
+              c = c[fst_index, t.class.maximum(t.flatten.dup, 0), trd_index].diagonal.diagonal.dup
               c = c.reshape(y.shape[0], 1, true)
-              gx *= Chainer::
+              gx *= Chainer::Utils::Array.broadcast_to(c, gx.shape)
+            end
+            if @ignore_label
+              gx *= (t.ne @ignore_label).reshape(t.shape[0], 1, true)
             end
-            gx *= (t.ne @ignore_label).reshape(t.shape[0], 1, true)
             gx = gx.reshape(*y.shape)
           end

@@ -126,36 +163,26 @@ module Chainer
           end
           return [gx, nil]
         end
-      end

-
-
-          axes.delete_at(axis)
-          axes.insert(start <= axes.size ? start : -1, axis)
-          y.transpose(*axes)
-        end
+        def self.check_class_weight_option(class_weight)
+          return if class_weight.nil?

-
-
-
+          xm = Chainer.get_array_module(@class_weight)
+          if class_weight.ndim != 1
+            raise ArgumentError, 'class_weight.ndim should be 1'
+          elsif (class_weight.class != xm::DFloat) and (class_weight.class != xm::SFloat)
+            raise ArgumentError, "The dtype of class_weight should be 'DFloat' or 'SFloat'"
+          elsif class_weight.kind_of?(Chainer::Variable)
+            raise ArgumentError, 'class_weight should be a NArray, not a chainer.Variable'
+          end
         end

-
-
-
-          if array.shape[i] == 1
-            tile_shape << s
-          elsif array.shape[i] == s
-            tile_shape << 1
-          else
-            raise TypeError, "Shape of data mismatch\n#{array.shape} != #{shape}"
+        def self.check_reduce_option(reduce)
+          unless ['mean', 'no'].include?(reduce)
+            raise ArgumentError, "only 'mean' and 'no' are valid for 'reduce', but #{reduce} is given"
           end
-
-
-          array.tile(*shape[0...-array.shape.size], *tile_shape)
+        end
       end
-
-        module_function :rollaxis, :broadcast_to
     end
   end
 end
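The only user-visible change to `softmax_cross_entropy` is the new `enable_double_backprop` keyword, which routes the call through `double_backward_softmax_cross_entropy` so the loss is composed entirely of FunctionNode ops (`log_softmax`, `select_item`, `sum`) and its gradient graph can itself be differentiated. A hedged usage sketch (data and shapes are assumptions):

```ruby
require 'chainer'

x = Chainer::Variable.new(Numo::SFloat.new(8, 10).rand)   # batch of 8, 10 classes
t = Numo::Int32.new(8).rand(10)                           # integer labels in 0...10

loss = Chainer::Functions::Loss::SoftmaxCrossEntropy.softmax_cross_entropy(
  x, t, enable_double_backprop: true
)
loss.backward   # x.grad is built from differentiable ops rather than hand-written NArray math
```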