tensor_stream-opencl 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +11 -4
  3. data/benchmark/benchmark.rb +91 -0
  4. data/benchmark_intel.txt +36 -0
  5. data/lib/tensor_stream/opencl/array_ops.rb +395 -0
  6. data/lib/tensor_stream/opencl/images_ops.rb +62 -0
  7. data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
  8. data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
  9. data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
  10. data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
  11. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
  12. data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
  13. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
  14. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
  15. data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
  16. data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
  17. data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
  18. data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
  19. data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
  20. data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
  21. data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
  22. data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
  23. data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
  24. data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
  25. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
  26. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
  27. data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
  28. data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
  29. data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
  30. data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
  31. data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
  32. data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
  33. data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
  34. data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
  35. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
  36. data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
  37. data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
  38. data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
  39. data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
  40. data/lib/tensor_stream/opencl/version.rb +1 -1
  41. data/samples/iris.data +150 -0
  42. data/samples/iris.rb +110 -0
  43. data/samples/mnist_data.rb +65 -0
  44. data/samples/multigpu.rb +73 -0
  45. data/samples/nearest_neighbor.rb +56 -0
  46. data/samples/rnn.rb +108 -0
  47. data/tensor_stream-opencl.gemspec +4 -1
  48. metadata +62 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dccdd97c6bdddfa8e1100dc135e7eb74d78218c3e91c75a0ef06e69be5b5ab2e
4
- data.tar.gz: 52b061f6e1eb393ab9d0f54d7feebd497d0ad6b9d735eb1ec602f21cb1fcbd79
3
+ metadata.gz: 80aa4c8e84193ba879b9c7863b8103cd345b6591ec0a534162c53965609f1bd1
4
+ data.tar.gz: 88840b00a6c4a71540d837a4e20378cf2aafe4efda1990d2f978f401cae35c83
5
5
  SHA512:
6
- metadata.gz: '0990739a203b75ca8900cefb77781675abc866a1fa9a5a2aefe19fbd528f06a83ba2e06e4ddb0d04f41cf76b460fedc6cf7bfd70e05816e0346ed96fb9c022d4'
7
- data.tar.gz: 8b438e82d0d3d9234053b12154d49b84397fe0e0cb1a1a4cd9159bb957e0132e91b0aa8f6ffb4df119e20947f49d1a9639b53ff793b3c2645e88ab99daeef2dd
6
+ metadata.gz: c8c74bbc136ea42c8a01506a0b606bbde8a83a922026caef917d3eb8dbad1c41298fea37040e846ea1eee0683af35f0dd24df7d5449dac75c3e175ed07d94d49
7
+ data.tar.gz: 2235974d1d8dc5cfe9117991cb5ea4dff2b75409e26e20b197414613484a68b482c41796ff74699b041dbcf5f963721ffffa2f5f1a9f28e1f05b5bb96a081039
data/Gemfile.lock CHANGED
@@ -1,13 +1,15 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- tensor_stream-opencl (0.1.3)
4
+ tensor_stream-opencl (0.2.0)
5
+ oily_png
5
6
  opencl_ruby_ffi
6
- tensor_stream
7
+ tensor_stream (~> 0.9.0)
7
8
 
8
9
  GEM
9
10
  remote: https://rubygems.org/
10
11
  specs:
12
+ awesome_print (1.8.0)
11
13
  byebug (10.0.2)
12
14
  chunky_png (1.3.10)
13
15
  coderay (1.1.2)
@@ -16,10 +18,13 @@ GEM
16
18
  diff-lcs (1.3)
17
19
  ffi (1.9.25)
18
20
  method_source (0.9.0)
21
+ mnist-learn (0.1.1)
19
22
  narray (0.6.1.2)
20
- narray_ffi (1.4.3)
23
+ narray_ffi (1.4.4)
21
24
  ffi (~> 1.9, >= 1.9.3)
22
25
  narray (~> 0.6, >= 0.6.0.8)
26
+ oily_png (1.2.1)
27
+ chunky_png (~> 1.3.7)
23
28
  opencl_ruby_ffi (1.3.4)
24
29
  ffi (~> 1.9, >= 1.9.3)
25
30
  narray (~> 0.6, >= 0.6.0.8)
@@ -44,7 +49,7 @@ GEM
44
49
  diff-lcs (>= 1.2.0, < 2.0)
45
50
  rspec-support (~> 3.8.0)
46
51
  rspec-support (3.8.0)
47
- tensor_stream (0.8.5)
52
+ tensor_stream (0.9.0)
48
53
  chunky_png
49
54
  concurrent-ruby
50
55
  deep_merge
@@ -53,7 +58,9 @@ PLATFORMS
53
58
  ruby
54
59
 
55
60
  DEPENDENCIES
61
+ awesome_print
56
62
  bundler (~> 1.16)
63
+ mnist-learn
57
64
  pry-byebug
58
65
  rake (~> 10.0)
59
66
  rspec (~> 3.0)
@@ -0,0 +1,91 @@
1
+ require "bundler/setup"
2
+ require 'tensor_stream'
3
+ require 'benchmark'
4
+ require 'pry-byebug'
5
+ require 'awesome_print'
6
+ require 'tensor_stream/opencl'
7
+
8
+ def tr(t, places = 1)
9
+ if t.is_a?(Array)
10
+ return t.collect do |v|
11
+ tr(v, places)
12
+ end
13
+ end
14
+
15
+ return t unless t.is_a?(Float)
16
+
17
+ t.round(places)
18
+ end
19
+
20
+ tf = TensorStream
21
+
22
+ srand(5)
23
+ seed = 5
24
+ tf.set_random_seed(seed)
25
+
26
+ SHAPES = [32, 32]
27
+
28
+ sess = tf.session(:ruby_evaluator)
29
+
30
+ a = tf.constant(sess.run(tf.random_uniform(SHAPES)))
31
+ a_int = tf.constant([
32
+ [1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
33
+ [2, 2, 3, 4, 4, 1, 1, 1, 1, 4, 1, 1],
34
+ [3, 2, 3, 4, 0, 1, 1, 2, 1, 1, 2, 1],
35
+ [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 3, 1],
36
+ [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 4, 1],
37
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 5, 1],
38
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 6, 1],
39
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 0, 0, 1],
40
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 6, 1],
41
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 1],
42
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
43
+ [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
44
+ ])
45
+
46
+ b = tf.constant(sess.run(tf.random_uniform(SHAPES)))
47
+
48
+ c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
49
+
50
+ d = tf.constant(sess.run(tf.random_uniform(SHAPES)))
51
+
52
+ p = tf.placeholder('float')
53
+ q = tf.placeholder('float')
54
+
55
+ model = -tf.sin(a.dot(b + p) + c).dot(a) + tf.cos(a.dot(d + q))
56
+ single_function_test = (tf.sigmoid(a * p) * tf.sigmoid(b * q)) + c
57
+ pow_f = tf.pow(a, 3)
58
+ pow_i = tf.pow(a_int, 3)
59
+ matmul = tf.matmul(a, b)
60
+ out_of_order = tf.matmul(a, b) + tf.matmul(c, d)
61
+ softmax = tf.nn.softmax(a)
62
+ add_n = tf.add_n([a,b,c,d])
63
+ split = tf.split(a, 4)
64
+
65
+ puts TensorStream::Evaluator.default_evaluators
66
+
67
+ sess2 = tf.session
68
+
69
+ puts `cat /proc/cpuinfo | grep "model name" | head -1`
70
+ device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
71
+ puts "OpenCL device #{device.platform.to_s} #{device.name}"
72
+ Benchmark.bmbm do |x|
73
+ x.report("pure ruby split :") { 100.times do sess.run(split) end }
74
+ x.report("opencl split :") { 100.times do sess2.run(split) end }
75
+ x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
76
+ x.report("opencl add_n :") { 100.times do sess2.run(add_n) end }
77
+ x.report("pure ruby ooo matmul :") { 100.times do sess.run(out_of_order) end }
78
+ x.report("opencl ooo matmul :") { 100.times do sess2.run(out_of_order) end }
79
+ x.report("pure ruby softmax :") { 100.times do sess.run(softmax) end }
80
+ x.report("opencl softmax :") { 100.times do sess2.run(softmax) end }
81
+ x.report("pure ruby matmul :") { 100.times do sess.run(matmul) end }
82
+ x.report("opencl matmul :") { 100.times do sess2.run(matmul) end }
83
+ x.report("pure ruby :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
84
+ x.report("opencl :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
85
+ x.report("pure ruby single function:") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
86
+ x.report("opencl singlefunction:") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
87
+ x.report("pure ruby pow float:") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
88
+ x.report("opencl pow float:") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
89
+ x.report("pure ruby pow int:") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
90
+ x.report("opencl pow int:") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
91
+ end
@@ -0,0 +1,36 @@
1
+ TensorStream::Evaluator::OpenclEvaluator
2
+ TensorStream::Evaluator::RubyEvaluator
3
+ model name : Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz
4
+ OpenCL device Intel Gen OCL Driver Intel(R) HD Graphics Skylake ULT GT2
5
+ Rehearsal --------------------------------------------------------------
6
+ pure ruby ooo matmul : 1.800000 0.000000 1.800000 ( 1.803752)
7
+ opencl ooo matmul : 0.520000 0.050000 0.570000 ( 0.630992)
8
+ pure ruby softmax : 0.300000 0.000000 0.300000 ( 0.303185)
9
+ opencl softmax : 0.180000 0.010000 0.190000 ( 0.200246)
10
+ pure ruby matmul : 0.860000 0.010000 0.870000 ( 0.869387)
11
+ opencl matmul : 0.260000 0.020000 0.280000 ( 0.335164)
12
+ pure ruby : 2.960000 0.020000 2.980000 ( 2.980800)
13
+ opencl : 1.050000 0.090000 1.140000 ( 1.258354)
14
+ pure ruby single function: 0.460000 0.000000 0.460000 ( 0.464543)
15
+ opencl singlefunction: 0.570000 0.020000 0.590000 ( 0.590300)
16
+ pure ruby pow float: 0.120000 0.000000 0.120000 ( 0.123025)
17
+ opencl pow float: 0.290000 0.010000 0.300000 ( 0.316175)
18
+ pure ruby pow int: 0.020000 0.000000 0.020000 ( 0.021570)
19
+ opencl pow int: 0.180000 0.000000 0.180000 ( 0.194088)
20
+ ----------------------------------------------------- total: 9.800000sec
21
+
22
+ user system total real
23
+ pure ruby ooo matmul : 1.860000 0.000000 1.860000 ( 1.866387)
24
+ opencl ooo matmul : 0.410000 0.040000 0.450000 ( 0.505565)
25
+ pure ruby softmax : 0.300000 0.000000 0.300000 ( 0.298407)
26
+ opencl softmax : 0.120000 0.000000 0.120000 ( 0.128033)
27
+ pure ruby matmul : 0.830000 0.000000 0.830000 ( 0.836471)
28
+ opencl matmul : 0.240000 0.010000 0.250000 ( 0.269629)
29
+ pure ruby : 2.950000 0.000000 2.950000 ( 2.947306)
30
+ opencl : 0.930000 0.100000 1.030000 ( 1.205344)
31
+ pure ruby single function: 0.650000 0.000000 0.650000 ( 0.642834)
32
+ opencl singlefunction: 0.840000 0.040000 0.880000 ( 1.097814)
33
+ pure ruby pow float: 0.140000 0.000000 0.140000 ( 0.140097)
34
+ opencl pow float: 0.190000 0.010000 0.200000 ( 0.269772)
35
+ pure ruby pow int: 0.030000 0.000000 0.030000 ( 0.030491)
36
+ opencl pow int: 0.040000 0.010000 0.050000 ( 0.084335)
@@ -0,0 +1,395 @@
1
+ module TensorStream
2
+ module OpenCLHelpers
3
+ # Collection of math functions for interfacing with OpenCL kernels
4
+ module ArrayOps
5
+ def ArrayOps.included(klass)
6
+ klass.class_eval do
7
+ register_op :expand_dims, buffer: true do |_context, tensor, inputs|
8
+ axis = inputs[1].buffer[0]
9
+ shape = inputs[0].shape.dup
10
+ axis = -axis if axis == shape.size
11
+ new_shape = shape.insert(axis, 1).compact
12
+ new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
13
+ convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
14
+ end
15
+
16
+ register_op :fill, buffer: true do |_context, tensor, inputs|
17
+ shape = inputs[0]
18
+ value = inputs[1]
19
+
20
+ narray_size = shape.buffer.to_a.reduce(:*) || 1
21
+ cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
22
+
23
+ buffer = if cl_buffer
24
+ cl_buffer.buffer
25
+ else
26
+ allocate_narray_for_type(tensor.data_type, narray_size)
27
+ end
28
+
29
+ buffer.fill!(value.buffer[0])
30
+ convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
31
+ end
32
+
33
+ register_op :split do |context, tensor, inputs|
34
+ value, num_split, axis = inputs
35
+ value_shape = value.shape
36
+ axis = read_final_result(complete_eval(axis, context))
37
+ num_split = read_final_result(complete_eval(num_split, context))
38
+
39
+ multipliers = value_shape.dup.drop(1).reverse.inject([1]) do |a, s|
40
+ a << s * a.last
41
+ end.reverse
42
+
43
+ outputs = if !num_split.is_a?(Array) # scalar split
44
+ split_target = value_shape[axis]
45
+ raise TensorStream::ValueError, "#{num_split} does not divide #{split_target} evenly" if split_target % num_split != 0
46
+
47
+ piece_size = split_target / num_split
48
+
49
+ new_shape = value_shape.dup
50
+ new_shape[axis] = piece_size
51
+
52
+ if axis.zero? # axis zero fast copy path
53
+ Array.new(num_split) do |index|
54
+ _create_result_sub_buffer(value, index, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}_#{num_split}")
55
+ end
56
+ else
57
+ # create buffers for each piece
58
+ work_buffer = _create_result_buffer(tensor.data_type, value_shape, "#{tensor.name}/out")
59
+ piece_size = new_shape.reduce(:*)
60
+ work_group = [num_split, piece_size]
61
+
62
+ divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
63
+ a << s * a.last
64
+ end.reverse
65
+
66
+ cl_piece_size = OpenCL::Int1.new(piece_size)
67
+ event_wait_list = build_event_wait_list(inputs)
68
+ step = value_shape[axis] / num_split
69
+ event = _cl_program('split', step: step, axis: axis, mul: multipliers, dest: divisors, data_type: tensor.data_type).split(_opencl_queue, work_group,
70
+ cl_piece_size,
71
+ value.cl_buffer,
72
+ work_buffer.cl_buffer,
73
+ event_wait_list: event_wait_list)
74
+ work_buffer.op = event
75
+
76
+ Array.new(num_split) do |index|
77
+ _create_result_sub_buffer(work_buffer, index, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}_#{num_split}")
78
+ end
79
+ end
80
+ else
81
+ raise TensorStream::ValueError, "#{num_split} does not divide #{value_shape[axis]} evenly" if num_split.reduce(:+) != value_shape[axis]
82
+ # compute shapes of individual output buffers
83
+ new_shapes = num_split.each_with_index.collect do |num, index|
84
+ new_shape = value_shape.dup
85
+ new_shape[axis] = num
86
+ new_shape
87
+ end
88
+ if axis.zero? # axis zero fast copy path
89
+ start = 0
90
+ out = []
91
+ new_shapes.each_with_index do |new_shape, index|
92
+ element_count = new_shape.reduce(:*) || 1
93
+ region_size_in_bytes = element_count * value.buffer.element_size
94
+ out << _create_variable_result_sub_buffer(value, index, start, region_size_in_bytes, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}_#{new_shape.join('.')}")
95
+ start += region_size_in_bytes
96
+ end
97
+ out
98
+ else
99
+ # create buffers for each piece
100
+ work_buffer = _create_result_buffer(tensor.data_type, value_shape, "#{tensor.name}/out")
101
+ out = []
102
+ start = 0
103
+
104
+ steps = num_split.dup.reverse.drop(1).inject([0]) do |a, s|
105
+ a << s + a.last
106
+ end
107
+
108
+ offsets = new_shapes.dup.reverse.drop(1).inject([0]) do |a, shape|
109
+ size_bytes = shape.reduce(:*) || 1
110
+ a << a.last + size_bytes
111
+ end
112
+
113
+ events = new_shapes.each_with_index.collect do |shape, index|
114
+ offset = offsets[index]
115
+ step = steps[index]
116
+ divisors = shape.dup.drop(1).reverse.inject([1]) do |a, s|
117
+ a << s * a.last
118
+ end.reverse
119
+ piece_size = shape.reduce(:*) || 1
120
+ work_group = [piece_size]
121
+ cl_offset = OpenCL::Int1.new(offset)
122
+
123
+ _cl_program('split_n', axis: axis,
124
+ div: divisors,
125
+ mul: multipliers,
126
+ step: step,
127
+ data_type: tensor.data_type).
128
+ split(_opencl_queue,
129
+ work_group,
130
+ cl_offset,
131
+ value.cl_buffer,
132
+ work_buffer.cl_buffer,
133
+ event_wait_list: event_wait_list)
134
+ end
135
+ work_buffer.op = events
136
+ new_shapes.each_with_index do |new_shape, index|
137
+ element_count = new_shape.reduce(:*) || 1
138
+ region_size_in_bytes = element_count * work_buffer.buffer.element_size
139
+ out << _create_variable_result_sub_buffer(work_buffer, index, start, region_size_in_bytes, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}_#{new_shape.join('.')}")
140
+ start += region_size_in_bytes
141
+ end
142
+ out
143
+ end
144
+ end
145
+
146
+ TensorStream::Evaluator::OutputGroup.new(outputs, outputs.map(&:data_type))
147
+ end
148
+
149
+ register_op :concat do |context, tensor, inputs|
150
+ axis = inputs.shift
151
+ shape = inputs[0].shape
152
+
153
+ normal_shape = inputs[0].shape.dup
154
+
155
+ axis = read_final_result(_run(axis, context))
156
+ axis = normal_shape.size - 1 if axis == -1
157
+
158
+ divisors = normal_shape.dup.drop(1).reverse.inject([1]) do |a, s|
159
+ a << s * a.last
160
+ end.reverse
161
+
162
+ new_shape = inputs[0].shape.dup
163
+ new_shape[axis] = 0
164
+ inputs.each do |input|
165
+ new_shape[axis] += input.shape[axis]
166
+ end
167
+
168
+ multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
169
+ a << s * a.last
170
+ end.reverse
171
+
172
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
173
+ ops = if axis.zero? # fast path
174
+ inputs.each_with_index.map do |input, index|
175
+ next if input.empty_value?
176
+ start = index * input.buffer.size * input.buffer.element_size
177
+ region = [input.buffer.size * input.buffer.element_size, 1, 1]
178
+ event_wait_list = build_event_wait_list(input)
179
+ _opencl_queue.enqueue_copy_buffer_rect(input.cl_buffer, output_buffer.cl_buffer,
180
+ region, dst_origin: [start, 0, 0], event_wait_list: event_wait_list)
181
+ end.compact
182
+ else
183
+ elem_size = shape.empty? ? 1 : shape.reduce(:*)
184
+ cl_n = OpenCL::Int1.new(elem_size)
185
+
186
+ steps = inputs.map(&:shape).reverse.drop(1).inject([0]) do |a, shape|
187
+ a << shape[axis] + a.last
188
+ end
189
+
190
+ work_group = [elem_size]
191
+ event_wait_list = build_event_wait_list(inputs)
192
+
193
+ inputs.each_with_index.map do |input, index|
194
+ cl_index = OpenCL::Int1.new(index)
195
+ step = OpenCL::Int1.new(steps[index])
196
+ _cl_program('concat', data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).
197
+ concat(_opencl_queue, work_group, cl_n, cl_index, step, input.cl_buffer,
198
+ output_buffer.cl_buffer, event_wait_list: event_wait_list)
199
+ end
200
+ end
201
+ output_buffer.op = ops
202
+ output_buffer
203
+ end
204
+
205
+ register_op :stack do |_context, tensor, inputs|
206
+ axis = tensor.options[:axis] || 0
207
+ shape = inputs[0].shape
208
+ rank = shape.size + 1
209
+ elem_size = shape.empty? ? 1 : shape.reduce(:*)
210
+
211
+ new_shape = [inputs.size]
212
+ shape.inject(new_shape) { |ns, s| ns << s }
213
+
214
+ divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
215
+ a << s * a.last
216
+ end.reverse
217
+
218
+ axis = rank + axis if axis < 0
219
+ rotated_shape = Array.new(axis + 1) { new_shape.shift }
220
+ new_shape = rotated_shape.rotate! + new_shape
221
+
222
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
223
+ multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
224
+ a << s * a.last
225
+ end.reverse
226
+
227
+ cl_n = OpenCL::Int1.new(elem_size)
228
+ work_group = [elem_size]
229
+
230
+ ops = if axis.zero? # fast path if axis == 0
231
+ step = multipliers[0]
232
+ inputs.each_with_index.map do |input, index|
233
+ start = index * step * input.buffer.element_size
234
+ region = [input.buffer.size * input.buffer.element_size, 1, 1]
235
+ _opencl_queue.enqueue_copy_buffer_rect(input.cl_buffer, output_buffer.cl_buffer, region, dst_origin: [start, 0, 0], event_wait_list: input.op)
236
+ end
237
+ else
238
+ event_wait_list = build_event_wait_list(inputs)
239
+ inputs.each_with_index.map do |input, index|
240
+ cl_index = OpenCL::Int1.new(index)
241
+ _cl_program('pack', data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
242
+ end
243
+ end
244
+
245
+ output_buffer.op = ops
246
+ output_buffer
247
+ end
248
+
249
+ register_op :unstack do |context, tensor, inputs|
250
+ value = inputs[0]
251
+ axis = tensor.options[:axis] || 0
252
+ new_shape = value.shape.dup
253
+ rank = new_shape.size - 1
254
+
255
+ elem_size = new_shape.empty? ? 1 : new_shape.reduce(:*)
256
+
257
+ divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
258
+ a << s * a.last
259
+ end.reverse
260
+
261
+ axis = rank + axis if axis < 0
262
+ rotated_shape = Array.new(axis + 1) { new_shape.shift }
263
+ new_shape = rotated_shape.rotate!(-1) + new_shape
264
+
265
+ multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
266
+ a << s * a.last
267
+ end.reverse
268
+
269
+ step = multipliers[0]
270
+ sub_shape = new_shape.dup
271
+ sub_shape.shift
272
+
273
+ outputs = if axis.zero? # shortcut for axis == 0
274
+ Array.new(new_shape[0]) do |index|
275
+ _create_result_sub_buffer(value, index, tensor.data_type, sub_shape, "#{tensor.name}/out_#{index}")
276
+ end
277
+ else
278
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
279
+ cl_n = OpenCL::Int1.new(elem_size)
280
+ work_group = [elem_size]
281
+ event_wait_list = build_event_wait_list(inputs)
282
+ ops = inputs.each_with_index.map do |input, index|
283
+ cl_index = OpenCL::Int1.new(index)
284
+ _cl_program('unpack', data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).unpack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
285
+ end
286
+ output_buffer.op = ops
287
+ Array.new(new_shape[0]) do |index|
288
+ _create_result_sub_buffer(output_buffer, index, tensor.data_type, sub_shape, "#{tensor.name}/out_#{index}")
289
+ end
290
+ end
291
+
292
+ TensorStream::Evaluator::OutputGroup.new(outputs, outputs.map(&:data_type))
293
+ end
294
+
295
+ register_op :index, noop: true do |context, tensor, inputs|
296
+ a = _run(inputs[0], context)
297
+ index = read_final_result(_run(inputs[1], context))
298
+
299
+ if a.is_a?(TensorStream::Evaluator::OutputGroup)
300
+ a.outputs[index]
301
+ elsif a.is_a?(Array)
302
+ a[index]
303
+ else
304
+ new_shape = a.shape.dup
305
+ new_shape.shift
306
+ input_a = read_final_result(a)
307
+ convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
308
+ end
309
+ end
310
+
311
+ register_op :shape do |_context, tensor, inputs|
312
+ wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.data_type)
313
+ end
314
+
315
+ register_op :shape_n do |_context, tensor, inputs|
316
+ shapes = inputs.collect do |input|
317
+ wrap_opencl(input.shape, name: tensor.name, data_type: tensor.data_type)
318
+ end
319
+ TensorStream::Evaluator::OutputGroup.new(shapes, shapes.map { tensor.data_type })
320
+ end
321
+
322
+ register_op :reshape do |context, tensor, inputs|
323
+ arr = inputs[0]
324
+ new_shape = read_final_result(complete_eval(inputs[1], context))
325
+
326
+ shape = if new_shape.size.zero? && arr.buffer.size == 1
327
+ new_shape
328
+ else
329
+ TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
330
+ end
331
+
332
+ OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type,
333
+ shape: shape, buffer: arr.buffer,
334
+ cl_buffer: arr.cl_buffer,
335
+ op: arr.op)
336
+ end
337
+
338
+ register_op :transpose, buffer: true do |_context, tensor, inputs|
339
+ t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
340
+
341
+ if inputs[0].shape.size == 2 && inputs[1].nil?
342
+ transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
343
+ res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
344
+ res
345
+ else
346
+ rank = inputs[0].shape.size
347
+ perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
348
+ new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
349
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
350
+ transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
351
+
352
+ write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
353
+ output_buffer.op = write_op
354
+ output_buffer
355
+ end
356
+ end
357
+
358
+ register_op :slice, noop: true do |context, tensor, inputs|
359
+ input_a = complete_eval(inputs[0], context)
360
+ input_b = read_final_result(complete_eval(inputs[1], context))
361
+ size = tensor.options[:size]
362
+
363
+ shape = input_a.shape
364
+
365
+ slice_param = input_b.zip(size).collect.with_index { | p, index| p[1] = (p[1] == -1) ? shape[index] : p[1] ; p[0]..p[0] + p[1] - 1 }.reverse
366
+
367
+ new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
368
+ sliced = new_buf.slice[*slice_param]
369
+ convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
370
+ end
371
+
372
+ register_op :rank do |_context, tensor, inputs|
373
+ wrap_opencl(inputs[0].shape.size, data_type: tensor.data_type, name: tensor.name)
374
+ end
375
+
376
+ register_op :cast do |_context, tensor, inputs|
377
+ a = inputs[0]
378
+ if a.data_type != tensor.data_type
379
+ buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
380
+ m, n = a.shape
381
+ cl_m = OpenCL::Int1.new(m || 1)
382
+ cl_n = OpenCL::Int1.new(n || 1)
383
+ work_group = [m || 1, n || 1]
384
+ event_wait_list = build_event_wait_list(inputs)
385
+ buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
386
+ buffer
387
+ else
388
+ a
389
+ end
390
+ end
391
+ end
392
+ end
393
+ end
394
+ end
395
+ end