tensor_stream-opencl 0.2.2 → 0.2.3

@@ -0,0 +1,7 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void relu6_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+     // Get the index of the current element to be processed
+     const int id = get_global_id(0);
+
+     C[id] = min((<%= c_dtype %>)max((<%= c_dtype %>) A[id], (<%= c_dtype %>)0), (<%= c_dtype %>)6);
+ }
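
This elementwise clamp (min(max(x, 0), 6)) is generated once per data type. A minimal sketch of rendering such an ERB kernel template outside the gem, with a hypothetical dtype_to_c_type mapping (the gem's kernel loader supplies its own):

```ruby
require 'erb'

# Hypothetical dtype-to-C-type mapping, for illustration only.
def dtype_to_c_type(dtype)
  { float32: 'float', float64: 'double', int32: 'int' }.fetch(dtype)
end

template = <<~KERNEL
  % c_dtype = dtype_to_c_type(dtype)
  __kernel void relu6_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
      const int id = get_global_id(0);
      C[id] = min((<%= c_dtype %>)max((<%= c_dtype %>) A[id], (<%= c_dtype %>)0), (<%= c_dtype %>)6);
  }
KERNEL

dtype = :float32
# trim_mode '%' enables the leading-% Ruby lines used by these templates.
puts ERB.new(template, trim_mode: '%').result(binding)
```

Rendered for :float32 this emits a plain OpenCL C kernel named relu6_float32.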
@@ -1,8 +1,7 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ __kernel void round_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
      // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+     const int id = get_global_id(0); // index of the current element
 
-     C[globalRow * N + globalCol] = round(A[globalRow * N + globalCol]);
+     C[id] = round(A[id]);
  }
@@ -0,0 +1,26 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void sum_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+     // Get the index of the current element to be processed
+     const int id = get_global_id(0);
+     int offset = (id + <%= index %>) * <%= w %>;
+     <%= c_dtype %> sum = 0;
+     <% if n > 4 %>
+     for(int i = 0; i < <%= n/4 %> ; i++) {
+         <% sums = 4.times.map do |i|
+              "A[offset + #{i}]"
+            end %>
+         sum += <%= sums.join(' + ') %>;
+         offset += 4;
+     }
+     <% if n%4!=0 %>
+     <% (n % 4).times do |i| %>
+     sum += A[offset + <%= i %>];
+     <% end %>
+     <% end %>
+     <% else %>
+     <% n.times do |i| %>
+     sum += A[offset + <%= i %>];
+     <% end %>
+     <% end %>
+     C[id] = sum;
+ }
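
Each work item of this kernel reduces one contiguous slice of the input: thread id starts at (id + index) * w and accumulates n elements, with the loop unrolled four at a time. A plain-Ruby model of the generated per-thread logic (names invented here):

```ruby
# Illustrative model of what one sum_<dtype> work item computes.
def thread_sum(a, id, index:, n:, w:)
  offset = (id + index) * w
  sum = 0
  if n > 4
    (n / 4).times do
      # unrolled four-element accumulation
      sum += a[offset] + a[offset + 1] + a[offset + 2] + a[offset + 3]
      offset += 4
    end
    (n % 4).times { |i| sum += a[offset + i] }
  else
    n.times { |i| sum += a[offset + i] }
  end
  sum
end

a = (1..12).to_a
# Three threads, four items each: partial sums 10, 26, 42,
# which a second pass can reduce to a single value.
p (0...3).map { |id| thread_sum(a, id, index: 0, n: 4, w: 4) }
```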
@@ -5,8 +5,8 @@ module TensorStream
    def MathOps.included(klass)
      klass.class_eval do
        %i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
-         register_op op, noop: true do |context, tensor, inputs|
-           execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
+         register_op op do |context, tensor, inputs|
+           execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1])
          end
        end
 
@@ -38,11 +38,11 @@ module TensorStream
          end
        end
 
-       register_op :floor_div, noop: true do |context, tensor, inputs|
+       register_op :floor_div do |context, tensor, inputs|
          if fp_type?(tensor.data_type)
-           execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
+           execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1])
          else
-           execute_2_operand_func('div', tensor, inputs[0], inputs[1], context)
+           execute_2_operand_func('div', tensor, inputs[0], inputs[1])
          end
        end
 
@@ -78,11 +78,8 @@ module TensorStream
        cl_n = OpenCL::Int1.new(n)
        cl_k = OpenCL::Int1.new(k)
 
-       transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
-       transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
        event_wait_list = build_event_wait_list([a, b])
-
-       output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+       output_buffer.op = _cl_program('gemm', ta: !!tensor.options[:transpose_a], tb: !!tensor.options[:transpose_b], dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
 
        output_buffer
      end
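
The transpose flags move from runtime kernel arguments into template parameters (ta:/tb:), so each flag combination is compiled once as its own specialized gemm program and the kernel body no longer branches on them. A sketch of the idea with an invented cache and a stubbed build step (the gem's _cl_program does this with real OpenCL program objects in its kernel cache):

```ruby
PROGRAM_CACHE = {}

def gemm_program(ta:, tb:, dtype:)
  key = "gemm.#{ta}.#{tb}.#{dtype}"
  PROGRAM_CACHE[key] ||= begin
    # Render the ERB template with ta/tb fixed and compile it; the generated
    # kernel needs no runtime transpose checks.
    "compiled gemm variant (ta=#{ta}, tb=#{tb}, dtype=#{dtype})"
  end
end

p gemm_program(ta: false, tb: true, dtype: :float32)
p PROGRAM_CACHE.size # => 1, the compiled variant is reused on later calls
```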
@@ -94,39 +91,99 @@ module TensorStream
        end
 
        %i[sum mean].each do |op|
-         register_op op, noop: true do |context, tensor, inputs|
+         register_op op do |context, tensor, inputs|
            reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
          end
        end
 
-       register_op :prod, noop: true do |context, tensor, inputs|
-         input_a = complete_eval(inputs[0], context)
-
-         if input_a.buffer.empty?
+       register_op :prod do |context, tensor, inputs|
+         if inputs[0].shape == [0]
            convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
          else
            reduction(context, tensor, inputs[0], inputs[1], :prod)
          end
        end
 
-       register_op :argmin, buffer: true do |_context, tensor, inputs|
-         axis = tensor.options[:axis] || 0
-         rank = inputs[0].shape.size
-         raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+       # register_op :argmin, buffer: true do |_context, tensor, inputs|
+       #   axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
+       #   rank = inputs[0].shape.size
+       #   raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+       #   arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+       #   op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
+       #   convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+       # end
+
+       # register_op :argmax, buffer: true do |_context, tensor, inputs|
+       #   axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
+       #   rank = inputs[0].shape.size
+       #   raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+       #   arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+       #   op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
+       #   convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+       # end
+
+       def reduction(child_context, tensor, value, axis, func)
+         if axis.nil?
+           value = _run(value, child_context)
+           size = value.shape.reduce(:*) || 1
+           if value.shape.empty? # for scalars, just return as is
+             value
+           else
+             reduction_threads = 32
+             items_per_thread_threshold = 4
+
+             output_buffer = _create_result_buffer(value.data_type, [], tensor.name)
+             event_wait_list = build_event_wait_list([value])
+
+             if (size > reduction_threads) && ((size / reduction_threads) > items_per_thread_threshold)
+               items_per_thread = size / reduction_threads
+               extra_items = size % reduction_threads
+               intermediate_output_buffer = _create_result_buffer(value.data_type, [reduction_threads], tensor.name)
+
+               temp_values = if extra_items.zero?
+                 _cl_program(func, dtype: value.data_type, index: 0, n: items_per_thread, w: items_per_thread).
+                   send(:"#{func}_#{value.data_type}", _opencl_queue, [reduction_threads], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list)
+               else
+                 [_cl_program(func, dtype: value.data_type, index: 0, n: items_per_thread, w: items_per_thread).
+                    send(:"#{func}_#{value.data_type}", _opencl_queue, [reduction_threads - 1], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list),
+                  _cl_program(func, dtype: value.data_type, index: reduction_threads - 1, n: items_per_thread + extra_items, w: items_per_thread).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list)]
+               end
+               output_buffer.op = _cl_program(func, dtype: value.data_type, n: reduction_threads, index: 0, w: 0).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, output_buffer.cl_buffer, event_wait_list: temp_values)
+               output_buffer
+             else
+               output_buffer.op = _cl_program(func, dtype: value.data_type, n: size, index: 0, w: 0).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+               output_buffer
+             end
+           end
+         else
+           return value if value.shape.empty?
+
+           axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+           input = complete_eval(value, child_context)
+           value = value.buffer.reshape(*value.shape.reverse)
+           rank = input.shape.size - 1
+
+           if axis.is_a?(Array)
+             axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
+               value = value.send(func, x.to_i)
+             end
+           else
+             value = value.send(func, rank - axis.abs)
+           end
 
-         arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
-         op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
-         convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
-       end
+           new_shape = if value.is_a?(NArray)
+             value.shape.reverse
+           else
+             value = [value]
+             []
+           end
 
-       register_op :argmax, buffer: true do |_context, tensor, inputs|
-         axis = tensor.options[:axis] || 0
-         rank = inputs[0].shape.size
-         raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+           new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
 
-         arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
-         op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
-         convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+           convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
+         end
        end
      end
    end
  end
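
The new reduction path for axis.nil? splits large inputs across a fixed pool of 32 threads, each summing a contiguous slice, then reduces the 32 partial results in a final single-thread pass. The partitioning arithmetic, modeled in plain Ruby:

```ruby
# Mirrors the constants above: 32 reduction threads, and the two-stage path
# only kicks in once each thread would get more than 4 items.
def reduction_plan(size, threads: 32, threshold: 4)
  return { two_stage: false, n: size } unless size > threads && (size / threads) > threshold

  items = size / threads
  extra = size % threads
  { two_stage: true, items_per_thread: items,
    # the last thread also absorbs the remainder
    last_thread_items: items + extra }
end

p reduction_plan(100)
# => {:two_stage=>false, :n=>100}   (100 / 32 = 3 items <= threshold)
p reduction_plan(10_000)
# => {:two_stage=>true, :items_per_thread=>312, :last_thread_items=>328}
```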
@@ -121,11 +121,11 @@ module TensorStream
        end
 
        register_op :apply_adagrad do |context, tensor, inputs|
-         target_var, accum, lr, grad = inputs
+         _target_var, _accum, lr, grad = inputs
 
          assign = tensor.inputs[0] || tensor
          assign_acc = tensor.inputs[1]
-
+
          assign.buffer.dirty = true
          assign_acc.buffer.dirty = true
          output_buffer = assign.buffer
@@ -133,7 +133,7 @@ module TensorStream
          work_group = [output_buffer.total_elements]
 
          event_wait_list = build_event_wait_list(inputs)
-         event = call_program('apply_adagrad',
+         event = call_program('apply_adagrad',
                               output_buffer.data_type,
                               work_group,
                               lr.cl_buffer,
@@ -195,7 +195,7 @@ module TensorStream
          event_wait_list = build_event_wait_list(inputs)
          work_group = [output_buffer.total_elements]
 
-         event = call_program('apply_rms_prop', output_buffer.data_type,
+         event = call_program('apply_rms_prop', output_buffer.data_type,
                               work_group,
                               lr.cl_buffer,
                               rho.cl_buffer,
@@ -298,7 +298,7 @@ module TensorStream
          end
 
          b = wrap_opencl(labels, data_type: inputs[0].data_type, name: "#{tensor.name}_label")
-
+
          event_wait_list = build_event_wait_list(inputs)
          dtype = tensor.data_type
          output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
@@ -335,6 +335,90 @@ module TensorStream
          output_buffer.op = event
          output_buffer
        end
+
+       %i[relu6].each do |op|
+         register_op op, noop: true do |context, tensor, inputs|
+           execute_func(op.to_s, tensor, inputs[0], context)
+         end
+       end
+
+       # Fast per pixel parallel convolution operation
+       register_op :conv2d do |_context, tensor, inputs|
+         filter = inputs[1]
+         batch, height, width, channel = inputs[0].shape
+         filter_shape = filter.shape
+         strides = tensor.options[:strides]
+         height_stride = strides[1]
+         width_stride = strides[2]
+
+         raise TensorStream::ValueError, "Current implementation does not yet support strides in the batch and depth dimensions." if strides[0] != 1 || strides[3] != 1
+
+         event_wait_list = build_event_wait_list(inputs)
+
+         f_height, f_width, in_channels, out_channels = filter_shape
+         out_shape = [batch, height / height_stride, width / width_stride, out_channels]
+         output_buffer = _create_result_buffer(tensor.data_type, out_shape, tensor.name)
+
+         cl_image_height = OpenCL::Int1.new(height)
+         cl_image_width = OpenCL::Int1.new(width)
+
+         work_dimen = [batch, height / height_stride, width / width_stride]
+
+         output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride]).send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width, inputs[0].cl_buffer,
+           inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+         output_buffer
+       end
+
+       register_op :conv2d_backprop_input do |context, tensor, inputs|
+         image_shape, filter, grad = inputs
+         filter_shape = filter.shape
+
+         strides = tensor.options[:strides]
+         height_stride = strides[1]
+         width_stride = strides[2]
+
+         image_shape = read_final_result(complete_eval(image_shape, context))
+
+         event_wait_list = build_event_wait_list(inputs)
+         output_buffer = _create_result_buffer(tensor.data_type, image_shape, tensor.name)
+
+         batch, height, width, channels = image_shape
+         f_height, f_width, in_channels, out_channels = filter_shape
+
+         work_dimen = [batch, height, width]
+
+         cl_image_height = OpenCL::Int1.new(height)
+         cl_image_width = OpenCL::Int1.new(width)
+
+         output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride]).send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
+           filter.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+         output_buffer
+       end
+
+       register_op :conv2d_backprop_filter do |context, tensor, inputs|
+         images, filter_shape, grad = inputs
+
+         event_wait_list = build_event_wait_list(inputs)
+
+         strides = tensor.options[:strides]
+         height_stride = strides[1]
+         width_stride = strides[2]
+
+         filter_shape = read_final_result(complete_eval(filter_shape, context))
+         output_buffer = _create_result_buffer(tensor.data_type, filter_shape, tensor.name)
+
+         batch_size, height, width, channels = images.shape
+         f_height, f_width, input_channels, output_channels = filter_shape
+         work_dimen = [f_height, f_width, output_channels]
+
+         cl_batch_size = OpenCL::Int1.new(batch_size)
+         cl_image_height = OpenCL::Int1.new(height)
+         cl_image_width = OpenCL::Int1.new(width)
+
+         output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride]).send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
+           images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+         output_buffer
+       end
      end
    end
  end
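
conv2d launches one work item per output pixel (work_dimen = [batch, out_h, out_w]). A hypothetical helper showing the output-shape arithmetic used above for NHWC inputs:

```ruby
def conv2d_out_shape(input_shape, filter_shape, strides)
  batch, height, width, _channels = input_shape
  _fh, _fw, _in_ch, out_ch = filter_shape
  # Mirrors the guard in the op: only spatial strides are supported.
  raise 'strides in batch/depth dimensions are not supported' if strides[0] != 1 || strides[3] != 1

  [batch, height / strides[1], width / strides[2], out_ch]
end

p conv2d_out_shape([8, 28, 28, 3], [5, 5, 3, 16], [1, 2, 2, 1])
# => [8, 14, 14, 16]
```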
@@ -37,9 +37,13 @@ module TensorStream
      return buffer[0] != 0 if data_type == :boolean
      return buffer[0]
    end
-
-   result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
+
+   result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
    data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
  end
+
+ def self.nil_buffer(owner, name, data_type)
+   OpenCLBuffer.new(owner, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
+ end
  end
  end
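
nil_buffer centralizes the empty placeholder that several code paths below return when a result shape comes out as [0]. A plain-Ruby model of what it constructs:

```ruby
# Stand-in struct for OpenCLBuffer, illustration only.
Buffer = Struct.new(:owner, :name, :data_type, :shape, :buffer, :cl_buffer, keyword_init: true)

def nil_buffer(owner, name, data_type)
  # Nothing is allocated host- or device-side for an empty result.
  Buffer.new(owner: owner, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
end

b = nil_buffer(nil, 'out_add', :float32)
p b.shape     # => [0]
p b.cl_buffer # => nil
```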
@@ -35,7 +35,8 @@ module TensorStream
    end
 
    ##
-   # PURE ruby evaluator used for testing and development
+   # OpenCL hardware accelerated evaluator
+   #
    class OpenclEvaluator < BaseEvaluator
      attr_accessor :retain
      attr_reader :opencl_device, :opencl_context
@@ -53,41 +54,57 @@ module TensorStream
        super
        _create_opencl_context
        @opencl_device = device.native_device
+
+       @max_work_item_dimensions = @opencl_device.max_work_item_dimensions
+       @max_work_item_sizes = @opencl_device.max_work_item_sizes
+       @max_work_group_size = @opencl_device.max_work_group_size
+
+       @local_mem_size = @opencl_device.local_mem_size
+       @device_type = @opencl_device.type.to_s.downcase
+
        create_command_queue
      end
 
-     def self.query_supported_devices
-       devices = query_devices_with_score
-       devices.sort { |a, b| a[1] <=> b[1] }.map do |d|
-         opencl_to_device(d)
+     class << self
+       def query_supported_devices
+         devices = query_devices_with_score
+         devices.sort_by { |a| a[1] }.map do |d|
+           opencl_to_device(d)
+         end
        end
-     end
 
-     def self.fetch_device(query = [])
-       devices = query_devices_with_score
-       platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
-       opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
-     end
+       def fetch_device(query = [])
+         devices = query_devices_with_score
+         platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
+         opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
+       end
 
-     def self.opencl_to_device(dev)
-       device = dev[0]
-       index = dev[3]
-       platform_name = device.platform.name.tr(' ', '_').downcase
-       uri = [platform_name, index].join(':')
+       def opencl_to_device(dev)
+         device = dev[0]
+         index = dev[3]
+         platform_name = device.platform.name.tr(' ', '_').downcase
+         uri = [platform_name, index].join(':')
 
-       device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
+         device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
 
-       OpenclDevice.new(uri, device_type, self).tap do |d|
-         d.native_device = device
+         OpenclDevice.new(uri, device_type, self).tap do |d|
+           d.native_device = device
+         end
        end
-     end
 
-     ##
-     # Select the best device available in the system for this evaluator
-     def self.default_device
-       devices = OpenclEvaluator.query_devices_with_score
-       device = devices.max { |a, b| a[1] <=> b[1] }
-       opencl_to_device(device)
+       ##
+       # Select the best device available in the system for this evaluator
+       def default_device
+         devices = OpenclEvaluator.query_devices_with_score
+         device = devices.max { |a, b| a[1] <=> b[1] }
+         opencl_to_device(device)
+       end
+
+       def getset_global_opencl_context(platform)
+         @global_opencl_context ||= {}
+         @global_opencl_context[platform] ||= yield
+         @global_opencl_context[platform]
+       end
      end
 
      # opencl evaluator main entrypoint
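
The new class-level getset_global_opencl_context memoizes one OpenCL context per platform; _create_opencl_context in the next hunk routes all context creation through it. A stripped-down model of the memoization:

```ruby
# Minimal model: the block runs at most once per platform; later evaluators
# receive the cached context.
class ContextRegistry
  def getset_global_opencl_context(platform)
    @global_opencl_context ||= {}
    @global_opencl_context[platform] ||= yield
    @global_opencl_context[platform]
  end
end

registry = ContextRegistry.new
first  = registry.getset_global_opencl_context('NVIDIA CUDA') { Object.new }
second = registry.getset_global_opencl_context('NVIDIA CUDA') { Object.new }
p first.equal?(second) # => true, only the first block ever ran
```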
@@ -228,16 +245,22 @@ module TensorStream
 
      def _create_opencl_context(device = nil)
        if device.nil?
-         @@global_opencl_context ||= begin
-           all_devices = OpenclEvaluator.query_supported_devices.map(&:native_device)
-           puts "global context created for #{all_devices}"
-           OpenCL.create_context(all_devices)
+         all_devices_by_platform = {}
+         TensorStream::Evaluator::OpenclEvaluator.query_supported_devices.map(&:native_device).each do |d|
+           all_devices_by_platform[d.platform.name] ||= []
+           all_devices_by_platform[d.platform.name] << d
          end
 
-         @opencl_context = @@global_opencl_context
+         all_devices_by_platform.each do |platform, devices|
+           @opencl_context = TensorStream::Evaluator::OpenclEvaluator.getset_global_opencl_context(platform) do
+             OpenCL.create_context(devices)
+           end
+         end
        else
          puts "context created for #{device.native_device}"
-         @opencl_context = OpenCL.create_context(device.native_device)
+         @opencl_context = TensorStream::Evaluator::OpenclEvaluator.getset_global_opencl_context(device.native_device.platform) do
+           OpenCL.create_context(device.native_device)
+         end
        end
      end
 
@@ -269,11 +292,12 @@ module TensorStream
        @context[:_cache][kernel_cache_key] ||=
          begin
            # puts "building #{kernel_cache_key}"
-           file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
+           file_path = File.join(ENV['TS_OPENCL_FILE_CACHE_PATH'] || '/tmp', "#{kernel}.#{suffix}.cl")
            source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
                       File.read(file_path)
                     else
-                      filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+                      filenames = ['', ".#{@device_type}"].map { |type| %w[cl.erb cl].map { |ext| cl_template_path("#{kernel}#{type}", ext) } }.flatten
+                      filename = filenames.find { |n| File.exist?(n) }
                       raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
 
                       source = File.read(filename)
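
Two behaviors change here: the kernel file cache directory becomes configurable through TS_OPENCL_FILE_CACHE_PATH, and the template search now also considers device-type-specific variants. A sketch of the candidate ordering, with cl_template_path simplified to a bare file name:

```ruby
# Simplified stand-in for cl_template_path: just builds file names.
def candidate_templates(kernel, device_type)
  ['', ".#{device_type}"].map do |type|
    %w[cl.erb cl].map { |ext| "#{kernel}#{type}.#{ext}" }
  end.flatten
end

p candidate_templates('softmax', 'gpu')
# => ["softmax.cl.erb", "softmax.cl", "softmax.gpu.cl.erb", "softmax.gpu.cl"]
# The first existing file wins, so generic templates still take precedence;
# a .gpu/.cpu variant only fills in when no generic template exists.
```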
@@ -322,6 +346,7 @@ module TensorStream
 
      def eval_variable(tensor, _child_context)
        raise "variable #{tensor.name} not initalized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
+
        tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
        tensor.buffer
      end
@@ -339,7 +364,7 @@ module TensorStream
        end
      end
 
-     register_op :identity do |context, tensor, inputs|
+     register_op :identity do |_context, tensor, inputs|
        value = inputs[0]
        buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
        buffer.op = build_event_wait_list(inputs)
@@ -351,25 +376,26 @@ module TensorStream
      end
 
      register_op :assign_add do |context, tensor, inputs|
-       value = execute_2_operand_func('add', tensor, inputs[0], inputs[1], context)
+       value = execute_2_operand_func('add', tensor, inputs[0], inputs[1])
        assign_var(tensor, value, context)
      end
 
      register_op :assign_sub do |context, tensor, inputs|
-       value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1], context)
+       value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1])
        assign_var(tensor, value, context)
      end
 
-     register_op :variable, noop: true do |context, tensor, inputs|
+     register_op :variable, noop: true do |_context, tensor, _inputs|
        variable = tensor.inputs[0]
        raise "variable #{tensor.name} not initalized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
+
        variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
        variable.buffer
      end
 
      %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
-       register_op op, noop: true do |context, tensor, inputs|
-         execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
+       register_op op do |context, tensor, inputs|
+         execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], 'cond')
        end
      end
 
@@ -463,11 +489,11 @@ module TensorStream
      rescue EvaluatorExcecutionException => e
        _opencl_queue.finish # dump queue
        puts e.message
-       raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
+       raise e, "error #{e.message} while evaluating #{tensor.name} : defined at #{tensor.source}"
      rescue TensorStreamError => e
        _opencl_queue.finish # dump queue
        puts e.message
-       raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
+       raise e, "error #{e.message} while evaluating #{tensor.name} : defined at #{tensor.source}"
      rescue StandardError => e
        _opencl_queue.finish # dump queue
        puts e.message
@@ -496,6 +522,7 @@ module TensorStream
      cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
      return @context[cache_key] if @context.key?(cache_key)
      return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]
+
      @context[cache_key] = if tensor.value.is_a?(Tensor)
                              _run(tensor.value, child_context)
                            else
@@ -512,7 +539,6 @@ module TensorStream
      buffer = complete_eval(b, child_context)
 
      if assign.buffer
-       # buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
        event_wait_list = build_event_wait_list([buffer, assign.buffer])
        assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
                             _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: event_wait_list)
@@ -524,23 +550,32 @@ module TensorStream
        assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
        assign.value = value
      end
+
      assign.buffer.dirty = true
      assign.buffer
    end
 
-   def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
-     a = _run(input_a, child_context)
-     b = _run(input_b, child_context)
+   def execute_2_operand_func(op_name, tensor, a, b, prog_name = nil)
      a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
      dtype = tensor.data_type
      result_shape = TensorShape.infer_shape(a.shape, b.shape)
-     return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]
+     return OpenCLBuffer.nil_buffer(self, "out_#{tensor.name}", dtype) if result_shape == [0]
+
      output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
      a, b, prog, switch_operands = select_program(a, b, op_name)
      m, n = result_shape
-     work_group = [m || 1, n || 1]
-     cl_m = OpenCL::Int1.new(m || 1)
-     cl_n = OpenCL::Int1.new(n || 1)
+
+     work_group = if result_shape.size > 2 && (b.shape.size.zero? || (a.shape == b.shape))
+                    [m, result_shape.reduce(:*) / m]
+                  elsif result_shape.size <= 2
+                    [m || 1, n || 1]
+                  else
+                    raise "rank > 2 not supported for now"
+                  end
+
+     cl_m = OpenCL::Int1.new(work_group[0])
+     cl_n = OpenCL::Int1.new(work_group[1])
+
      cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition
 
      event_wait_list = build_event_wait_list([a, b]) # add dependency wait list
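
The work-group sizing now handles rank > 2 results for the elementwise cases (scalar second operand, or identical shapes) by folding every trailing dimension into the second work dimension. A standalone model with invented names:

```ruby
# rank <= 2 shapes keep the [m, n] 2D layout; higher-rank elementwise cases
# flatten the trailing dimensions into the second work dimension.
def work_group_for(result_shape, same_or_scalar:)
  m, n = result_shape
  if result_shape.size > 2 && same_or_scalar
    [m, result_shape.reduce(:*) / m]
  elsif result_shape.size <= 2
    [m || 1, n || 1]
  else
    raise 'rank > 2 not supported for now'
  end
end

p work_group_for([], same_or_scalar: true)        # => [1, 1]   scalar
p work_group_for([4, 5], same_or_scalar: true)    # => [4, 5]
p work_group_for([2, 3, 4], same_or_scalar: true) # => [2, 12]  3*4 folded
```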
@@ -610,6 +645,7 @@ module TensorStream
 
    def auto_type_cast(a, b, name: nil)
      return [a, b] if a.data_type == b.data_type
+
      m, n = b.shape
      work_group = [m || 1, n || 1]
      event_wait_list = build_event_wait_list([b])
@@ -624,6 +660,7 @@ module TensorStream
    def type_cast(source, data_type, name: nil)
      return source if source.data_type == data_type
+
      m, n = source.shape
      work_group = [m || 1, n || 1]
      event_wait_list = [source.op].compact
629
666
  event_wait_list = [source.op].compact
@@ -673,8 +710,6 @@ module TensorStream
673
710
 
674
711
  return nil if buffer.nil?
675
712
 
676
-
677
-
678
713
  cl_buffer = unless value.flatten.empty?
679
714
  cl_buffer_size = 1 if cl_buffer_size.zero?
680
715
  _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
@@ -682,6 +717,7 @@ module TensorStream
682
717
 
683
718
  @context[:_cache][cache_key] = OpenCLBuffer.new(self, name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
684
719
  end
720
+
685
721
  if data_type == :string
686
722
  value[0].each_byte.with_index do |c, index|
687
723
  cl_object.buffer[index] = c
@@ -704,11 +740,11 @@ module TensorStream
704
740
  cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
705
741
  end
706
742
 
707
- if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
708
- write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
743
+ # if OpenCL buffer is valid enqueue a write
744
+ if cl_object.cl_buffer && value && (!value.is_a?(Array) || !value.empty?)
745
+ cl_object.op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
709
746
  end
710
747
 
711
- cl_object.op = write_op
712
748
  cl_object
713
749
  end
714
750
 
@@ -718,7 +754,7 @@ module TensorStream
718
754
  NArray.sfloat(narray_size)
719
755
  when :float64
720
756
  NArray.float(narray_size)
721
- when :int, :int32, :int64, :uint64, :uint32 #NArray does not have 64 bit int types
757
+ when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
722
758
  NArray.int(narray_size)
723
759
  when :int16, :uint16
724
760
  NArray.sint(narray_size)
@@ -736,7 +772,8 @@ module TensorStream
736
772
  end
737
773
 
738
774
  def _create_result_buffer(data_type, shape, name)
739
- return OpenCLBuffer.new(self, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
775
+ return OpenCLBuffer.nil_buffer(self, name, data_type) if shape == [0]
776
+
740
777
  cache_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
741
778
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
742
779
  # puts "create result buffer #{cache_key}"
@@ -759,7 +796,7 @@ module TensorStream
759
796
  region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
760
797
  cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
761
798
  OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
762
- else
799
+ else # source buffer already a sub buffer, OpenCL does not allow sub buffers from sub buffers
763
800
  _create_result_buffer(tensor.data_type, shape, name)
764
801
  end
765
802
  end
@@ -768,7 +805,7 @@ module TensorStream
768
805
 
769
806
  if buffer.cl_buffer.associated_memobject
770
807
  buffer.op = parent_buffer.op
771
- else
808
+ else # source buffer alreay a sub buffer, so we need to do a copy instead
772
809
  region_size_in_bytes = buffer.buffer.size * buffer.buffer.element_size
773
810
  start = index * region_size_in_bytes
774
811
  region = [region_size_in_bytes, 1, 1]
@@ -841,6 +878,7 @@ module TensorStream
841
878
 
842
879
  def _reduced_shape(input_shape, axes)
843
880
  return [] if axes.nil? # reduce to scalar
881
+
844
882
  axes = [axes] unless axes.is_a?(Array)
845
883
  return input_shape if axes.empty?
846
884
 
@@ -850,39 +888,6 @@ module TensorStream
850
888
  input_shape
851
889
  end
852
890
 
853
- def reduction(child_context, tensor, a, b, func)
854
- input = complete_eval(a, child_context)
855
- axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b
856
- if axis.nil?
857
- red = input.buffer.send(func)
858
- convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
859
- else
860
- return input if input.shape.empty?
861
-
862
- value = input.buffer.reshape(*input.shape.reverse)
863
- rank = input.shape.size - 1
864
-
865
- if axis.is_a?(Array)
866
- axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
867
- value = value.send(func, x.to_i)
868
- end
869
- else
870
- value = value.send(func, rank - axis.abs)
871
- end
872
-
873
- new_shape = if value.is_a?(NArray)
874
- value.shape.reverse
875
- else
876
- value = [value]
877
- []
878
- end
879
-
880
- new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
881
-
882
- convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
883
- end
884
- end
885
-
886
891
  # selects variants of cl programs depending on input
887
892
  def select_program(input_a, input_b, op)
888
893
  return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape