tensor_stream-opencl 0.2.10 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: d590302525812d813924ea639202fa41db60a6b4d46b2f4eafaf01f77910a530
- data.tar.gz: 6413c7d9e5376844fd2da090e6f8e84d23cd0b5e47f2be2a0eef5f82652d6f78
+ metadata.gz: d2b7fa8bb0dcaa041e5ec390418ffac2bb40d777086e9ebe771f9f42d79ccabc
+ data.tar.gz: e21e0cf638e177da495e9748087808c51aee15ac10212c856ee789ae8c32d458
  SHA512:
- metadata.gz: af8845e919363d7d1cb06bf6899a9644e1cc7907908bc9c43a2efb0995c7696e6215445ee60fb802250f6032740cc7169f48404e9141c226bf2a3b0b1caf018d
- data.tar.gz: eb56a21e66f2624f19bc8d19e374dfb967f219806eac538009588db16cf2f2ed89b98f2f6fa35135e28a543db0e6b389d5047da3ce0bf6a26518ab8c1e5cd75b
+ metadata.gz: 31a1cc671716dee18d6841e39f3bf43c5326a423be59e118e5acf35777aaed9b29acdfdd3425d6ce48f5f5c5f8e5692fae687a592828be0a458f337498965008
+ data.tar.gz: ecef0d771882d662996aa31cfa4eaf9e8cef720c2b05eb48f872ce9c3fbe9e07dfc99427332a6bcbf5fb8054f7f00d27fc55184462a559be834635aaad358c30
benchmark/benchmark.rb CHANGED
@@ -43,6 +43,8 @@ a_int = tf.constant([
  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
  ])
 
+ large_tensor_bias = tf.constant(sess.run(tf.random_uniform([256])))
+
  b = tf.constant(sess.run(tf.random_uniform(SHAPES)))
 
  c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
@@ -67,11 +69,15 @@ split = tf.split(a, 4)
  sum = tf.reduce_sum(large_tensor)
  sum_axis_1 = tf.reduce_sum(large_tensor, 1)
  min = tf.min(large_tensor, 1)
+ argmin = tf.argmin(large_tensor)
  index = large_tensor[0]
 
  conv2d = tf.nn.conv2d(sample_image, sample_filter, [1, 1, 1, 1], 'SAME')
  conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
 
+ bias_add = tf.nn.bias_add(large_tensor, large_tensor_bias)
+ bias_add_grad = tf.gradients(bias_add, [large_tensor_bias])
+
  puts TensorStream::Evaluator.default_evaluators
 
  sess2 = tf.session
@@ -80,6 +86,12 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
  device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
  puts "OpenCL device #{device.platform.to_s} #{device.name}"
  Benchmark.bmbm do |x|
+ x.report("pure ruby argmin :") { 100.times do sess.run(argmin) end }
+ x.report("opencl argmin :") { 100.times do sess2.run(argmin) end }
+ x.report("pure ruby bias_add_grad :") { 100.times do sess.run(bias_add_grad) end }
+ x.report("opencl bias_add_grad :") { 100.times do sess2.run(bias_add_grad) end }
+ x.report("pure ruby bias_add :") { 100.times do sess.run(bias_add) end }
+ x.report("opencl bias_add :") { 100.times do sess2.run(bias_add) end }
  x.report("pure ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
  x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
  x.report("pure ruby conv2d :") { 100.times do sess.run(conv2d) end }
benchmark_ryzen_nvidia.txt ADDED
@@ -0,0 +1,80 @@
+ TensorStream::Evaluator::OpenclEvaluator
+ TensorStream::Evaluator::RubyEvaluator
+ model name : AMD Ryzen 3 1300X Quad-Core Processor
+ OpenCL device NVIDIA CUDA GeForce GTX 1060 6GB
+ Rehearsal ------------------------------------------------------------------------
+ pure ruby argmin : 0.708414 0.007882 0.716296 ( 0.717201)
+ opencl argmin : 0.204186 0.222389 0.426575 ( 0.447862)
+ pure ruby bias_add_grad : 2.048097 0.005187 2.053284 ( 2.057617)
+ opencl bias_add_grad : 0.012482 0.000426 0.012908 ( 0.013225)
+ pure ruby bias_add : 2.406516 0.000087 2.406603 ( 2.406307)
+ opencl bias_add : 0.136466 0.008025 0.144491 ( 0.134989)
+ pure ruby conv2d_backprop : 3.685220 0.000155 3.685375 ( 3.685049)
+ opencl conv2d_backprop : 0.028940 0.008031 0.036971 ( 0.029904)
+ pure ruby conv2d : 0.788991 0.000041 0.789032 ( 0.788881)
+ opencl conv2d : 0.020150 0.000138 0.020288 ( 0.016917)
+ pure ruby arr index : 0.003036 0.000000 0.003036 ( 0.003044)
+ opencl arr index : 0.009626 0.000023 0.009649 ( 0.006703)
+ pure ruby min : 3.767836 0.007871 3.775707 ( 3.773523)
+ opencl min : 0.141541 0.008039 0.149580 ( 0.139246)
+ pure ruby sum : 3.219801 0.000076 3.219877 ( 3.218388)
+ opencl sum : 0.007480 0.004074 0.011554 ( 0.008261)
+ pure ruby sum axis 1 : 3.203423 0.000000 3.203423 ( 3.201832)
+ opencl sum axis 1 : 0.011710 0.000033 0.011743 ( 0.008379)
+ pure ruby split : 0.016504 0.000008 0.016512 ( 0.016529)
+ opencl split : 0.041059 0.012026 0.053085 ( 0.043289)
+ pure ruby add_n : 0.141810 0.000000 0.141810 ( 0.141721)
+ opencl add_n : 0.013751 0.000000 0.013751 ( 0.012208)
+ pure ruby ooo matmul : 1.395286 0.000000 1.395286 ( 1.394697)
+ opencl ooo matmul : 0.013448 0.000000 0.013448 ( 0.009873)
+ pure ruby softmax : 0.025362 0.000018 0.025380 ( 0.025382)
+ opencl softmax : 0.014999 0.000051 0.015050 ( 0.011977)
+ pure ruby matmul : 0.666863 0.000000 0.666863 ( 0.666499)
+ opencl matmul : 0.008572 0.003920 0.012492 ( 0.009246)
+ pure ruby : 2.429792 0.000005 2.429797 ( 2.428788)
+ opencl : 0.140862 0.004014 0.144876 ( 0.137264)
+ pure ruby single function: 0.340247 0.000000 0.340247 ( 0.340184)
+ opencl singlefunction: 0.084871 0.007956 0.092827 ( 0.087899)
+ pure ruby pow float: 0.083372 0.000000 0.083372 ( 0.083339)
+ opencl pow float: 0.013498 0.000014 0.013512 ( 0.010353)
+ pure ruby pow int: 0.018739 0.000000 0.018739 ( 0.018753)
+ opencl pow int: 0.007737 0.004041 0.011778 ( 0.008817)
+ -------------------------------------------------------------- total: 26.165217sec
+
+ user system total real
+ pure ruby argmin : 0.677097 0.000009 0.677106 ( 0.676828)
+ opencl argmin : 0.005919 0.003950 0.009869 ( 0.006618)
+ pure ruby bias_add_grad : 2.027326 0.000000 2.027326 ( 2.026399)
+ opencl bias_add_grad : 0.011544 0.000050 0.011594 ( 0.008380)
+ pure ruby bias_add : 2.378283 0.000000 2.378283 ( 2.377411)
+ opencl bias_add : 0.130993 0.011994 0.142987 ( 0.132772)
+ pure ruby conv2d_backprop : 3.738167 0.000000 3.738167 ( 3.737946)
+ opencl conv2d_backprop : 0.031267 0.003958 0.035225 ( 0.030381)
+ pure ruby conv2d : 0.794182 0.000000 0.794182 ( 0.794100)
+ opencl conv2d : 0.015865 0.004020 0.019885 ( 0.016878)
+ pure ruby arr index : 0.003112 0.000000 0.003112 ( 0.003109)
+ opencl arr index : 0.012100 0.000000 0.012100 ( 0.009728)
+ pure ruby min : 3.669509 0.003944 3.673453 ( 3.671906)
+ opencl min : 0.137071 0.004055 0.141126 ( 0.131802)
+ pure ruby sum : 3.210619 0.000000 3.210619 ( 3.210064)
+ opencl sum : 0.002431 0.008030 0.010461 ( 0.007522)
+ pure ruby sum axis 1 : 3.208789 0.000000 3.208789 ( 3.208125)
+ opencl sum axis 1 : 0.006075 0.003963 0.010038 ( 0.007679)
+ pure ruby split : 0.013985 0.000000 0.013985 ( 0.013990)
+ opencl split : 0.029464 0.011999 0.041463 ( 0.030797)
+ pure ruby add_n : 0.140984 0.000003 0.140987 ( 0.140959)
+ opencl add_n : 0.003146 0.007934 0.011080 ( 0.007778)
+ pure ruby ooo matmul : 1.416585 0.000000 1.416585 ( 1.416290)
+ opencl ooo matmul : 0.011156 0.000000 0.011156 ( 0.008723)
+ pure ruby softmax : 0.024724 0.000000 0.024724 ( 0.024731)
+ opencl softmax : 0.006237 0.003945 0.010182 ( 0.009005)
+ pure ruby matmul : 0.679538 0.000000 0.679538 ( 0.680048)
+ opencl matmul : 0.003456 0.007965 0.011421 ( 0.008568)
+ pure ruby : 2.437790 0.004031 2.441821 ( 2.443126)
+ opencl : 0.133039 0.003996 0.137035 ( 0.130579)
+ pure ruby single function: 0.332269 0.004003 0.336272 ( 0.336201)
+ opencl singlefunction: 0.078607 0.004009 0.082616 ( 0.078640)
+ pure ruby pow float: 0.081409 0.000000 0.081409 ( 0.081364)
+ opencl pow float: 0.011501 0.000000 0.011501 ( 0.008471)
+ pure ruby pow int: 0.016687 0.000000 0.016687 ( 0.016711)
+ opencl pow int: 0.007061 0.003950 0.011011 ( 0.007819)
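Note on reading these numbers: Benchmark.bmbm runs every entry twice, once under "Rehearsal" (a warm-up pass, so allocation and GC noise lands there) and once in the measured pass below it; the columns are user CPU time, system CPU time, their total, and wall-clock time in parentheses. A schematic of the harness shape (sess2 and sum stand for the variables in benchmark.rb above):

    require 'benchmark'

    Benchmark.bmbm do |x|
      # each report body runs twice: rehearsal first, then the measured pass
      x.report("opencl sum :") { 100.times { sess2.run(sum) } }
    end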
lib/tensor_stream/opencl/kernels/arg_axis.cl ADDED
@@ -0,0 +1,42 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % out_c_dtype = dtype_to_c_type(out_dtype)
+ % o_multipliers = o_shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % i_multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % out_ops = o_multipliers.map.with_index { |m, index| "id_#{index} * #{m}"}.join(' + ')
+ % axis = axis[0]
+ % in_axis_multipliers = i_multipliers.select.with_index { |m, index| axis == index }
+ % in_axis_ops = in_axis_multipliers.map.with_index { |m| "i * #{m}"}.join(' + ')
+ % in_output_multipliers = i_multipliers.reject.with_index { |m, index| axis == index }
+ % in_output_ops = in_output_multipliers.map.with_index { |m, index| "id_#{index} * #{m}"}.join(' + ')
+ __kernel void arg_axis_<%= dtype %>(__global const <%= c_dtype %> *value, __global <%= out_c_dtype %> *output) {
+ // Get the index of the current element to be processed
+ <% o_multipliers.size.times.each_with_index do |s, index| %>
+ const int id_<%= index %> = get_global_id(<%= index %>);
+ <% end %>
+
+ <%= c_dtype %> min_or_max_value = <%= f == :argmax ? min_value_for(dtype) : max_value_for(dtype) %>;
+ int min_or_max_index = 0;
+
+ for (int i = 0; i < <%= shape[axis] %>; i++) {
+
+ int index = <%= in_axis_ops %>;
+
+ <% unless in_output_ops.empty? %>
+ index += <%= in_output_ops %>;
+ <% end %>
+ <%= case(f)
+ when :argmax
+ "if (value[index] > min_or_max_value) {"
+ when :argmin
+ "if (value[index] < min_or_max_value) {"
+ else
+ raise "unknown reduction func #{f}"
+ end
+ %>
+ min_or_max_index = i;
+ min_or_max_value = value[index];
+ }
+ }
+
+ output[<%= out_ops %>] = (<%= out_c_dtype %>)min_or_max_index;
+ }
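Note: the o_multipliers/i_multipliers lines compute row-major strides so that get_global_id values can be folded into a flat buffer index. The same inject in plain Ruby, with a hypothetical shape:

    shape = [2, 3, 4]
    strides = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
    # => [12, 4, 1]  (element [i, j, k] lives at i*12 + j*4 + k*1)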
lib/tensor_stream/opencl/kernels/argmax.cl CHANGED
@@ -1,8 +1,14 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void argmax_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ % out_c_dtype = dtype_to_c_type(out_dtype)
+ __kernel void argmax_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+ <%= out_c_dtype %> max_index = 0;
+
+ for(int i = 0; i < <%= n %>; i++) {
+ if (A[i] > max) {
+ max = A[i];
+ max_index = i;
+ }
+ }
+ C[0] = max_index;
  }
lib/tensor_stream/opencl/kernels/argmin.cl CHANGED
@@ -1,8 +1,14 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void argmin_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ % out_c_dtype = dtype_to_c_type(out_dtype)
+ __kernel void argmin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ <%= c_dtype %> min = <%= max_value_for(dtype) %>;
+ <%= out_c_dtype %> min_index = 0;
+
+ for(int i = 0; i < <%= n %>; i++) {
+ if (A[i] < min) {
+ min = A[i];
+ min_index = i;
+ }
+ }
+ C[0] = min_index;
  }
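Note: both rewritten kernels (the old bodies were placeholder add kernels) expand to a single-work-item linear scan over the flattened input. A Ruby rendering of the same logic, illustrative only; Float::MAX stands in for max_value_for(dtype):

    def argmin_scan(values)
      min_index = 0
      min = Float::MAX
      values.each_with_index do |v, i|
        if v < min
          min = v
          min_index = i
        end
      end
      min_index
    end

    argmin_scan([3.0, -1.0, 2.0])   # => 1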
lib/tensor_stream/opencl/kernels/bias_add.cl ADDED
@@ -0,0 +1,9 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void bias_add_<%= dtype %>(__global const <%= c_dtype %> *value, __constant const <%= c_dtype %> *bias, __global <%= c_dtype %> *output) {
+ const int id = get_global_id(0);
+
+ for(int i = 0; i < <%= n %>; i++) {
+ output[ <%= n %> * id + i] = value[ <%= n %> * id + i] + bias[i];
+ }
+ }
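Note: one work item is launched per row, and each adds the length-n bias vector to its n-element slice. In Ruby terms (row_count and n are illustrative):

    # value is a row_count x n tensor flattened row-major; bias has length n
    row_count.times do |id|              # id plays the role of get_global_id(0)
      n.times do |i|
        output[n * id + i] = value[n * id + i] + bias[i]
      end
    end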
lib/tensor_stream/opencl/kernels/bias_add_grad.cl ADDED
@@ -0,0 +1,10 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void bias_add_grad_<%= dtype %>(__global const <%= c_dtype %> *received_grad, __global <%= c_dtype %> *output) {
+ const int id = get_global_id(0);
+ <%= c_dtype %> sum = 0;
+ for(int i = 0; i < <%= rows %>; i++) {
+ sum += received_grad[<%= n %> * i + id];
+ }
+ output[id] = sum;
+ }
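Note: the gradient kernel is the transpose of the forward pass: one work item per bias element, summing that element's column across all rows of the incoming gradient. A Ruby sketch (rows and n illustrative):

    # received_grad is rows x n flattened row-major; output has length n
    n.times do |id|                      # one work item per bias element
      output[id] = (0...rows).sum { |i| received_grad[n * i + id] }
    end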
lib/tensor_stream/opencl/kernels/reduce_axis.cl ADDED
@@ -0,0 +1,42 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % o_multipliers = o_shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % i_multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % out_ops = o_multipliers.map.with_index { |m, index| "id_#{index} * #{m}" }.join(' + ')
+ % in_axis_multipliers = i_multipliers.select.with_index { |m, index| axis.include?(index) }
+ % in_axis_ops = in_axis_multipliers.map.with_index { |m, index| "i_#{index} * #{m}"}.join(' + ')
+ % in_output_multipliers = i_multipliers.reject.with_index { |m, index| axis.include?(index) }
+ % in_output_ops = in_output_multipliers.map.with_index { |m, index| "id_#{index} * #{m}"}.join(' + ')
+ __kernel void reduce_axis_<%= dtype %>(__global const <%= c_dtype %> *value, __global <%= c_dtype %> *output) {
+ // Get the index of the current element to be processed
+ <% o_multipliers.size.times.each_with_index do |s, index| %>
+ const int id_<%= index %> = get_global_id(<%= index %>);
+ <% end %>
+
+ <%= c_dtype %> sum = <%= f == :prod ? 1 : 0 %>;
+ <%= c_dtype %> item_size = 0;
+ <% axis.each_with_index do |axis, index| %>
+ for (int i_<%= index %> = 0; i_<%= index %> < <%= shape[axis] %>; i_<%= index %>++) {
+ <% end %>
+ int index = <%= in_axis_ops %>;
+ item_size += 1;
+ <% unless in_output_ops.empty? %>
+ index += <%= in_output_ops %>;
+ <% end %>
+ <%= case(f)
+ when :sum, :mean
+ "sum += value[index];"
+ when :prod
+ "sum *= value[index];"
+ else
+ raise "unknown reduction func #{f}"
+ end
+ %>
+ <% axis.each do |axis| %>
+ }
+ <% end %>
+ <% if f == :mean %>
+ output[<%= out_ops %>] = sum / item_size;
+ <% else %>
+ output[<%= out_ops %>] = sum;
+ <% end %>
+ }
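Note: for a concrete feel of what this template generates, here is a Ruby rendering of the :mean case over a hypothetical shape [4, 3] reduced along axis [1] (so i_multipliers == [3, 1], in_axis_ops == "i_0 * 1", in_output_ops == "id_0 * 3", and the output shape is [4]):

    4.times do |id_0|                     # one work item per output element
      sum = 0.0
      item_size = 0
      3.times do |i_0|
        sum += value[i_0 * 1 + id_0 * 3]  # in_axis_ops + in_output_ops
        item_size += 1
      end
      output[id_0 * 1] = sum / item_size  # :mean divides by the item count
    end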
@@ -90,6 +90,36 @@ module TensorStream
  output_buffer
  end
 
+ register_op :bias_add do |context, tensor, inputs|
+ value, bias = inputs
+ output_buffer = _create_result_buffer(value.data_type, value.shape, tensor.name)
+ result_shape = value.shape.dup
+ bias_length = result_shape.pop
+ work_group = [result_shape.reduce(:*)]
+ event_wait_list = build_event_wait_list([value, bias])
+ dtype = tensor.data_type
+ output_buffer.op = _cl_program('bias_add', n: bias_length, dtype: dtype)
+ .send(:"bias_add_#{dtype}", _opencl_queue, work_group, value.cl_buffer,
+ bias.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer
+ end
+
+ register_op :bias_add_grad do |context, tensor, inputs|
+ received_grad = inputs[0]
+ bias_size = received_grad.shape.last
+ output_buffer = _create_result_buffer(received_grad.data_type, [bias_size], tensor.name)
+ work_group = [bias_size]
+
+ received_grad_shape = received_grad.shape.dup
+ received_grad_shape.pop
+ item_rows = received_grad_shape.reduce(:*)
+ dtype = tensor.data_type
+ output_buffer.op = _cl_program('bias_add_grad', n: bias_size, rows: item_rows, dtype: dtype)
+ .send(:"bias_add_grad_#{dtype}", _opencl_queue, work_group, received_grad.cl_buffer,
+ output_buffer.cl_buffer, event_wait_list: build_event_wait_list([received_grad]))
+ output_buffer
+ end
+
  %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil log].each do |op|
  register_op op, noop: true do |context, tensor, inputs|
  execute_func(op.to_s, tensor, inputs[0], context)
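Note: a quick check of the shape bookkeeping in :bias_add (shapes hypothetical): the last axis is the bias axis, and every axis before it is flattened into the work group:

    result_shape = [32, 8, 12]                # value.shape.dup
    bias_length  = result_shape.pop           # => 12; bias must have this length
    work_group   = [result_shape.reduce(:*)]  # => [256], one work item per row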
@@ -110,25 +140,18 @@ module TensorStream
  end
  end
 
- # register_op :argmin, buffer: true do |_context, tensor, inputs|
- # axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
- # rank = inputs[0].shape.size
- # raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
-
- # arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
- # op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
- # convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
- # end
+ %i[argmin argmax].each do |op|
+ register_op op do |context, tensor, inputs|
+ value, axis = inputs
+ rank = value.shape.size
+ axis = 0 if axis.nil?
 
- # register_op :argmax, buffer: true do |_context, tensor, inputs|
- # axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
- # rank = inputs[0].shape.size
- # raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+ axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+ raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
 
- # arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
- # op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
- # convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
- # end
+ reduce_multi_axis(context, tensor, value, axis, 'arg', op.to_sym)
+ end
+ end
 
  def reduction(child_context, tensor, value, axis, func)
  if axis.nil?
@@ -164,33 +187,34 @@ module TensorStream
  end
  end
  else
- return value if value.shape.empty?
+ reduce_multi_axis(child_context, tensor, value, axis, 'reduce', func)
+ end
+ end
 
- axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
- input = complete_eval(value, child_context)
+ def reduce_multi_axis(child_context, tensor, value, axis, prog, func)
+ return value if value.shape.empty?
 
- value = value.buffer.reshape(*value.shape.reverse)
- rank = input.shape.size - 1
+ rank = value.shape.size
 
- if axis.is_a?(Array)
- axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
- value = value.send(func, x.to_i)
- end
- else
- value = value.send(func, rank - axis.abs)
- end
+ axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+ axis = [axis] unless axis.is_a?(Array)
+ return value if axis.empty?
+ # remap negative values
+ axis.map! { |axis| axis < 0 ? rank - axis.abs : axis }
 
- new_shape = if value.is_a?(NArray)
- value.shape.reverse
- else
- value = [value]
- []
- end
+ new_shape = value.shape.collect.with_index { |v, index| axis.include?(index) ? nil : v }.compact
 
- new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
+ buffer_shape = tensor.options[:keepdims] ? _reduced_shape(value.shape.dup, axis) : new_shape
+ output_buffer = _create_result_buffer(tensor.options[:output_type] || tensor.data_type, buffer_shape, tensor.name)
 
- convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
- end
+ work_group = new_shape.empty? ? [1] : new_shape
+ dtype = value.data_type
+
+ output_buffer.op = _cl_program("#{prog}_axis", f: func, axis: axis, shape: value.shape, o_shape: new_shape, dtype: dtype, out_dtype: tensor.options[:output_type])
+ .send("#{prog}_axis_#{dtype}", _opencl_queue, work_group, value.cl_buffer,
+ output_buffer.cl_buffer, event_wait_list: build_event_wait_list([value]))
+
+ output_buffer
  end
  end
  end
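Note: the negative-axis remap in reduce_multi_axis mirrors Ruby/NumPy-style indexing, and the kept dimensions become the output shape. A small check (values hypothetical):

    rank = 3
    axis = [-1, 0]
    axis.map! { |a| a < 0 ? rank - a.abs : a }   # => [2, 0]

    shape = [4, 3, 2]
    shape.collect.with_index { |v, i| axis.include?(i) ? nil : v }.compact  # => [3]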
lib/tensor_stream/opencl/opencl_buffer.rb CHANGED
@@ -23,6 +23,10 @@ module TensorStream
  @shape == [0]
  end
 
+ def inspect
+ "CLBuffer(shape: #{shape || "?"} data_type: #{data_type}, cl_allocated: #{cl_buffer ? cl_buffer.size : 'unallocated'}) -> raw: #{buffer.to_a}"
+ end
+
  def to_ruby
  return [] if buffer.empty?
 
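Note: with the new inspect, calling p on a buffer prints something along these lines (values illustrative; cl_allocated shows the size reported by the CL buffer, in bytes):

    CLBuffer(shape: [2, 2] data_type: float32, cl_allocated: 16) -> raw: [1.0, 2.0, 3.0, 4.0]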
lib/tensor_stream/opencl/opencl_template_helper.rb CHANGED
@@ -76,6 +76,27 @@ class OpenclTemplateHelper
  end
  end
 
+ def max_value_for(dtype)
+ case dtype.to_s
+ when 'float64'
+ 'DBL_MAX'
+ when 'float32', 'float', 'float16'
+ 'FLT_MAX'
+ when 'int32', 'int'
+ 'INT_MAX'
+ when 'uint32', 'uint16'
+ 'UINT_MAX'
+ when 'int16'
+ 'SHRT_MAX'
+ when 'int8'
+ '127'
+ when 'boolean'
+ '1'
+ else
+ raise "unknown dtype #{dtype}"
+ end
+ end
+
  def operator_to_c(op)
  case op
  when 'less'
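Note: these values are emitted as C literals/macros into the kernels above; argmin seeds its running minimum with max_value_for(dtype), argmax its running maximum with min_value_for(dtype). For example (helper stands for an OpenclTemplateHelper instance):

    helper.max_value_for('float32')  # => 'FLT_MAX', rendered as: float min = FLT_MAX;
    helper.max_value_for('int32')    # => 'INT_MAX'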
lib/tensor_stream/opencl/version.rb CHANGED
@@ -1,5 +1,5 @@
  module TensorStream
  module Opencl
- VERSION = "0.2.10"
+ VERSION = "0.3.0"
  end
  end
tensor_stream-opencl.gemspec CHANGED
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "awesome_print"
  spec.add_development_dependency "mnist-learn"
  spec.add_development_dependency "simplecov"
- spec.add_dependency "tensor_stream", "1.0.5"
+ spec.add_dependency "tensor_stream", "1.0.6"
  spec.add_dependency "opencl_ruby_ffi"
  spec.add_dependency "oily_png"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tensor_stream-opencl
  version: !ruby/object:Gem::Version
- version: 0.2.10
+ version: 0.3.0
  platform: ruby
  authors:
  - Joseph Dayo
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-03-20 00:00:00.000000000 Z
+ date: 2019-03-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -114,14 +114,14 @@ dependencies:
  requirements:
  - - '='
  - !ruby/object:Gem::Version
- version: 1.0.5
+ version: 1.0.6
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - '='
  - !ruby/object:Gem::Version
- version: 1.0.5
+ version: 1.0.6
  - !ruby/object:Gem::Dependency
  name: opencl_ruby_ffi
  requirement: !ruby/object:Gem::Requirement
@@ -169,6 +169,7 @@ files:
  - benchmark/benchmark.rb
  - benchmark_intel.txt
  - benchmark_ryzen.txt
+ - benchmark_ryzen_nvidia.txt
  - bin/console
  - bin/setup
  - lib/tensor_stream/opencl.rb
@@ -186,9 +187,12 @@ files:
  - lib/tensor_stream/opencl/kernels/apply_gradient.cl
  - lib/tensor_stream/opencl/kernels/apply_momentum.cl
  - lib/tensor_stream/opencl/kernels/apply_rms_prop.cl
+ - lib/tensor_stream/opencl/kernels/arg_axis.cl
  - lib/tensor_stream/opencl/kernels/argmax.cl
  - lib/tensor_stream/opencl/kernels/argmin.cl
  - lib/tensor_stream/opencl/kernels/asin.cl
+ - lib/tensor_stream/opencl/kernels/bias_add.cl
+ - lib/tensor_stream/opencl/kernels/bias_add_grad.cl
  - lib/tensor_stream/opencl/kernels/cast.cl
  - lib/tensor_stream/opencl/kernels/ceil.cl
  - lib/tensor_stream/opencl/kernels/concat.cl
@@ -217,6 +221,7 @@ files:
  - lib/tensor_stream/opencl/kernels/prod.cl
  - lib/tensor_stream/opencl/kernels/real_div.cl
  - lib/tensor_stream/opencl/kernels/reciprocal.cl
+ - lib/tensor_stream/opencl/kernels/reduce_axis.cl
  - lib/tensor_stream/opencl/kernels/relu6.cl
  - lib/tensor_stream/opencl/kernels/round.cl
  - lib/tensor_stream/opencl/kernels/sigmoid.cl