RubyGems - tensor_stream-opencl - Versions diffs - 0.2.2 → 0.2.3 - Mend

tensor_stream-opencl 0.2.2 → 0.2.3

Files changed (30) hide show

checksums.yaml +4 -4
data/.gitignore +1 -0
data/benchmark/benchmark.rb +23 -1
data/benchmark_ryzen.txt +56 -0
data/lib/tensor_stream/opencl/array_ops.rb +3 -3
data/lib/tensor_stream/opencl/images_ops.rb +30 -0
data/lib/tensor_stream/opencl/kernels/conv2d.cl +27 -0
data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl +26 -0
data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl +32 -0
data/lib/tensor_stream/opencl/kernels/gemm.cl +2 -10
data/lib/tensor_stream/opencl/kernels/max.cl +5 -13
data/lib/tensor_stream/opencl/kernels/mean.cl +26 -0
data/lib/tensor_stream/opencl/kernels/min.cl +3 -11
data/lib/tensor_stream/opencl/kernels/prod.cl +26 -0
data/lib/tensor_stream/opencl/kernels/relu6.cl +7 -0
data/lib/tensor_stream/opencl/kernels/round.cl +3 -4
data/lib/tensor_stream/opencl/kernels/sum.cl +26 -0
data/lib/tensor_stream/opencl/math_ops.rb +86 -29
data/lib/tensor_stream/opencl/nn_ops.rb +89 -5
data/lib/tensor_stream/opencl/opencl_buffer.rb +6 -2
data/lib/tensor_stream/opencl/opencl_evaluator.rb +97 -92
data/lib/tensor_stream/opencl/version.rb +1 -1
data/samples/iris.rb +2 -2
data/samples/logistic_regression.rb +84 -0
data/samples/mnist_data_2.1.rb +9 -4
data/samples/mnist_data_2.2.rb +12 -7
data/samples/mnist_data_2.3.rb +111 -0
data/samples/rnn.rb +1 -1
data/tensor_stream-opencl.gemspec +2 -1
metadata +28 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2f7c2e06a5711e3efc8503de82f4c836af70c3b0dfd6ce0f4790f0bb6d3abcb9
-  data.tar.gz: c103f23ba5d27f3a6356ed28b10966b8333f9fb3fabc203924ce357c4c0523c8
+  metadata.gz: 7888f5385dcec4b9a747128d1b25d0cb6d7fb01ab12f1c1b70706d0df5770903
+  data.tar.gz: 1129196d93a7b194d9de888fe49b43b0839c00589fb0987c0841ad1ac1693a4c
 SHA512:
-  metadata.gz: 637ede65bf27b9ce06a755e344e58567c4d1e83e4831115e872d6f2ca0ff778f49f4d4e60af643a920fcf3a1b9033078b0c81f6e6e0f62f2e31f8f9ac4fee89b
-  data.tar.gz: af8482a75b98db484c074c2862d455709ed5563e596819ad445a0c502467c0b5189eef04abbf55060dcbc0640289ce839715b19b3332aa9c21217345726ac3f3
+  metadata.gz: 727d97b9aa1402ed9681eb71fe0c0cdcc966e8a9e2f08b7480fdefa503509e26872b187a0b184d31a5b0cee86dc96e1d97f4fce14b378fa4a7c8ce4679b06421
+  data.tar.gz: 0e2f1601543d582042d0911222c1209f2df2c08139b7acc8ee347b5d7e236cd451a1b00032fb75d6f532105fb8f4ee49e53614071d111e0fabc8ed44010c4301

data/.gitignore CHANGED Viewed

@@ -9,6 +9,7 @@
 Gemfile.lock
 *.gem
 *.ckpt
+profile.json
 # rspec failure tracking
 .rspec_status

data/benchmark/benchmark.rb CHANGED Viewed

@@ -26,7 +26,7 @@ tf.set_random_seed(seed)
 SHAPES = [32, 32]
 sess = tf.session(:ruby_evaluator)
+large_tensor = tf.constant(sess.run(tf.random_uniform([256, 256])))
 a = tf.constant(sess.run(tf.random_uniform(SHAPES)))
 a_int = tf.constant([
   [1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
@@ -49,6 +49,9 @@ c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
 d = tf.constant(sess.run(tf.random_uniform(SHAPES)))
+sample_image = tf.constant(sess.run(tf.random_uniform([10, 8, 8, 3])))
+sample_filter = tf.constant(sess.run(tf.random_uniform([2, 2, 3, 3])))
 p = tf.placeholder('float')
 q = tf.placeholder('float')
@@ -61,6 +64,13 @@ out_of_order = tf.matmul(a, b) + tf.matmul(c, d)
 softmax = tf.nn.softmax(a)
 add_n = tf.add_n([a,b,c,d])
 split = tf.split(a, 4)
+sum = tf.reduce_sum(large_tensor)
+sum_axis_1 = tf.reduce_sum(large_tensor, 1)
+min = tf.min(large_tensor, 1)
+index = large_tensor[0]
+conv2d = tf.nn.conv2d(sample_image, sample_filter, [1, 1, 1, 1], 'SAME')
+conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
 puts TensorStream::Evaluator.default_evaluators
@@ -70,6 +80,18 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
 device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
 puts "OpenCL device #{device.platform.to_s} #{device.name}"
 Benchmark.bmbm do |x|
+  x.report("pure ruby conv2d_backprop      :") { 100.times do sess.run(conv2d_grad) end }
+  x.report("opencl conv2d_backprop         :") { 100.times do sess2.run(conv2d_grad) end }
+  x.report("pure ruby conv2d      :") { 100.times do sess.run(conv2d) end }
+  x.report("opencl conv2d         :") { 100.times do sess2.run(conv2d) end }
+  x.report("pure ruby arr index      :") { 100.times do sess.run(index) end }
+  x.report("opencl arr index         :") { 100.times do sess2.run(index) end }
+  x.report("pure ruby min            :") { 100.times do sess.run(min) end }
+  x.report("opencl min               :") { 100.times do sess2.run(min) end }
+  x.report("pure ruby sum            :") { 100.times do sess.run(sum) end }
+  x.report("opencl sum               :") { 100.times do sess2.run(sum) end }
+  x.report("pure ruby sum axis 1     :") { 100.times do sess.run(sum_axis_1) end }
+  x.report("opencl sum axis 1        :") { 100.times do sess2.run(sum_axis_1) end }
   x.report("pure ruby split          :") { 100.times do sess.run(split) end }
   x.report("opencl split             :") { 100.times do sess2.run(split) end }
   x.report("pure ruby add_n          :") { 100.times do sess.run(add_n) end }

data/benchmark_ryzen.txt ADDED Viewed

@@ -0,0 +1,56 @@
+TensorStream::Evaluator::OpenclEvaluator
+TensorStream::Evaluator::RubyEvaluator
+model name	: AMD Ryzen 3 1300X Quad-Core Processor
+OpenCL device NVIDIA CUDA GeForce GTX 1060 6GB
+Rehearsal --------------------------------------------------------------
+pure ruby arr index      :   0.005448   0.003557   0.009005 (  0.008999)
+opencl arr index         :   0.074642   0.190132   0.264774 (  0.275557)
+pure ruby min            :   0.256004   0.000777   0.256781 (  0.256682)
+opencl min               :   0.017543   0.004523   0.022066 (  0.018797)
+pure ruby sum            :   0.313039   0.000565   0.313604 (  0.313535)
+opencl sum               :   0.009037   0.004249   0.013286 (  0.011073)
+pure ruby split          :   0.017223   0.000300   0.017523 (  0.017542)
+opencl split             :   0.033489   0.014394   0.047883 (  0.038798)
+pure ruby add_n          :   0.159864   0.000153   0.160017 (  0.159992)
+opencl add_n             :   0.018535   0.000563   0.019098 (  0.016168)
+pure ruby ooo matmul     :   1.390970   0.000304   1.391274 (  1.390790)
+opencl    ooo matmul     :   0.014119   0.000229   0.014348 (  0.011738)
+pure ruby softmax        :   0.024103   0.000014   0.024117 (  0.024135)
+opencl    softmax        :   0.010602   0.004277   0.014879 (  0.011941)
+pure ruby matmul         :   0.668126   0.000006   0.668132 (  0.667778)
+opencl    matmul         :   0.006672   0.007527   0.014199 (  0.011594)
+pure ruby                :   2.388817   0.000005   2.388822 (  2.387870)
+opencl                   :   0.152289   0.007804   0.160093 (  0.156279)
+pure ruby single function:   0.356575   0.000062   0.356637 (  0.356488)
+opencl     singlefunction:   0.120073   0.000210   0.120283 (  0.116378)
+pure ruby pow float:         0.088966   0.000051   0.089017 (  0.088996)
+opencl pow float:            0.018054   0.000100   0.018154 (  0.015429)
+pure ruby pow int:           0.025430   0.000070   0.025500 (  0.025524)
+opencl pow int:              0.015652   0.003880   0.019532 (  0.017386)
+----------------------------------------------------- total: 6.429024sec
+                                 user     system      total        real
+pure ruby arr index      :   0.003564   0.000110   0.003674 (  0.003636)
+opencl arr index         :   0.007966   0.003974   0.011940 (  0.009775)
+pure ruby min            :   0.246153   0.000102   0.246255 (  0.246172)
+opencl min               :   0.011787   0.007785   0.019572 (  0.016169)
+pure ruby sum            :   0.294371   0.000000   0.294371 (  0.294335)
+opencl sum               :   0.008266   0.003879   0.012145 (  0.009315)
+pure ruby split          :   0.014552   0.000000   0.014552 (  0.014539)
+opencl split             :   0.037984   0.004037   0.042021 (  0.033276)
+pure ruby add_n          :   0.146300   0.000053   0.146353 (  0.146319)
+opencl add_n             :   0.006426   0.007827   0.014253 (  0.011461)
+pure ruby ooo matmul     :   1.373232   0.000096   1.373328 (  1.372788)
+opencl    ooo matmul     :   0.013838   0.000000   0.013838 (  0.011088)
+pure ruby softmax        :   0.024478   0.000000   0.024478 (  0.024493)
+opencl    softmax        :   0.014117   0.000022   0.014139 (  0.011246)
+pure ruby matmul         :   0.653146   0.000054   0.653200 (  0.652889)
+opencl    matmul         :   0.002750   0.011934   0.014684 (  0.011729)
+pure ruby                :   2.392733   0.000058   2.392791 (  2.391726)
+opencl                   :   0.140118   0.016001   0.156119 (  0.151788)
+pure ruby single function:   0.352515   0.000000   0.352515 (  0.352443)
+opencl     singlefunction:   0.093955   0.011813   0.105768 (  0.102301)
+pure ruby pow float:         0.083659   0.000000   0.083659 (  0.083623)
+opencl pow float:            0.017433   0.000125   0.017558 (  0.014508)
+pure ruby pow int:           0.018381   0.000000   0.018381 (  0.018391)
+opencl pow int:              0.008186   0.003755   0.011941 (  0.009828)

data/lib/tensor_stream/opencl/array_ops.rb CHANGED Viewed

@@ -197,6 +197,7 @@ module TensorStream
             ops = if axis.zero? # fast path
               inputs.each_with_index.map do |input, index|
                 next if input.empty_value?
                 start = index * input.buffer.size * input.buffer.element_size
                 region = [input.buffer.size * input.buffer.element_size, 1, 1]
                 event_wait_list = build_event_wait_list(input)
@@ -339,7 +340,7 @@ module TensorStream
           register_op :index, noop: true do |context, tensor, inputs|
             a = _run(inputs[0], context)
-            index = read_final_result(_run(inputs[1], context))
+            index = inputs[1].value || read_final_result(_run(inputs[1], context))
             if a.is_a?(TensorStream::Evaluator::OutputGroup)
               a.outputs[index]
@@ -348,8 +349,7 @@ module TensorStream
             else
               new_shape = a.shape.dup
               new_shape.shift
-              input_a = read_final_result(a)
-              convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
+              _create_result_sub_buffer(a, index, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}")
             end
           end

data/lib/tensor_stream/opencl/images_ops.rb CHANGED Viewed

@@ -7,9 +7,23 @@ module TensorStream
           register_op :decode_png do |context, tensor, inputs|
             content = _run(inputs[0], context)
             channels = tensor.options[:channels]
+            resample_new_shape = tensor.options[:new_shape]
+            resample_method = tensor.options[:resample_method] || :bilinear
             channels = 4 if channels.zero?
             image = ChunkyPNG::Image.from_blob(content.buffer.to_a.pack('C*'))
+            if resample_new_shape
+              case resample_method
+              when :bilinear
+                image.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, # height
+              when :nearest_neighbor
+                image.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
+              else
+                raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
+              end
+            end
             output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
             image.grayscale! if channels == 1
@@ -38,6 +52,10 @@ module TensorStream
           register_op :encode_png do |_context, tensor, inputs|
             image_data = inputs[0]
+            resample_new_shape = tensor.options[:new_shape]
+            resample_method = tensor.options[:resample_method] || :bilinear
             height, width, channels = image_data.shape
             image_buffer = image_data.buffer.reshape(*image_data.shape.reverse).to_a
 \
@@ -53,6 +71,18 @@ module TensorStream
                 end
               end
             end
+            if resample_new_shape
+              case resample_method
+              when :bilinear
+                png.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, # height
+              when :nearest_neighbor
+                png.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
+              else
+                raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
+              end
+            end
             convert_to_opencl(png.to_s, [], data_type: :string, name: tensor.name)
           end
         end

data/lib/tensor_stream/opencl/kernels/conv2d.cl ADDED Viewed

@@ -0,0 +1,27 @@
+% ctype = dtype_to_c_type(dtype)
+__kernel void conv2d(const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
+    // Forward 2D convolution: each work-item takes one (batch, output row, output column) position from global ids 0..2 and loops over every output channel itself.
+    const int batch_index = get_global_id(0);
+    const int h_index = get_global_id(1);
+    const int w_index = get_global_id(2);
+    const int h_index_with_stride = h_index * <%= stride[0] %>; // first input row covered by the filter window
+    const int w_index_with_stride = w_index * <%= stride[1] %>; // first input column covered by the filter window
+    const int image_index = batch_index * height * width * <%= ch %>; // offset of this batch entry inside images
+    const int image_row_width = width * <%= ch %>; // elements per input row; channels are the innermost index
+    for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
+      <%= ctype %> sum = 0;
+      for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
+        for(int y = 0; y < <%= fh %>; y++) {
+          for (int x = 0; x < <%= fw %>; x++) {
+            if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width) { // taps past the bottom/right edge are skipped, i.e. contribute nothing
+              sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index]; // filter is indexed as [y][x][channel][out_channel]
+            }
+          }
+        }
+      }
+      output[batch_index * (height/<%= stride[0] %>) * (width/<%= stride[1] %>) * <%= out_ch %> + h_index * (width/<%= stride[1] %>) * <%= out_ch %> +  w_index * <%= out_ch %> + out_channel_index ] = sum; // output holds (height/stride) x (width/stride) positions per batch entry, out_channel innermost
+    }
+}

data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl ADDED Viewed

@@ -0,0 +1,26 @@
+% ctype = dtype_to_c_type(dtype)
+__kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+    // Gradient of the convolution with respect to the filter.
+    // Each work-item owns one (filter row, filter column, output channel) triple taken from
+    // global ids 0..2 and loops over the input channels itself.
+    //   height, width : spatial size of the input images
+    //   images        : indexed as [batch][y][x][channel]
+    //   grad          : gradient of the forward output, indexed as [batch][grad_y][grad_x][out_channel]
+    //   output        : filter gradient, indexed as [fh][fw][channel][out_channel]
+    const int fh_index = get_global_id(0);
+    const int fw_index = get_global_id(1);
+    const int f_out_channel = get_global_id(2);
+    const int image_size = height * width * <%= ch %>;
+    // grad has the geometry of the forward output, which conv2d.cl lays out with
+    // (height / stride) x (width / stride) positions per image. Using the input pitch here
+    // would read the wrong elements whenever a stride is larger than 1.
+    const int grad_height = height / <%= stride[0] %>;
+    const int grad_width = width / <%= stride[1] %>;
+    const int grad_image_size = grad_height * grad_width * <%= out_ch %>;
+    const int grad_image_row_width = grad_width * <%= out_ch %>;
+    for(int channel = 0; channel < <%= ch %>; channel++) {
+      <%= ctype %> grad_sum = 0.0;
+      for(int batch = 0; batch < batch_size; batch++) {
+        const int image_index = batch * grad_image_size;
+        for(int y = 0; y < height; y++) {
+          for (int x = 0; x < width; x++) {
+            // Only input pixels that sit exactly on a stride step away from this filter tap were multiplied by it in the forward pass.
+            if ( ((y - fh_index) % <%= stride[0]%>) == 0  && ((x - fw_index) % <%= stride[1]%>) == 0 && fh_index <= y && fw_index <= x) {
+              const int grad_y = (y - fh_index) / <%= stride[0] %>;
+              const int grad_x = (x - fw_index) / <%= stride[1] %>;
+              // When the image size is not a multiple of the stride the last step can fall outside grad; skip it.
+              if (grad_y < grad_height && grad_x < grad_width) {
+                const <%= ctype %> image_grad = grad[image_index + grad_y * grad_image_row_width + grad_x * <%= out_ch %> + f_out_channel];
+                grad_sum += images[batch * image_size + y * width * <%= ch %> + x * <%= ch %> + channel] * image_grad;
+              }
+            }
+          }
+        }
+      }
+      output[fh_index * <%= fw * ch * out_ch %> + fw_index * <%= ch * out_ch %> + channel * <%= out_ch %> + f_out_channel] = grad_sum;
+    }
+}

data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl ADDED Viewed

@@ -0,0 +1,32 @@
+% ctype = dtype_to_c_type(dtype)
+__kernel void conv2d_backprop_input(const int height, const int width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+    // Gradient of the convolution with respect to the input: each work-item takes one (batch, input row, input column) pixel from global ids 0..2 and loops over the input channels itself.
+    int batch_index = get_global_id(0);
+    int h_index = get_global_id(1); // orig image y
+    int w_index = get_global_id(2); // orig image x
+    int h_index_with_stride = h_index / <%= stride[0] %>; // NOTE(review): not referenced anywhere below in this kernel
+    int w_index_with_stride = w_index / <%= stride[1] %>; // NOTE(review): not referenced anywhere below in this kernel
+    int grad_height = height / <%= stride[0] %>; // grad is addressed with (height / stride) rows per batch entry
+    int grad_width = width / <%= stride[1] %>; // and (width / stride) columns per row
+    int image_index = batch_index * grad_height * grad_width * <%= out_ch %>; // offset of this batch entry inside grad
+    int image_row_width = grad_width * <%= out_ch %>; // elements per grad row; out_channel is the innermost index
+    for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
+      <%= ctype %> g = 0.0;
+      for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
+        for(int y = 0; y < <%= fh %>; y++) {
+          for (int x = 0; x < <%= fw %>; x++) {
+            if ( (y <= h_index) && (x <= w_index) && ( (h_index - y) % <%= stride[0]%> == 0) && ( (w_index - x) % <%= stride[1]%> == 0)) { // filter tap (y, x) contributes only if this pixel lies a whole number of stride steps from it
+              <%= ctype %> imag_grad = grad[image_index + ( (h_index - y) / <%= stride[0] %>) * image_row_width + ( (w_index - x) / <%= stride[1] %>) * <%= out_ch %> + out_channel_index]; // NOTE(review): no check that the row/column stays below grad_height/grad_width - looks reachable when height or width is not a multiple of the stride; confirm
+              g += imag_grad * filter[y * <%= fw * ch * out_ch %> + x * <%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index]; // filter is indexed as [y][x][channel][out_channel]
+            }
+          }
+        }
+      }
+      output[batch_index * height * width * <%= ch %> + h_index * width * <%= ch %> +  w_index * <%= ch %> + channel_index ] = g; // output has the input image geometry, channel innermost
+    }
+}

data/lib/tensor_stream/opencl/kernels/gemm.cl CHANGED Viewed

@@ -1,8 +1,6 @@
 // First naive implementation
 % c_dtype = dtype_to_c_type(dtype)
 __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
-                      const int A_transpose,
-                      const int B_transpose,
                       const __global <%= c_dtype %>* A,
                       const __global <%= c_dtype %>* B,
                       __global <%= c_dtype %>* C) {
@@ -16,14 +14,8 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
     for (int k=0; k<K; k++) {
         int a_index = globalRow*K + k;
         int b_index = k*N + globalCol;
-        if (A_transpose) {
-            a_index = M*k + globalRow;
-        }
-        if (B_transpose) {
-            b_index = globalCol*K + k;
-        }
+<% if ta %>a_index = M*k + globalRow;<% end %>
+<% if tb %>b_index = globalCol*K + k;<% end %>
         acc += A[a_index] * B[b_index];
     }

data/lib/tensor_stream/opencl/kernels/max.cl CHANGED Viewed

@@ -5,7 +5,7 @@
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+    C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[globalRow * N + globalCol]);
 }
 // Element-wise max of A against the scalar B[0]
@@ -13,12 +13,8 @@
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
-    } else {
-      C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
-    }
+    C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[0]);
 }
 // Element-wise max of A against B, with B's indices wrapped so that it broadcasts over A
@@ -26,7 +22,7 @@
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
     int b_m_index = globalRow;
     int b_n_index = globalCol;
@@ -38,9 +34,5 @@
       b_n_index = b_n_index % N2;
     }
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
-    } else {
-      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] :  A[globalRow * N + globalCol];
-    }
+    C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol],(<%= c_dtype %>) B[b_m_index * N2 + b_n_index]);
 }

data/lib/tensor_stream/opencl/kernels/mean.cl ADDED Viewed

@@ -0,0 +1,26 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void mean_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Work-item `id` averages <%= n %> consecutive elements of A and stores the result in C[id]; the additions are emitted by the template, unrolled in groups of four when n > 4.
+    const int id = get_global_id(0);
+    int offset = (id + <%= index %>) * <%= w %>; // first element of the run reduced by this work-item
+    <%= c_dtype %> sum = 0;
+    <% if n > 4 %>
+      for(int i = 0; i < <%= n/4 %> ; i++) {
+        <% sums = 4.times.map do |i|
+          "A[offset + #{i}]"
+        end %>
+        sum += <%= sums.join(' + ') %>;
+        offset += 4; // advance to the next group of four
+      }
+      <% if n%4!=0 %>
+        <% (n % 4).times do |i| %>
+          sum += A[offset + <%= i %>]; // leftover elements after the last full group of four
+        <% end %>
+      <% end %>
+    <% else %>
+      <% n.times do |i| %>
+        sum += A[offset + <%= i %>];
+      <% end %>
+    <% end %>
+    C[id] = sum / <%= n %>; // divisor is the element count baked in at template time
+}

data/lib/tensor_stream/opencl/kernels/min.cl CHANGED Viewed

@@ -5,7 +5,7 @@
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+    C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol],(<%= c_dtype %>) B[globalRow * N + globalCol]);
 }
 // Element-wise min of A against the scalar B[0]
@@ -14,11 +14,7 @@
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[0] ? A[globalRow * N + globalCol] : B[0];
-    } else {
-      C[globalRow * N + globalCol] = B[0] <= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
-    }
+    C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>) B[0]);
 }
 // Element-wise min of A against B, with B's indices wrapped so that it broadcasts over A
@@ -38,9 +34,5 @@
       b_n_index = b_n_index % N2;
     }
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
-    } else {
-      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] :  A[globalRow * N + globalCol];
-    }
+    C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[b_m_index * N2 + b_n_index]);
 }

data/lib/tensor_stream/opencl/kernels/prod.cl ADDED Viewed

@@ -0,0 +1,26 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void prod_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Work-item `id` multiplies <%= n %> consecutive elements of A together and stores the product in C[id]; the multiplications are emitted by the template, unrolled in groups of four when n > 4.
+    int id = get_global_id(0);
+    int offset = (id + <%= index %>) * <%= w %>; // first element of the run reduced by this work-item
+    <%= c_dtype %> prod = 1;
+    <% if n > 4 %>
+      for(int i = 0; i < <%= n/4 %> ; i++) {
+        <% sums = 4.times.map do |i|
+          "A[offset + #{i}]"
+        end %>
+        prod *= <%= sums.join(' * ') %>;
+        offset += 4; // advance to the next group of four
+      }
+      <% if n%4!=0 %>
+        <% (n % 4).times do |i| %>
+          prod *= A[offset + <%= i %>]; // leftover elements after the last full group of four
+        <% end %>
+      <% end %>
+    <% else %>
+      <% n.times do |i| %>
+        prod *= A[offset + <%= i %>];
+      <% end %>
+    <% end %>
+    C[id] = prod;
+}