RubyGems - tensor_stream-opencl - Versions diffs - 0.2.2 → 0.2.3 - Mend

tensor_stream-opencl 0.2.2 → 0.2.3

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (30)

checksums.yaml +4 -4
data/.gitignore +1 -0
data/benchmark/benchmark.rb +23 -1
data/benchmark_ryzen.txt +56 -0
data/lib/tensor_stream/opencl/array_ops.rb +3 -3
data/lib/tensor_stream/opencl/images_ops.rb +30 -0
data/lib/tensor_stream/opencl/kernels/conv2d.cl +27 -0
data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl +26 -0
data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl +32 -0
data/lib/tensor_stream/opencl/kernels/gemm.cl +2 -10
data/lib/tensor_stream/opencl/kernels/max.cl +5 -13
data/lib/tensor_stream/opencl/kernels/mean.cl +26 -0
data/lib/tensor_stream/opencl/kernels/min.cl +3 -11
data/lib/tensor_stream/opencl/kernels/prod.cl +26 -0
data/lib/tensor_stream/opencl/kernels/relu6.cl +7 -0
data/lib/tensor_stream/opencl/kernels/round.cl +3 -4
data/lib/tensor_stream/opencl/kernels/sum.cl +26 -0
data/lib/tensor_stream/opencl/math_ops.rb +86 -29
data/lib/tensor_stream/opencl/nn_ops.rb +89 -5
data/lib/tensor_stream/opencl/opencl_buffer.rb +6 -2
data/lib/tensor_stream/opencl/opencl_evaluator.rb +97 -92
data/lib/tensor_stream/opencl/version.rb +1 -1
data/samples/iris.rb +2 -2
data/samples/logistic_regression.rb +84 -0
data/samples/mnist_data_2.1.rb +9 -4
data/samples/mnist_data_2.2.rb +12 -7
data/samples/mnist_data_2.3.rb +111 -0
data/samples/rnn.rb +1 -1
data/tensor_stream-opencl.gemspec +2 -1
metadata +28 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2f7c2e06a5711e3efc8503de82f4c836af70c3b0dfd6ce0f4790f0bb6d3abcb9
-  data.tar.gz: c103f23ba5d27f3a6356ed28b10966b8333f9fb3fabc203924ce357c4c0523c8
+  metadata.gz: 7888f5385dcec4b9a747128d1b25d0cb6d7fb01ab12f1c1b70706d0df5770903
+  data.tar.gz: 1129196d93a7b194d9de888fe49b43b0839c00589fb0987c0841ad1ac1693a4c
 SHA512:
-  metadata.gz: 637ede65bf27b9ce06a755e344e58567c4d1e83e4831115e872d6f2ca0ff778f49f4d4e60af643a920fcf3a1b9033078b0c81f6e6e0f62f2e31f8f9ac4fee89b
-  data.tar.gz: af8482a75b98db484c074c2862d455709ed5563e596819ad445a0c502467c0b5189eef04abbf55060dcbc0640289ce839715b19b3332aa9c21217345726ac3f3
+  metadata.gz: 727d97b9aa1402ed9681eb71fe0c0cdcc966e8a9e2f08b7480fdefa503509e26872b187a0b184d31a5b0cee86dc96e1d97f4fce14b378fa4a7c8ce4679b06421
+  data.tar.gz: 0e2f1601543d582042d0911222c1209f2df2c08139b7acc8ee347b5d7e236cd451a1b00032fb75d6f532105fb8f4ee49e53614071d111e0fabc8ed44010c4301

data/.gitignore CHANGED Viewed

@@ -9,6 +9,7 @@
 Gemfile.lock
 *.gem
 *.ckpt
+profile.json
 # rspec failure tracking
 .rspec_status

data/benchmark/benchmark.rb CHANGED Viewed

@@ -26,7 +26,7 @@ tf.set_random_seed(seed)
 SHAPES = [32, 32]
 sess = tf.session(:ruby_evaluator)
+large_tensor = tf.constant(sess.run(tf.random_uniform([256, 256])))
 a = tf.constant(sess.run(tf.random_uniform(SHAPES)))
 a_int = tf.constant([
   [1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
@@ -49,6 +49,9 @@ c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
 d = tf.constant(sess.run(tf.random_uniform(SHAPES)))
+sample_image = tf.constant(sess.run(tf.random_uniform([10, 8, 8, 3])))
+sample_filter = tf.constant(sess.run(tf.random_uniform([2, 2, 3, 3])))
 p = tf.placeholder('float')
 q = tf.placeholder('float')
@@ -61,6 +64,13 @@ out_of_order = tf.matmul(a, b) + tf.matmul(c, d)
 softmax = tf.nn.softmax(a)
 add_n = tf.add_n([a,b,c,d])
 split = tf.split(a, 4)
+sum = tf.reduce_sum(large_tensor)
+sum_axis_1 = tf.reduce_sum(large_tensor, 1)
+min = tf.min(large_tensor, 1)
+index = large_tensor[0]
+conv2d = tf.nn.conv2d(sample_image, sample_filter, [1, 1, 1, 1], 'SAME')
+conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
 puts TensorStream::Evaluator.default_evaluators
@@ -70,6 +80,18 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
 device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
 puts "OpenCL device #{device.platform.to_s} #{device.name}"
 Benchmark.bmbm do |x|
+  x.report("pure ruby conv2d_backprop      :") { 100.times do sess.run(conv2d_grad) end }
+  x.report("opencl conv2d_backprop         :") { 100.times do sess2.run(conv2d_grad) end }
+  x.report("pure ruby conv2d      :") { 100.times do sess.run(conv2d) end }
+  x.report("opencl conv2d         :") { 100.times do sess2.run(conv2d) end }
+  x.report("pure ruby arr index      :") { 100.times do sess.run(index) end }
+  x.report("opencl arr index         :") { 100.times do sess2.run(index) end }
+  x.report("pure ruby min            :") { 100.times do sess.run(min) end }
+  x.report("opencl min               :") { 100.times do sess2.run(min) end }
+  x.report("pure ruby sum            :") { 100.times do sess.run(sum) end }
+  x.report("opencl sum               :") { 100.times do sess2.run(sum) end }
+  x.report("pure ruby sum axis 1     :") { 100.times do sess.run(sum_axis_1) end }
+  x.report("opencl sum axis 1        :") { 100.times do sess2.run(sum_axis_1) end }
   x.report("pure ruby split          :") { 100.times do sess.run(split) end }
   x.report("opencl split             :") { 100.times do sess2.run(split) end }
   x.report("pure ruby add_n          :") { 100.times do sess.run(add_n) end }

data/benchmark_ryzen.txt ADDED Viewed

@@ -0,0 +1,56 @@
+TensorStream::Evaluator::OpenclEvaluator
+TensorStream::Evaluator::RubyEvaluator
+model name	: AMD Ryzen 3 1300X Quad-Core Processor
+OpenCL device NVIDIA CUDA GeForce GTX 1060 6GB
+Rehearsal --------------------------------------------------------------
+pure ruby arr index      :   0.005448   0.003557   0.009005 (  0.008999)
+opencl arr index         :   0.074642   0.190132   0.264774 (  0.275557)
+pure ruby min            :   0.256004   0.000777   0.256781 (  0.256682)
+opencl min               :   0.017543   0.004523   0.022066 (  0.018797)
+pure ruby sum            :   0.313039   0.000565   0.313604 (  0.313535)
+opencl sum               :   0.009037   0.004249   0.013286 (  0.011073)
+pure ruby split          :   0.017223   0.000300   0.017523 (  0.017542)
+opencl split             :   0.033489   0.014394   0.047883 (  0.038798)
+pure ruby add_n          :   0.159864   0.000153   0.160017 (  0.159992)
+opencl add_n             :   0.018535   0.000563   0.019098 (  0.016168)
+pure ruby ooo matmul     :   1.390970   0.000304   1.391274 (  1.390790)
+opencl    ooo matmul     :   0.014119   0.000229   0.014348 (  0.011738)
+pure ruby softmax        :   0.024103   0.000014   0.024117 (  0.024135)
+opencl    softmax        :   0.010602   0.004277   0.014879 (  0.011941)
+pure ruby matmul         :   0.668126   0.000006   0.668132 (  0.667778)
+opencl    matmul         :   0.006672   0.007527   0.014199 (  0.011594)
+pure ruby                :   2.388817   0.000005   2.388822 (  2.387870)
+opencl                   :   0.152289   0.007804   0.160093 (  0.156279)
+pure ruby single function:   0.356575   0.000062   0.356637 (  0.356488)
+opencl     singlefunction:   0.120073   0.000210   0.120283 (  0.116378)
+pure ruby pow float:         0.088966   0.000051   0.089017 (  0.088996)
+opencl pow float:            0.018054   0.000100   0.018154 (  0.015429)
+pure ruby pow int:           0.025430   0.000070   0.025500 (  0.025524)
+opencl pow int:              0.015652   0.003880   0.019532 (  0.017386)
+----------------------------------------------------- total: 6.429024sec
+                                 user     system      total        real
+pure ruby arr index      :   0.003564   0.000110   0.003674 (  0.003636)
+opencl arr index         :   0.007966   0.003974   0.011940 (  0.009775)
+pure ruby min            :   0.246153   0.000102   0.246255 (  0.246172)
+opencl min               :   0.011787   0.007785   0.019572 (  0.016169)
+pure ruby sum            :   0.294371   0.000000   0.294371 (  0.294335)
+opencl sum               :   0.008266   0.003879   0.012145 (  0.009315)
+pure ruby split          :   0.014552   0.000000   0.014552 (  0.014539)
+opencl split             :   0.037984   0.004037   0.042021 (  0.033276)
+pure ruby add_n          :   0.146300   0.000053   0.146353 (  0.146319)
+opencl add_n             :   0.006426   0.007827   0.014253 (  0.011461)
+pure ruby ooo matmul     :   1.373232   0.000096   1.373328 (  1.372788)
+opencl    ooo matmul     :   0.013838   0.000000   0.013838 (  0.011088)
+pure ruby softmax        :   0.024478   0.000000   0.024478 (  0.024493)
+opencl    softmax        :   0.014117   0.000022   0.014139 (  0.011246)
+pure ruby matmul         :   0.653146   0.000054   0.653200 (  0.652889)
+opencl    matmul         :   0.002750   0.011934   0.014684 (  0.011729)
+pure ruby                :   2.392733   0.000058   2.392791 (  2.391726)
+opencl                   :   0.140118   0.016001   0.156119 (  0.151788)
+pure ruby single function:   0.352515   0.000000   0.352515 (  0.352443)
+opencl     singlefunction:   0.093955   0.011813   0.105768 (  0.102301)
+pure ruby pow float:         0.083659   0.000000   0.083659 (  0.083623)
+opencl pow float:            0.017433   0.000125   0.017558 (  0.014508)
+pure ruby pow int:           0.018381   0.000000   0.018381 (  0.018391)
+opencl pow int:              0.008186   0.003755   0.011941 (  0.009828)

data/lib/tensor_stream/opencl/array_ops.rb CHANGED Viewed

@@ -197,6 +197,7 @@ module TensorStream
             ops = if axis.zero? # fast path
               inputs.each_with_index.map do |input, index|
                 next if input.empty_value?
                 start = index * input.buffer.size * input.buffer.element_size
                 region = [input.buffer.size * input.buffer.element_size, 1, 1]
                 event_wait_list = build_event_wait_list(input)
@@ -339,7 +340,7 @@ module TensorStream
           register_op :index, noop: true do |context, tensor, inputs|
             a = _run(inputs[0], context)
-            index = read_final_result(_run(inputs[1], context))
+            index = inputs[1].value || read_final_result(_run(inputs[1], context))
             if a.is_a?(TensorStream::Evaluator::OutputGroup)
               a.outputs[index]
@@ -348,8 +349,7 @@ module TensorStream
             else
               new_shape = a.shape.dup
               new_shape.shift
-              input_a = read_final_result(a)
-              convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
+              _create_result_sub_buffer(a, index, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}")
             end
           end

data/lib/tensor_stream/opencl/images_ops.rb CHANGED Viewed

@@ -7,9 +7,23 @@ module TensorStream
           register_op :decode_png do |context, tensor, inputs|
             content = _run(inputs[0], context)
             channels = tensor.options[:channels]
+            resample_new_shape = tensor.options[:new_shape]
+            resample_method = tensor.options[:resample_method] || :bilinear
             channels = 4 if channels.zero?
             image = ChunkyPNG::Image.from_blob(content.buffer.to_a.pack('C*'))
+            if resample_new_shape
+              case resample_method
+              when :bilinear
+                image.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, # height
+              when :nearest_neighbor
+                image.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
+              else
+                raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
+              end
+            end
             output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
             image.grayscale! if channels == 1
@@ -38,6 +52,10 @@ module TensorStream
           register_op :encode_png do |_context, tensor, inputs|
             image_data = inputs[0]
+            resample_new_shape = tensor.options[:new_shape]
+            resample_method = tensor.options[:resample_method] || :bilinear
             height, width, channels = image_data.shape
             image_buffer = image_data.buffer.reshape(*image_data.shape.reverse).to_a
 \
@@ -53,6 +71,18 @@ module TensorStream
                 end
               end
             end
+            if resample_new_shape
+              case resample_method
+              when :bilinear
+                png.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, # height
+              when :nearest_neighbor
+                png.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
+              else
+                raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
+              end
+            end
             convert_to_opencl(png.to_s, [], data_type: :string, name: tensor.name)
           end
         end

data/lib/tensor_stream/opencl/kernels/conv2d.cl ADDED Viewed

@@ -0,0 +1,27 @@
+% ctype = dtype_to_c_type(dtype)
+__kernel void conv2d(const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
+    // Get the index of the current element to be processed
+    const int batch_index = get_global_id(0);
+    const int h_index = get_global_id(1);
+    const int w_index = get_global_id(2);
+    const int h_index_with_stride = h_index * <%= stride[0] %>;
+    const int w_index_with_stride = w_index * <%= stride[1] %>;
+    const int image_index = batch_index * height * width * <%= ch %>;
+    const int image_row_width = width * <%= ch %>;
+    for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
+      <%= ctype %> sum = 0;
+      for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
+        for(int y = 0; y < <%= fh %>; y++) {
+          for (int x = 0; x < <%= fw %>; x++) {
+            if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width) {
+              sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
+            }
+          }
+        }
+      }
+      output[batch_index * (height/<%= stride[0] %>) * (width/<%= stride[1] %>) * <%= out_ch %> + h_index * (width/<%= stride[1] %>) * <%= out_ch %> +  w_index * <%= out_ch %> + out_channel_index ] = sum;
+    }
+}

data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl ADDED Viewed

@@ -0,0 +1,26 @@
+% ctype = dtype_to_c_type(dtype)
+__kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+    // Get the index of the current element to be processed
+    const int fh_index = get_global_id(0);
+    const int fw_index = get_global_id(1);
+    const int f_out_channel = get_global_id(2);
+    const int image_size = height * width * <%= ch %>;
+    const int grad_image_row_width = width * <%= out_ch %>;
+    for(int channel = 0; channel < <%= ch %>; channel++) {
+      <%= ctype %> grad_sum = 0.0;
+      for(int batch = 0; batch < batch_size; batch++) {
+        const int image_index = batch * height * width * <%= out_ch %>;
+        for(int y = 0; y < height; y++) {
+          for (int x = 0; x < width; x++) {
+            if ( ((y - fh_index) % <%= stride[0]%>) == 0  && ((x - fw_index) % <%= stride[1]%>) == 0 && fh_index <= y && fw_index <= x) {
+              const <%= ctype %> image_grad = grad[image_index + ((y - fh_index) / <%= stride[0] %>) * grad_image_row_width + ((x - fw_index) / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
+              grad_sum += images[batch * image_size + y * width * <%= ch %> + x * <%= ch %> + channel] * image_grad;
+            }
+          }
+        }
+      }
+      output[fh_index * <%= fw * ch * out_ch %> + fw_index * <%= ch * out_ch %> + channel * <%= out_ch %> + f_out_channel] = grad_sum;
+    }
+}

data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl ADDED Viewed

@@ -0,0 +1,32 @@
+% ctype = dtype_to_c_type(dtype)
+__kernel void conv2d_backprop_input(const int height, const int width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
+    // Get the index of the current element to be processed
+    int batch_index = get_global_id(0);
+    int h_index = get_global_id(1); // orig image y
+    int w_index = get_global_id(2); // orig image x
+    int h_index_with_stride = h_index / <%= stride[0] %>;
+    int w_index_with_stride = w_index / <%= stride[1] %>;
+    int grad_height = height / <%= stride[0] %>;
+    int grad_width = width / <%= stride[1] %>;
+    int image_index = batch_index * grad_height * grad_width * <%= out_ch %>;
+    int image_row_width = grad_width * <%= out_ch %>;
+    for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
+      <%= ctype %> g = 0.0;
+      for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
+        for(int y = 0; y < <%= fh %>; y++) {
+          for (int x = 0; x < <%= fw %>; x++) {
+            if ( (y <= h_index) && (x <= w_index) && ( (h_index - y) % <%= stride[0]%> == 0) && ( (w_index - x) % <%= stride[1]%> == 0)) {
+              <%= ctype %> imag_grad = grad[image_index + ( (h_index - y) / <%= stride[0] %>) * image_row_width + ( (w_index - x) / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
+              g += imag_grad * filter[y * <%= fw * ch * out_ch %> + x * <%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
+            }
+          }
+        }
+      }
+      output[batch_index * height * width * <%= ch %> + h_index * width * <%= ch %> +  w_index * <%= ch %> + channel_index ] = g;
+    }
+}

data/lib/tensor_stream/opencl/kernels/gemm.cl CHANGED Viewed

@@ -1,8 +1,6 @@
 // First naive implementation
 % c_dtype = dtype_to_c_type(dtype)
 __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
-                      const int A_transpose,
-                      const int B_transpose,
                       const __global <%= c_dtype %>* A,
                       const __global <%= c_dtype %>* B,
                       __global <%= c_dtype %>* C) {
@@ -16,14 +14,8 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
     for (int k=0; k<K; k++) {
         int a_index = globalRow*K + k;
         int b_index = k*N + globalCol;
-        if (A_transpose) {
-            a_index = M*k + globalRow;
-        }
-        if (B_transpose) {
-            b_index = globalCol*K + k;
-        }
+<% if ta %>a_index = M*k + globalRow;<% end %>
+<% if tb %>b_index = globalCol*K + k;<% end %>
         acc += A[a_index] * B[b_index];
     }

data/lib/tensor_stream/opencl/kernels/max.cl CHANGED Viewed

@@ -5,7 +5,7 @@
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+    C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[globalRow * N + globalCol]);
 }
  // 1D + Scalar floating point add op
@@ -13,12 +13,8 @@
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
-    } else {
-      C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
-    }
+    C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[0]);
 }
  // 1D + Scalar floating point add op broadcast
@@ -26,7 +22,7 @@
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
     int b_m_index = globalRow;
     int b_n_index = globalCol;
@@ -38,9 +34,5 @@
       b_n_index = b_n_index % N2;
     }
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
-    } else {
-      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] :  A[globalRow * N + globalCol];
-    }
+    C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol],(<%= c_dtype %>) B[b_m_index * N2 + b_n_index]);
 }

data/lib/tensor_stream/opencl/kernels/mean.cl ADDED Viewed

@@ -0,0 +1,26 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void mean_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int id = get_global_id(0);
+    int offset = (id + <%= index %>) * <%= w %>;
+    <%= c_dtype %> sum = 0;
+    <% if n > 4 %>
+      for(int i = 0; i < <%= n/4 %> ; i++) {
+        <% sums = 4.times.map do |i|
+          "A[offset + #{i}]"
+        end %>
+        sum += <%= sums.join(' + ') %>;
+        offset += 4;
+      }
+      <% if n%4!=0 %>
+        <% (n % 4).times do |i| %>
+          sum += A[offset + <%= i %>];
+        <% end %>
+      <% end %>
+    <% else %>
+      <% n.times do |i| %>
+        sum += A[offset + <%= i %>];
+      <% end %>
+    <% end %>
+    C[id] = sum / <%= n %>;
+}

data/lib/tensor_stream/opencl/kernels/min.cl CHANGED Viewed

@@ -5,7 +5,7 @@
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+    C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol],(<%= c_dtype %>) B[globalRow * N + globalCol]);
 }
  // 1D + Scalar floating point add op
@@ -14,11 +14,7 @@
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[0] ? A[globalRow * N + globalCol] : B[0];
-    } else {
-      C[globalRow * N + globalCol] = B[0] <= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
-    }
+    C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>) B[0]);
 }
  // 1D + Scalar floating point add op broadcast
@@ -38,9 +34,5 @@
       b_n_index = b_n_index % N2;
     }
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
-    } else {
-      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] :  A[globalRow * N + globalCol];
-    }
+    C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[b_m_index * N2 + b_n_index]);
 }

data/lib/tensor_stream/opencl/kernels/prod.cl ADDED Viewed

@@ -0,0 +1,26 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void prod_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    int id = get_global_id(0);
+    int offset = (id + <%= index %>) * <%= w %>;
+    <%= c_dtype %> prod = 1;
+    <% if n > 4 %>
+      for(int i = 0; i < <%= n/4 %> ; i++) {
+        <% sums = 4.times.map do |i|
+          "A[offset + #{i}]"
+        end %>
+        prod *= <%= sums.join(' * ') %>;
+        offset += 4;
+      }
+      <% if n%4!=0 %>
+        <% (n % 4).times do |i| %>
+          prod *= A[offset + <%= i %>];
+        <% end %>
+      <% end %>
+    <% else %>
+      <% n.times do |i| %>
+        prod *= A[offset + <%= i %>];
+      <% end %>
+    <% end %>
+    C[id] = prod;
+}