tensor_stream-opencl 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7888f5385dcec4b9a747128d1b25d0cb6d7fb01ab12f1c1b70706d0df5770903
4
- data.tar.gz: 1129196d93a7b194d9de888fe49b43b0839c00589fb0987c0841ad1ac1693a4c
3
+ metadata.gz: 3e4aa123289372c651cd4da3e7c206abc4f9f67a551d4062180c5cf6555dc243
4
+ data.tar.gz: 6517954207c85f56cd08b2892b0119d4bb7a35e2d4bd9b9cacc5d3c9ccfb9e42
5
5
  SHA512:
6
- metadata.gz: 727d97b9aa1402ed9681eb71fe0c0cdcc966e8a9e2f08b7480fdefa503509e26872b187a0b184d31a5b0cee86dc96e1d97f4fce14b378fa4a7c8ce4679b06421
7
- data.tar.gz: 0e2f1601543d582042d0911222c1209f2df2c08139b7acc8ee347b5d7e236cd451a1b00032fb75d6f532105fb8f4ee49e53614071d111e0fabc8ed44010c4301
6
+ metadata.gz: 7f61d61be79dd1e06ebfdc77ed2dff9e717e0cdb292160fe20c9ca08693d867e1b0e0350c71db5d24feb4671a26e793f44d6b80762c384193c1985b6b1616376
7
+ data.tar.gz: 72c32530717fac8ff947ce4b204535755134bde14e0f70d0d120ff101b5654843312186317cb480fd5e1c620a25328a3590b1f35193faf1d196e7ad631d169b0
@@ -422,10 +422,16 @@ module TensorStream
422
422
  a = inputs[0]
423
423
  if a.data_type != tensor.data_type
424
424
  buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
425
- m, n = a.shape
426
- cl_m = OpenCL::Int1.new(m || 1)
427
- cl_n = OpenCL::Int1.new(n || 1)
428
- work_group = [m || 1, n || 1]
425
+ work_group = if inputs[0].shape.size > 2
426
+ [ inputs[0].shape.reduce(:*) / inputs[0].shape.last, inputs[0].shape.last]
427
+ else
428
+ m, n = inputs[0].shape
429
+ [m || 1, n || 1]
430
+ end
431
+
432
+ cl_m = OpenCL::Int1.new(work_group[0])
433
+ cl_n = OpenCL::Int1.new(work_group[1])
434
+
429
435
  event_wait_list = build_event_wait_list(inputs)
430
436
  buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
431
437
  buffer
@@ -1,27 +1,30 @@
1
1
  % ctype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void conv2d(const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
3
+ __kernel void conv2d(const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
4
4
  // Get the index of the current element to be processed
5
5
  const int batch_index = get_global_id(0);
6
6
  const int h_index = get_global_id(1);
7
7
  const int w_index = get_global_id(2);
8
- const int h_index_with_stride = h_index * <%= stride[0] %>;
9
- const int w_index_with_stride = w_index * <%= stride[1] %>;
8
+ const int h_index_with_stride = h_index * <%= stride[0] %> - <%= padding[0] %>;
9
+ const int w_index_with_stride = w_index * <%= stride[1] %> - <%= padding[1] %>;
10
10
 
11
11
  const int image_index = batch_index * height * width * <%= ch %>;
12
12
  const int image_row_width = width * <%= ch %>;
13
+ const int out_image_row_size = out_height * out_width * <%= out_ch %>;
13
14
 
14
15
  for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
15
16
  <%= ctype %> sum = 0;
16
17
  for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
17
18
  for(int y = 0; y < <%= fh %>; y++) {
18
19
  for (int x = 0; x < <%= fw %>; x++) {
19
- if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width) {
20
- sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
20
+ if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width &&
21
+ (h_index_with_stride + y) >= 0 && (w_index_with_stride + x) >=0) {
22
+ <%= ctype %> f = filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
23
+ sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * f;
21
24
  }
22
25
  }
23
26
  }
24
27
  }
25
- output[batch_index * (height/<%= stride[0] %>) * (width/<%= stride[1] %>) * <%= out_ch %> + h_index * (width/<%= stride[1] %>) * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index ] = sum;
28
+ output[batch_index * out_image_row_size + h_index * out_width * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index ] = sum;
26
29
  }
27
30
  }
@@ -1,21 +1,31 @@
1
1
  % ctype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
3
+ __kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
4
4
  // Get the index of the current element to be processed
5
5
  const int fh_index = get_global_id(0);
6
6
  const int fw_index = get_global_id(1);
7
7
  const int f_out_channel = get_global_id(2);
8
8
  const int image_size = height * width * <%= ch %>;
9
- const int grad_image_row_width = width * <%= out_ch %>;
9
+ const int grad_image_row_width = out_width * <%= out_ch %>;
10
+ const int grad_image_size = out_height * out_width * <%= out_ch %>;
10
11
 
11
12
  for(int channel = 0; channel < <%= ch %>; channel++) {
12
13
  <%= ctype %> grad_sum = 0.0;
13
14
  for(int batch = 0; batch < batch_size; batch++) {
14
- const int image_index = batch * height * width * <%= out_ch %>;
15
+ int image_index = batch * grad_image_size;
15
16
  for(int y = 0; y < height; y++) {
16
17
  for (int x = 0; x < width; x++) {
17
- if ( ((y - fh_index) % <%= stride[0]%>) == 0 && ((x - fw_index) % <%= stride[1]%>) == 0 && fh_index <= y && fw_index <= x) {
18
- const <%= ctype %> image_grad = grad[image_index + ((y - fh_index) / <%= stride[0] %>) * grad_image_row_width + ((x - fw_index) / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
18
+ int y_offset = y - fh_index + <%= padding[0] %>;
19
+ int x_offset = x - fw_index + <%= padding[1] %>;
20
+ int y_offset_end = y + (<%= fh %> - fh_index - 1) - <%= padding[2] %>;
21
+ int x_offset_end = x + (<%= fw %> - fw_index - 1) - <%= padding[3] %>;
22
+
23
+ if ( (y_offset % <%= stride[0]%>) == 0
24
+ && (x_offset % <%= stride[1]%>) == 0
25
+ && (y_offset >=0) && (x_offset >= 0)
26
+ && (y_offset_end < height)
27
+ && (x_offset_end < width)) {
28
+ <%= ctype %> image_grad = grad[image_index + (y_offset / <%= stride[0] %>) * grad_image_row_width + ( x_offset / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
19
29
  grad_sum += images[batch * image_size + y * width * <%= ch %> + x * <%= ch %> + channel] * image_grad;
20
30
  }
21
31
  }
@@ -1,6 +1,6 @@
1
1
  % ctype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void conv2d_backprop_input(const int height, const int width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
3
+ __kernel void conv2d_backprop_input(const int height, const int width, const int out_height, const int out_width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
4
4
  // Get the index of the current element to be processed
5
5
  int batch_index = get_global_id(0);
6
6
  int h_index = get_global_id(1); // orig image y
@@ -8,8 +8,8 @@ __kernel void conv2d_backprop_input(const int height, const int width, __global
8
8
 
9
9
  int h_index_with_stride = h_index / <%= stride[0] %>;
10
10
  int w_index_with_stride = w_index / <%= stride[1] %>;
11
- int grad_height = height / <%= stride[0] %>;
12
- int grad_width = width / <%= stride[1] %>;
11
+ int grad_height = out_height;
12
+ int grad_width = out_width;
13
13
 
14
14
  int image_index = batch_index * grad_height * grad_width * <%= out_ch %>;
15
15
  int image_row_width = grad_width * <%= out_ch %>;
@@ -19,8 +19,16 @@ __kernel void conv2d_backprop_input(const int height, const int width, __global
19
19
  for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
20
20
  for(int y = 0; y < <%= fh %>; y++) {
21
21
  for (int x = 0; x < <%= fw %>; x++) {
22
- if ( (y <= h_index) && (x <= w_index) && ( (h_index - y) % <%= stride[0]%> == 0) && ( (w_index - x) % <%= stride[1]%> == 0)) {
23
- <%= ctype %> imag_grad = grad[image_index + ( (h_index - y) / <%= stride[0] %>) * image_row_width + ( (w_index - x) / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
22
+ int y_offset = h_index - y + <%= padding[0] %>;
23
+ int x_offset = w_index - x + <%= padding[1] %>;
24
+
25
+ if ( ( y_offset >= 0) && (x_offset >= 0) &&
26
+ ( y_offset % <%= stride[0]%> == 0) &&
27
+ ( x_offset % <%= stride[1]%> == 0) &&
28
+ ( h_index + (<%= fh %> - y - 1) < (height + <%= padding[2] %>)) &&
29
+ ( w_index + (<%= fw %> - x - 1) < (width + <%= padding[3] %>))
30
+ ) {
31
+ <%= ctype %> imag_grad = grad[image_index + ( y_offset / <%= stride[0] %>) * image_row_width + ( x_offset / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
24
32
  g += imag_grad * filter[y * <%= fw * ch * out_ch %> + x * <%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
25
33
  }
26
34
  }
@@ -14,10 +14,15 @@ module TensorStream
14
14
  if inputs.size == 1
15
15
  inputs[0]
16
16
  else
17
- m, n = inputs[0].shape
18
- work_group = [m || 1, n || 1]
19
- cl_m = OpenCL::Int1.new(m || 1)
20
- cl_n = OpenCL::Int1.new(n || 1)
17
+ work_group = if inputs[0].shape.size > 2
18
+ [ inputs[0].shape.reduce(:*) / inputs[0].shape.last, inputs[0].shape.last]
19
+ else
20
+ m, n = inputs[0].shape
21
+ [m || 1, n || 1]
22
+ end
23
+
24
+ cl_m = OpenCL::Int1.new(work_group[0])
25
+ cl_n = OpenCL::Int1.new(work_group[1])
21
26
  cl_switch = OpenCL::Int1.new(0)
22
27
  dtype = tensor.data_type
23
28
 
@@ -68,6 +73,7 @@ module TensorStream
68
73
 
69
74
  raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
70
75
  raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
76
+ raise "#{tensor.inputs[0].name} unsupported rank" if b.shape.size != 2 || a.shape.size!=2
71
77
  raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
72
78
 
73
79
  dtype = tensor.data_type
@@ -162,6 +168,7 @@ module TensorStream
162
168
 
163
169
  axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
164
170
  input = complete_eval(value, child_context)
171
+
165
172
  value = value.buffer.reshape(*value.shape.reverse)
166
173
  rank = input.shape.size - 1
167
174
 
@@ -220,6 +220,9 @@ module TensorStream
220
220
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
221
221
 
222
222
  m, n = a.shape
223
+
224
+ raise "unsupported rank " if a.shape.size > 2
225
+
223
226
  work_group = [m]
224
227
  n = m if n.nil?
225
228
  cl_n = OpenCL::Int1.new(n || 1)
@@ -236,6 +239,9 @@ module TensorStream
236
239
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
237
240
 
238
241
  m, n = a.shape
242
+
243
+ raise "unsupported rank " if a.shape.size > 2
244
+
239
245
  work_group = [m]
240
246
  n = m if n.nil?
241
247
  cl_n = OpenCL::Int1.new(n || 1)
@@ -254,6 +260,9 @@ module TensorStream
254
260
  output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
255
261
  rank = a.shape.size - 1
256
262
  m, n = a.shape
263
+
264
+ raise "unsupported rank " if a.shape.size > 2
265
+
257
266
  work_group = [m]
258
267
  n = m if n.nil?
259
268
  cl_n = OpenCL::Int1.new(n || 1)
@@ -276,6 +285,9 @@ module TensorStream
276
285
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
277
286
 
278
287
  m, n = a.shape
288
+
289
+ raise "unsupported rank " if a.shape.size > 2
290
+
279
291
  work_group = [m]
280
292
  n = m if n.nil?
281
293
  cl_n = OpenCL::Int1.new(n || 1)
@@ -305,6 +317,9 @@ module TensorStream
305
317
  output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
306
318
  rank = a.shape.size - 1
307
319
  m, n = a.shape
320
+
321
+ raise "unsupported rank " if a.shape.size > 2
322
+
308
323
  work_group = [m]
309
324
  n = m if n.nil?
310
325
  cl_n = OpenCL::Int1.new(n || 1)
@@ -326,6 +341,7 @@ module TensorStream
326
341
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
327
342
 
328
343
  m, n = a.shape
344
+ raise "unsupported rank " if a.shape.size > 2
329
345
  work_group = [m]
330
346
  n = m if n.nil?
331
347
  cl_n = OpenCL::Int1.new(n || 1)
@@ -353,19 +369,29 @@ module TensorStream
353
369
 
354
370
  raise TensorStream::ValueError, " Current implementation does not yet support strides in the batch and depth dimensions." if strides[0] != 1 || strides[3] != 1
355
371
 
372
+ padding_option = tensor.options[:padding]
373
+ padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
356
374
  event_wait_list = build_event_wait_list(inputs)
357
375
 
358
- f_height, f_width, in_channels, out_channels = filter_shape
359
- out_shape = [batch, height / height_stride, width / width_stride, out_channels]
376
+ f_height, f_width, _in_channels, out_channels = filter_shape
377
+
378
+ out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
379
+ out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
380
+
381
+ out_shape = [batch, out_h, out_w, out_channels]
360
382
  output_buffer = _create_result_buffer(tensor.data_type, out_shape, tensor.name)
361
383
 
362
384
  cl_image_height = OpenCL::Int1.new(height)
363
385
  cl_image_width = OpenCL::Int1.new(width)
386
+ cl_out_height = OpenCL::Int1.new(out_h)
387
+ cl_out_width = OpenCL::Int1.new(out_w)
364
388
 
365
- work_dimen = [batch, height / height_stride, width / width_stride]
389
+ work_dimen = [batch, out_h, out_w]
366
390
 
367
- output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride] ).send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width, inputs[0].cl_buffer,
368
- inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
391
+ output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride], padding: padding).
392
+ send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
393
+ cl_out_height, cl_out_width, inputs[0].cl_buffer,
394
+ inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
369
395
  output_buffer
370
396
  end
371
397
 
@@ -385,13 +411,22 @@ module TensorStream
385
411
  batch, height, width, channels = image_shape
386
412
  f_height, f_width, in_channels, out_channels = filter_shape
387
413
 
414
+ padding_option = tensor.options[:padding]
415
+ padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
388
416
  work_dimen = [batch, height, width]
389
417
 
418
+ out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
419
+ out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
420
+
390
421
  cl_image_height = OpenCL::Int1.new(height)
391
422
  cl_image_width = OpenCL::Int1.new(width)
423
+ cl_out_height = OpenCL::Int1.new(out_h)
424
+ cl_out_width = OpenCL::Int1.new(out_w)
392
425
 
393
- output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride] ).send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
394
- filter.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
426
+ output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride], padding: padding).
427
+ send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
428
+ cl_out_height, cl_out_width, filter.cl_buffer, grad.cl_buffer,
429
+ output_buffer.cl_buffer, event_wait_list: event_wait_list)
395
430
  output_buffer
396
431
  end
397
432
 
@@ -411,14 +446,48 @@ module TensorStream
411
446
  f_height, f_width, input_channels, output_channels = filter_shape
412
447
  work_dimen = [f_height, f_width, output_channels]
413
448
 
449
+ padding_option = tensor.options[:padding]
450
+ padding = conv2d_padding_options(padding_option, filter_shape, height, width, height_stride, width_stride)
451
+
452
+ out_h = (height - f_height + (padding[0] + padding[2])) / height_stride + 1
453
+ out_w = (width - f_width + (padding[1] + padding[3])) / width_stride + 1
454
+
414
455
  cl_batch_size = OpenCL::Int1.new(batch_size)
415
456
  cl_image_height = OpenCL::Int1.new(height)
416
457
  cl_image_width = OpenCL::Int1.new(width)
458
+ cl_out_height = OpenCL::Int1.new(out_h)
459
+ cl_out_width = OpenCL::Int1.new(out_w)
417
460
 
418
- output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride] ).send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
419
- images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
461
+ output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride], padding: padding ).
462
+ send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
463
+ cl_out_height, cl_out_width, images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
420
464
  output_buffer
421
465
  end
466
+
467
+ def conv2d_padding_options(padding_option, filter_shape, height, width, h_stride, w_stride)
468
+ case padding_option
469
+ when 'SAME'
470
+ [
471
+ calc_pad(height, h_stride, filter_shape[0]),
472
+ calc_pad(width, w_stride, filter_shape[1]),
473
+ calc_pad(height, h_stride, filter_shape[0], true),
474
+ calc_pad(width, w_stride, filter_shape[1], true)
475
+ ]
476
+ when 'VALID'
477
+ [0, 0, 0, 0]
478
+ else
479
+ raise TensorStream::ValueError, "Unsupported padding value #{padding_option}, valid values 'SAME', 'VALID'"
480
+ end
481
+ end
482
+
483
+ def calc_pad(w, stride, f_shape, ceil = false)
484
+ r = ((w / stride - 1) * stride - w + f_shape)
485
+ if ceil
486
+ r.odd? ? r / 2 + 1 : r / 2
487
+ else
488
+ r / 2
489
+ end
490
+ end
422
491
  end
423
492
  end
424
493
  end
@@ -112,6 +112,7 @@ module TensorStream
112
112
  result = complete_eval(tensor, execution_context)
113
113
  # puts "-------------------wait finish------------------------"
114
114
  _opencl_queue.finish
115
+ # puts "-------------------done finish------------------------"
115
116
  read_final_result(result)
116
117
  end
117
118
 
@@ -170,6 +171,7 @@ module TensorStream
170
171
  events = build_event_wait_list([buffer])
171
172
  # puts "** wait #{tensor.name} **"
172
173
  OpenCL.wait_for_events(events) unless events.empty?
174
+ # puts "** done #{tensor.name} **"
173
175
  buffer
174
176
  end
175
177
 
@@ -449,6 +451,7 @@ module TensorStream
449
451
  events = build_event_wait_list(inputs)
450
452
  # puts "** wait for event flow_group**"
451
453
  OpenCL.wait_for_events(events) unless events.empty?
454
+ # puts "** done for event flow_group**"
452
455
  nil
453
456
  end
454
457
 
@@ -461,9 +464,7 @@ module TensorStream
461
464
  return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
462
465
  return @context[cache_key] if @context.key?(cache_key)
463
466
 
464
- # puts "opencl eval #{object_id} #{tensor.name}"
465
467
  invoke(tensor, child_context).tap do |result|
466
- # puts "result done opencl #{object_id}: #{tensor.name}"
467
468
  if tensor.breakpoint
468
469
  a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
469
470
  b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -569,6 +570,9 @@ module TensorStream
569
570
  [m, result_shape.reduce(:*) / m]
570
571
  elsif result_shape.size <= 2
571
572
  [m || 1, n || 1]
573
+ elsif (b.shape.size == 1) && (result_shape.last == b.shape.last)
574
+ last_dim = b.shape.last
575
+ [result_shape.reduce(:*) / last_dim, last_dim]
572
576
  else
573
577
  raise "rank > 2 not supported for now"
574
578
  end
@@ -614,9 +618,15 @@ module TensorStream
614
618
  output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)
615
619
 
616
620
  m, n = p.shape
617
- work_group = [m || 1, n || 1]
618
- cl_m = OpenCL::Int1.new(m || 1)
619
- cl_n = OpenCL::Int1.new(n || 1)
621
+
622
+ work_group = if p.shape.size > 2
623
+ [m, p.shape.reduce(:*) / m]
624
+ else
625
+ [ m || 1, n || 1]
626
+ end
627
+
628
+ cl_m = OpenCL::Int1.new(work_group[0])
629
+ cl_n = OpenCL::Int1.new(work_group[1])
620
630
 
621
631
  event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
622
632
  output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
@@ -1,5 +1,5 @@
1
1
  module TensorStream
2
2
  module Opencl
3
- VERSION = "0.2.3"
3
+ VERSION = "0.2.4"
4
4
  end
5
5
  end
@@ -29,19 +29,19 @@ M = 60
29
29
  N = 30
30
30
 
31
31
 
32
- w1 = tf.variable(tf.random_normal([784, K]))
32
+ w1 = tf.variable(tf.truncated_normal([784, K], stddev: 0.1))
33
33
  b1 = tf.variable(tf.ones([K])/10)
34
34
 
35
- w2 = tf.variable(tf.random_normal([K, L]))
35
+ w2 = tf.variable(tf.truncated_normal([K, L], stddev: 0.1))
36
36
  b2 = tf.variable(tf.ones([L])/10)
37
37
 
38
- w3 = tf.variable(tf.random_normal([L, M]))
38
+ w3 = tf.variable(tf.truncated_normal([L, M], stddev: 0.1))
39
39
  b3 = tf.variable(tf.ones([M])/10)
40
40
 
41
- w4 = tf.variable(tf.random_normal([M, N]))
41
+ w4 = tf.variable(tf.truncated_normal([M, N], stddev: 0.1))
42
42
  b4 = tf.variable(tf.ones([N])/10)
43
43
 
44
- w5 = tf.variable(tf.random_normal([N, 10]))
44
+ w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
45
45
  b5 = tf.variable(tf.zeros([10]))
46
46
 
47
47
  x_ = tf.reshape(x, [-1, 784])
@@ -1,6 +1,7 @@
1
1
  # A ruby port of the example code discussed by Martin Gorner in
2
2
  # "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
3
3
  #
4
+ # Five Layers with relu decay
4
5
  # https://www.youtube.com/watch?v=u4alGiomYP4
5
6
  #
6
7
  # Requirements:
@@ -35,19 +36,19 @@ M = 60
35
36
  N = 30
36
37
 
37
38
 
38
- w1 = tf.variable(tf.random_normal([784, K]))
39
+ w1 = tf.variable(tf.truncated_normal([784, K], stddev: 0.1))
39
40
  b1 = tf.variable(tf.ones([K])/10)
40
41
 
41
- w2 = tf.variable(tf.random_normal([K, L]))
42
+ w2 = tf.variable(tf.truncated_normal([K, L], stddev: 0.1))
42
43
  b2 = tf.variable(tf.ones([L])/10)
43
44
 
44
- w3 = tf.variable(tf.random_normal([L, M]))
45
+ w3 = tf.variable(tf.truncated_normal([L, M], stddev: 0.1))
45
46
  b3 = tf.variable(tf.ones([M])/10)
46
47
 
47
- w4 = tf.variable(tf.random_normal([M, N]))
48
+ w4 = tf.variable(tf.truncated_normal([M, N], stddev: 0.1))
48
49
  b4 = tf.variable(tf.ones([N])/10)
49
50
 
50
- w5 = tf.variable(tf.random_normal([N, 10]))
51
+ w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
51
52
  b5 = tf.variable(tf.zeros([10]))
52
53
 
53
54
  x_ = tf.reshape(x, [-1, 784])
@@ -0,0 +1,145 @@
1
+ # A ruby port of the example code discussed by Martin Gorner in
2
+ # "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
3
+ #
4
+ # https://www.youtube.com/watch?v=u4alGiomYP4
5
+ #
6
+ # Requirements:
7
+ # mnist-learn gem
8
+ # opencl_ruby_ffi gem
9
+ require "bundler/setup"
10
+ require 'tensor_stream'
11
+ require 'mnist-learn'
12
+ require 'pry-byebug'
13
+
14
+ # Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
15
+ require 'tensor_stream/opencl'
16
+
17
+ tf = TensorStream
18
+ puts "Tensorstream version #{tf.__version__} with OpenCL lib #{TensorStream::Opencl::VERSION}"
19
+
20
+ # Import MNIST data
21
+ puts "downloading minst data"
22
+ # Download images and labels into mnist.test (10K images+labels) and mnist.train (60K images+labels)
23
+ mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
24
+ puts "downloading finished"
25
+
26
+ # neural network structure for this sample:
27
+ #
28
+ # · · · · · · · · · · (input data, 1-deep) X [batch, 28, 28, 1]
29
+ # @ @ @ @ @ @ @ @ @ @ -- conv. layer 5x5x1=>4 stride 1 W1 [5, 5, 1, 4] B1 [4]
30
+ # ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶ Y1 [batch, 28, 28, 4]
31
+ # @ @ @ @ @ @ @ @ -- conv. layer 5x5x4=>8 stride 2 W2 [5, 5, 4, 8] B2 [8]
32
+ # ∶∶∶∶∶∶∶∶∶∶∶∶∶∶∶ Y2 [batch, 14, 14, 8]
33
+ # @ @ @ @ @ @ -- conv. layer 4x4x8=>12 stride 2 W3 [4, 4, 8, 12] B3 [12]
34
+ # ∶∶∶∶∶∶∶∶∶∶∶ Y3 [batch, 7, 7, 12] => reshaped to YY [batch, 7*7*12]
35
+ # \x/x\x\x/ -- fully connected layer (relu) W4 [7*7*12, 200] B4 [200]
36
+ # · · · · Y4 [batch, 200]
37
+ # \x/x\x/ -- fully connected layer (softmax) W5 [200, 10] B5 [10]
38
+ # · · · Y [batch, 10]
39
+
40
+
41
+ # input X: 28x28 grayscale images, the first dimension (None) will index the images in the mini-batch
42
+ x = tf.placeholder(:float32, shape: [nil, 28, 28, 1])
43
+
44
+ # correct answers will go here
45
+ y_ = tf.placeholder(:float32, shape: [nil, 10])
46
+
47
+ # step for variable learning rate
48
+ step = tf.placeholder(:int32)
49
+
50
+ pkeep = tf.placeholder(tf.float32)
51
+
52
+ # three convolutional layers with their channel counts, and a
53
+ # fully connected layer (the last layer has 10 softmax neurons)
54
+
55
+ K = 4 # first convolutional layer output depth
56
+ L = 8 # second convolutional layer output depth
57
+ M = 12 # third convolutional layer
58
+ N = 200 # fully connected layer
59
+
60
+
61
+ w1 = tf.variable(tf.truncated_normal([6, 6, 1, K], stddev: 0.1))
62
+ b1 = tf.variable(tf.ones([K])/10)
63
+
64
+ w2 = tf.variable(tf.truncated_normal([5, 5, K, L], stddev: 0.1))
65
+ b2 = tf.variable(tf.ones([L])/10)
66
+
67
+ w3 = tf.variable(tf.truncated_normal([4, 4, L, M], stddev: 0.1))
68
+ b3 = tf.variable(tf.ones([M])/10)
69
+
70
+ w4 = tf.variable(tf.truncated_normal([7 * 7 * M, N], stddev: 0.1))
71
+ b4 = tf.variable(tf.ones([N])/10)
72
+
73
+ w5 = tf.variable(tf.truncated_normal([N, 10], stddev: 0.1))
74
+ b5 = tf.variable(tf.ones([10])/10)
75
+
76
+ # The model
77
+ stride = 1 # output is 28x28
78
+ y1 = tf.nn.relu(tf.nn.conv2d(tf.reshape(x, [-1, 28, 28, 1]), w1, [1, stride, stride, 1], 'SAME') + b1)
79
+ stride = 2 # output is 14x14
80
+ y2 = tf.nn.relu(tf.nn.conv2d(y1, w2, [1, stride, stride, 1], 'SAME') + b2)
81
+ stride = 2 # output is 7x7
82
+ y3 = tf.nn.relu(tf.nn.conv2d(y2, w3, [1, stride, stride, 1], 'SAME') + b3)
83
+
84
+ # reshape the output from the third convolution for the fully connected layer
85
+ yy = tf.reshape(y3, [-1, 7 * 7 * M])
86
+ y4 = tf.nn.relu(tf.matmul(yy, w4) + b4)
87
+
88
+ # dropout to prevent overfitting
89
+ yy4 = tf.nn.dropout(y4, pkeep)
90
+
91
+ ylogits = tf.matmul(yy4, w5) + b5
92
+
93
+ # model
94
+ y = tf.nn.softmax(ylogits)
95
+
96
+
97
+
98
+ # training step, learning rate = 0.003
99
+
100
+
101
+ # cross-entropy loss function (= -sum(Y_i * log(Yi)) ), normalised for batches of 100 images
102
+ # TensorFlow provides the softmax_cross_entropy_with_logits function to avoid numerical stability
103
+ # problems with log(0) which is NaN
104
+ cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits: ylogits, labels: y_)
105
+ cross_entropy = tf.reduce_mean(cross_entropy)*100
106
+
107
+ is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
108
+ accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
109
+
110
+ # training step, learning rate = 0.003
111
+ lr = 0.0001.t + tf.train.exponential_decay(0.003, step, 2000, 1/Math::E)
112
+ train_step = TensorStream::Train::AdamOptimizer.new(lr).minimize(cross_entropy)
113
+
114
+ sess = tf.session
115
+ # Add ops to save and restore all the variables.
116
+
117
+ init = tf.global_variables_initializer
118
+
119
+ sess.run(init)
120
+ mnist_train = mnist.train
121
+ test_data = { x => mnist.test.images, y_ => mnist.test.labels, pkeep => 1.0 }
122
+
123
+
124
+ (0..10001).each do |i|
125
+ # load batch of images and correct answers
126
+ batch_x, batch_y = mnist_train.next_batch(100)
127
+ train_data = { x => batch_x, y_ => batch_y, step => i, pkeep => 0.75 }
128
+
129
+ # train
130
+ sess.run(train_step, feed_dict: train_data)
131
+
132
+ if (i % 10 == 0)
133
+ # File.write("profile.json", TensorStream::ReportTool.profile_for(sess).to_json)
134
+ # success? add code to print it
135
+ a_train, c_train, l = sess.run([accuracy, cross_entropy, lr], feed_dict: { x => batch_x, y_ => batch_y, step => i, pkeep => 1.0})
136
+ puts "#{i}: accuracy:#{a_train} loss:#{c_train} (lr:#{l})"
137
+ end
138
+
139
+ if (i % 100 == 0)
140
+ # success on test data?
141
+ a_test, c_test = sess.run([accuracy, cross_entropy], feed_dict: test_data, pkeep => 1.0)
142
+ puts("#{i}: ******** test accuracy: #{a_test} test loss: #{c_test}")
143
+ end
144
+ end
145
+
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
39
39
  spec.add_development_dependency "awesome_print"
40
40
  spec.add_development_dependency "mnist-learn"
41
41
  spec.add_development_dependency "simplecov"
42
- spec.add_dependency "tensor_stream", "~> 0.9.7"
42
+ spec.add_dependency "tensor_stream", "~> 0.9.8"
43
43
  spec.add_dependency "opencl_ruby_ffi"
44
44
  spec.add_dependency "oily_png"
45
45
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tensor_stream-opencl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joseph Dayo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-11-19 00:00:00.000000000 Z
11
+ date: 2018-11-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -114,14 +114,14 @@ dependencies:
114
114
  requirements:
115
115
  - - "~>"
116
116
  - !ruby/object:Gem::Version
117
- version: 0.9.7
117
+ version: 0.9.8
118
118
  type: :runtime
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
- version: 0.9.7
124
+ version: 0.9.8
125
125
  - !ruby/object:Gem::Dependency
126
126
  name: opencl_ruby_ffi
127
127
  requirement: !ruby/object:Gem::Requirement
@@ -252,6 +252,7 @@ files:
252
252
  - samples/mnist_data_2.1.rb
253
253
  - samples/mnist_data_2.2.rb
254
254
  - samples/mnist_data_2.3.rb
255
+ - samples/mnist_data_3.0.rb
255
256
  - samples/multigpu.rb
256
257
  - samples/nearest_neighbor.rb
257
258
  - samples/rnn.rb