RubyGems - tensor_stream-opencl - Versions diffs - 0.3.0 → 0.3.1 - Mend

tensor_stream-opencl 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +4 -4
data/benchmark/benchmark.rb +63 -37
data/benchmark_imac2015_iris.txt +80 -0
data/lib/tensor_stream/opencl.rb +1 -0
data/lib/tensor_stream/opencl/array_ops.rb +37 -7
data/lib/tensor_stream/opencl/images_ops.rb +1 -1
data/lib/tensor_stream/opencl/kernels/gemm.cl +5 -4
data/lib/tensor_stream/opencl/kernels/random_uniform.cl +7 -0
data/lib/tensor_stream/opencl/math_ops.rb +13 -11
data/lib/tensor_stream/opencl/opencl_buffer.rb +85 -3
data/lib/tensor_stream/opencl/opencl_evaluator.rb +32 -45
data/lib/tensor_stream/opencl/random_ops.rb +54 -0
data/lib/tensor_stream/opencl/utils.rb +27 -0
data/lib/tensor_stream/opencl/version.rb +1 -1
data/samples/mnist_data_2.3.rb +9 -4
data/samples/mnist_data_3.0.rb +2 -2
data/tensor_stream-opencl.gemspec +1 -1
metadata +8 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d2b7fa8bb0dcaa041e5ec390418ffac2bb40d777086e9ebe771f9f42d79ccabc
-  data.tar.gz: e21e0cf638e177da495e9748087808c51aee15ac10212c856ee789ae8c32d458
+  metadata.gz: b433e9e7ab38a517c21b57065e5a43b112640fd7c419fb7baa2f3319128cdacf
+  data.tar.gz: fab7d48513cb0f8481e151d18b088782918cb1539b59586613a00c4d5f5aeed2
 SHA512:
-  metadata.gz: 31a1cc671716dee18d6841e39f3bf43c5326a423be59e118e5acf35777aaed9b29acdfdd3425d6ce48f5f5c5f8e5692fae687a592828be0a458f337498965008
-  data.tar.gz: ecef0d771882d662996aa31cfa4eaf9e8cef720c2b05eb48f872ce9c3fbe9e07dfc99427332a6bcbf5fb8054f7f00d27fc55184462a559be834635aaad358c30
+  metadata.gz: 04d106f5ee5fac49eba20ff143bb2212a1cafd5140fc04cee20958ffea0c5909d352824948badf16ec5bc8ca2a7b13b4dcf7748eb03cbd6dc8a466c6ae0f5040
+  data.tar.gz: e17171f28641ce3496c0b338b6913c96e10d9fd5ce93b7980dae6edef00e63e5f7c4dcb60ed04fed5271a474b4940d069ebcf6a00bbfd3c4e6eafa2c0c4f26ed

data/benchmark/benchmark.rb CHANGED Viewed

@@ -4,6 +4,25 @@ require 'benchmark'
 require 'pry-byebug'
 require 'awesome_print'
 require 'tensor_stream/opencl'
+require 'rbconfig'
+# Detects the host operating system family from RbConfig's 'host_os' entry.
+#
+# Returns one of :windows, :macosx, :linux or :unix. The result is memoized
+# in @os, so the lookup only happens once per process.
+#
+# Raises RuntimeError when the host_os string matches none of the known patterns.
+def os
+  @os ||= begin
+    host_os = RbConfig::CONFIG['host_os']
+    case host_os
+    when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
+      :windows
+    when /darwin|mac os/
+      :macosx
+    when /linux/
+      :linux
+    when /solaris|bsd/
+      :unix
+    else
+      # Plain RuntimeError: no Error::WebDriverError constant is defined in this
+      # script, so referencing it would fail with NameError and hide this message.
+      raise "unknown os: #{host_os.inspect}"
+    end
+  end
+end
 def tr(t, places = 1)
   if t.is_a?(Array)
@@ -77,49 +96,56 @@ conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
 bias_add = tf.nn.bias_add(large_tensor, large_tensor_bias)
 bias_add_grad = tf.gradients(bias_add, [large_tensor_bias])
+dropout = tf.nn.dropout(large_tensor, 0.8)
 puts TensorStream::Evaluator.default_evaluators
 sess2 = tf.session
-puts `cat /proc/cpuinfo | grep "model name" | head -1`
+if os == :macosx
+  puts `sysctl -n machdep.cpu.brand_string`
+else
+  puts `cat /proc/cpuinfo | grep "model name" | head -1`
+end
 device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
 puts "OpenCL device #{device.platform.to_s} #{device.name}"
 Benchmark.bmbm do |x|
-  x.report("pure ruby argmin            :") { 100.times do sess.run(argmin) end }
-  x.report("opencl argmin               :") { 100.times do sess2.run(argmin) end }
-  x.report("pure ruby bias_add_grad            :") { 100.times do sess.run(bias_add_grad) end }
-  x.report("opencl bias_add_grad               :") { 100.times do sess2.run(bias_add_grad) end }
-  x.report("pure ruby bias_add             :") { 100.times do sess.run(bias_add) end }
-  x.report("opencl bias_add                :") { 100.times do sess2.run(bias_add) end }
-  x.report("pure ruby conv2d_backprop      :") { 100.times do sess.run(conv2d_grad) end }
-  x.report("opencl conv2d_backprop         :") { 100.times do sess2.run(conv2d_grad) end }
-  x.report("pure ruby conv2d      :") { 100.times do sess.run(conv2d) end }
-  x.report("opencl conv2d         :") { 100.times do sess2.run(conv2d) end }
-  x.report("pure ruby arr index      :") { 100.times do sess.run(index) end }
-  x.report("opencl arr index         :") { 100.times do sess2.run(index) end }
-  x.report("pure ruby min            :") { 100.times do sess.run(min) end }
-  x.report("opencl min               :") { 100.times do sess2.run(min) end }
-  x.report("pure ruby sum            :") { 100.times do sess.run(sum) end }
-  x.report("opencl sum               :") { 100.times do sess2.run(sum) end }
-  x.report("pure ruby sum axis 1     :") { 100.times do sess.run(sum_axis_1) end }
-  x.report("opencl sum axis 1        :") { 100.times do sess2.run(sum_axis_1) end }
-  x.report("pure ruby split          :") { 100.times do sess.run(split) end }
-  x.report("opencl split             :") { 100.times do sess2.run(split) end }
-  x.report("pure ruby add_n          :") { 100.times do sess.run(add_n) end }
-  x.report("opencl add_n             :") { 100.times do sess2.run(add_n) end }
-  x.report("pure ruby ooo matmul     :") { 100.times do sess.run(out_of_order) end }
-  x.report("opencl    ooo matmul     :") { 100.times do sess2.run(out_of_order) end }
-  x.report("pure ruby softmax        :") { 100.times do sess.run(softmax) end }
-  x.report("opencl    softmax        :") { 100.times do sess2.run(softmax) end }
-  x.report("pure ruby matmul         :") { 100.times do sess.run(matmul) end }
-  x.report("opencl    matmul         :") { 100.times do sess2.run(matmul) end }
-  x.report("pure ruby                :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
-  x.report("opencl                   :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
-  x.report("pure ruby single function:") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
-  x.report("opencl     singlefunction:") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
-  x.report("pure ruby pow float:") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
-  x.report("opencl pow float:") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
-  x.report("pure ruby pow int:") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
-  x.report("opencl pow int:") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+  x.report("ruby argmin            :") { 100.times do sess.run(argmin) end }
+  x.report("opencl argmin          :") { 100.times do sess2.run(argmin) end }
+  x.report("ruby bias_add_grad     :") { 100.times do sess.run(bias_add_grad) end }
+  x.report("opencl bias_add_grad   :") { 100.times do sess2.run(bias_add_grad) end }
+  x.report("ruby bias_add          :") { 100.times do sess.run(bias_add) end }
+  x.report("opencl bias_add        :") { 100.times do sess2.run(bias_add) end }
+  x.report("ruby conv2d_backprop   :") { 100.times do sess.run(conv2d_grad) end }
+  x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
+  x.report("ruby conv2d            :") { 100.times do sess.run(conv2d) end }
+  x.report("opencl conv2d          :") { 100.times do sess2.run(conv2d) end }
+  x.report("ruby arr index         :") { 100.times do sess.run(index) end }
+  x.report("opencl arr index       :") { 100.times do sess2.run(index) end }
+  x.report("ruby min               :") { 100.times do sess.run(min) end }
+  x.report("opencl min             :") { 100.times do sess2.run(min) end }
+  x.report("ruby sum               :") { 100.times do sess.run(sum) end }
+  x.report("opencl sum             :") { 100.times do sess2.run(sum) end }
+  x.report("ruby sum axis 1        :") { 100.times do sess.run(sum_axis_1) end }
+  x.report("opencl sum axis 1      :") { 100.times do sess2.run(sum_axis_1) end }
+  x.report("ruby split             :") { 100.times do sess.run(split) end }
+  x.report("opencl split           :") { 100.times do sess2.run(split) end }
+  x.report("ruby add_n             :") { 100.times do sess.run(add_n) end }
+  x.report("opencl add_n           :") { 100.times do sess2.run(add_n) end }
+  x.report("ruby ooo matmul        :") { 100.times do sess.run(out_of_order) end }
+  x.report("opencl    ooo matmul   :") { 100.times do sess2.run(out_of_order) end }
+  x.report("ruby softmax           :") { 100.times do sess.run(softmax) end }
+  x.report("opencl    softmax      :") { 100.times do sess2.run(softmax) end }
+  x.report("ruby matmul            :") { 100.times do sess.run(matmul) end }
+  x.report("opencl    matmul       :") { 100.times do sess2.run(matmul) end }
+  x.report("ruby                   :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl                 :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
+  x.report("ruby single function   :") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl single function :") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+  x.report("ruby pow float         :") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl pow float       :") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+  x.report("ruby pow int           :") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl pow int         :") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+  x.report("ruby dropout           :") { 100.times do sess.run(dropout) end }
+  x.report("opencl dropout         :") { 100.times do sess2.run(dropout) end }
 end

data/benchmark_imac2015_iris.txt ADDED Viewed

@@ -0,0 +1,80 @@
+TensorStream::Evaluator::OpenclEvaluator
+TensorStream::Evaluator::RubyEvaluator
+Intel(R) Core(TM) i5-5575R CPU @ 2.80GHz
+OpenCL device Apple Intel(R) Iris(TM) Pro Graphics 6200
+Rehearsal ------------------------------------------------------------
+ruby argmin            :   0.940000   0.030000   0.970000 (  1.197240)
+opencl argmin          :   0.070000   0.020000   0.090000 (  0.093035)
+ruby bias_add_grad     :   2.390000   0.060000   2.450000 (  2.558622)
+opencl bias_add_grad   :   0.020000   0.010000   0.030000 (  0.030563)
+ruby bias_add          :   2.530000   0.070000   2.600000 (  2.749423)
+opencl bias_add        :   0.150000   0.030000   0.180000 (  0.191476)
+ruby conv2d_backprop   :   4.020000   0.060000   4.080000 (  5.306408)
+opencl conv2d_backprop :   0.040000   0.030000   0.070000 (  0.077737)
+ruby conv2d            :   0.890000   0.010000   0.900000 (  0.963062)
+opencl conv2d          :   0.030000   0.010000   0.040000 (  0.042274)
+ruby arr index         :   0.000000   0.000000   0.000000 (  0.004072)
+opencl arr index       :   0.010000   0.010000   0.020000 (  0.023981)
+ruby min               :   3.710000   0.040000   3.750000 (  4.329215)
+opencl min             :   0.160000   0.030000   0.190000 (  0.191062)
+ruby sum               :   6.930000   0.080000   7.010000 (  7.467194)
+opencl sum             :   0.010000   0.010000   0.020000 (  0.034392)
+ruby sum axis 1        :   6.920000   0.070000   6.990000 (  7.412997)
+opencl sum axis 1      :   0.020000   0.020000   0.040000 (  0.027614)
+ruby split             :   0.020000   0.000000   0.020000 (  0.022597)
+opencl split           :   0.060000   0.040000   0.100000 (  0.099309)
+ruby add_n             :   0.150000   0.000000   0.150000 (  0.162702)
+opencl add_n           :   0.020000   0.020000   0.040000 (  0.033757)
+ruby ooo matmul        :   1.670000   0.010000   1.680000 (  1.738712)
+opencl    ooo matmul   :   0.020000   0.010000   0.030000 (  0.029647)
+ruby softmax           :   0.030000   0.000000   0.030000 (  0.033050)
+opencl    softmax      :   0.020000   0.010000   0.030000 (  0.030572)
+ruby matmul            :   0.820000   0.010000   0.830000 (  0.851559)
+opencl    matmul       :   0.010000   0.010000   0.020000 (  0.026167)
+ruby                   :   2.860000   0.020000   2.880000 (  3.033034)
+opencl                 :   0.220000   0.070000   0.290000 (  0.240857)
+ruby single function   :   0.380000   0.000000   0.380000 (  0.398911)
+opencl single function :   0.150000   0.050000   0.200000 (  0.162006)
+ruby pow float         :   0.090000   0.000000   0.090000 (  0.098400)
+opencl pow float       :   0.020000   0.020000   0.040000 (  0.033370)
+ruby pow int           :   0.020000   0.000000   0.020000 (  0.023459)
+opencl pow int         :   0.020000   0.010000   0.030000 (  0.030894)
+-------------------------------------------------- total: 36.290000sec
+                               user     system      total        real
+ruby argmin            :   0.880000   0.010000   0.890000 (  0.933367)
+opencl argmin          :   0.010000   0.010000   0.020000 (  0.023140)
+ruby bias_add_grad     :   2.350000   0.050000   2.400000 (  2.539349)
+opencl bias_add_grad   :   0.010000   0.010000   0.020000 (  0.024700)
+ruby bias_add          :   2.510000   0.060000   2.570000 (  2.667330)
+opencl bias_add        :   0.150000   0.020000   0.170000 (  0.184056)
+ruby conv2d_backprop   :   3.910000   0.040000   3.950000 (  4.320383)
+opencl conv2d_backprop :   0.030000   0.020000   0.050000 (  0.058036)
+ruby conv2d            :   0.910000   0.020000   0.930000 (  1.120605)
+opencl conv2d          :   0.020000   0.010000   0.030000 (  0.034972)
+ruby arr index         :   0.000000   0.000000   0.000000 (  0.004119)
+opencl arr index       :   0.020000   0.010000   0.030000 (  0.024126)
+ruby min               :   3.670000   0.030000   3.700000 (  4.024439)
+opencl min             :   0.140000   0.030000   0.170000 (  0.178683)
+ruby sum               :   6.920000   0.050000   6.970000 (  7.314338)
+opencl sum             :   0.010000   0.020000   0.030000 (  0.024655)
+ruby sum axis 1        :   6.900000   0.050000   6.950000 (  7.332897)
+opencl sum axis 1      :   0.020000   0.020000   0.040000 (  0.026150)
+ruby split             :   0.010000   0.000000   0.010000 (  0.018866)
+opencl split           :   0.050000   0.040000   0.090000 (  0.096327)
+ruby add_n             :   0.140000   0.000000   0.140000 (  0.151006)
+opencl add_n           :   0.020000   0.010000   0.030000 (  0.025622)
+ruby ooo matmul        :   1.670000   0.010000   1.680000 (  1.732486)
+opencl    ooo matmul   :   0.020000   0.020000   0.040000 (  0.027051)
+ruby softmax           :   0.030000   0.000000   0.030000 (  0.032848)
+opencl    softmax      :   0.010000   0.010000   0.020000 (  0.026403)
+ruby matmul            :   0.810000   0.000000   0.810000 (  0.866297)
+opencl    matmul       :   0.020000   0.020000   0.040000 (  0.026677)
+ruby                   :   2.870000   0.020000   2.890000 (  3.237224)
+opencl                 :   0.240000   0.080000   0.320000 (  0.302463)
+ruby single function   :   0.390000   0.010000   0.400000 (  0.470700)
+opencl single function :   0.150000   0.060000   0.210000 (  0.228528)
+ruby pow float         :   0.090000   0.000000   0.090000 (  0.113073)
+opencl pow float       :   0.020000   0.010000   0.030000 (  0.036938)
+ruby pow int           :   0.020000   0.000000   0.020000 (  0.023728)
+opencl pow int         :   0.020000   0.020000   0.040000 (  0.031909)

data/lib/tensor_stream/opencl.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require "tensor_stream/opencl/version"
 require 'tensor_stream'
+require "tensor_stream/opencl/utils"
 require "tensor_stream/opencl/opencl_evaluator"
 module TensorStream

data/lib/tensor_stream/opencl/array_ops.rb CHANGED Viewed

@@ -10,13 +10,13 @@ module TensorStream
             shape = if %i[zeros_like ones_like].include?(tensor.operation)
                       inputs[0].shape
                     elsif !inputs[0].nil?
-                      read_final_result(complete_eval(inputs[0], context))
+                      complete_eval(inputs[0], context).buffer.to_a
                     else
                       tensor.shape.shape
                     end
             cache_key = "cons_#{tensor.name}_#{tensor.data_type}_#{shape}"
             @context[:_cache][:_cl_buffers][cache_key] ||= begin
-              buffer = allocate_narray_for_type(tensor.data_type, shape.reduce(:*) || 1)
+              buffer = OpenCLBuffer.allocate_narray_for_type(tensor.data_type, shape.reduce(:*) || 1)
               if %i[zeros zeros_like].include?(tensor.operation)
                 buffer.fill!(0)
               else
@@ -47,7 +47,7 @@ module TensorStream
             buffer = if cl_buffer
                        cl_buffer.buffer
                      else
-                       allocate_narray_for_type(tensor.data_type, narray_size)
+                       OpenCLBuffer.allocate_narray_for_type(tensor.data_type, narray_size)
                      end
             buffer.fill!(value.buffer[0])
@@ -365,8 +365,8 @@ module TensorStream
           end
           register_op :reshape do |context, tensor, inputs|
-            arr = inputs[0]
-            new_shape = read_final_result(complete_eval(inputs[1], context))
+            arr, new_shape = inputs
+            new_shape = complete_eval(new_shape, context).buffer.to_a
             shape = if new_shape.size.zero? && arr.buffer.size == 1
                       new_shape
@@ -389,9 +389,9 @@ module TensorStream
               res
             else
               rank = inputs[0].shape.size
-              perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
+              perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer!
               new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
-              output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
+              output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name, allocate_host: true)
               transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
               write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
@@ -442,6 +442,36 @@ module TensorStream
               a
             end
           end
+          # Produces a 1-D buffer with the sequence start, start + delta, ... stopping before limit.
+          # The three inputs are read back to the host and the first element of each is used as
+          # start, limit and delta. A zero limit is treated as a single-bound call: the sequence
+          # then runs from 0 towards the original start value.
+          # The generated buffer is memoized in the CL buffer cache under a key built from the
+          # bounds, the step and the tensor data type.
+          # Raises RuntimeError when delta is zero or its sign points away from limit.
+          register_op :range do |context, tensor, inputs|
+            start, limit, delta = complete_eval(inputs, context).map { |p| p.buffer.to_a.first }
+            if limit.zero?
+              limit = start
+              start = 0
+            end
+            # Cast the step to the output type before validating it: a fractional step
+            # truncated to an integer can become 0, and the generation loop below would
+            # then never reach limit.
+            floating = fp_type?(tensor.options[:output_type])
+            delta = floating ? delta.to_f : delta.to_i
+            raise " delta !=0 " if delta.zero?
+            raise " Requires start <= limit when delta > 0" if (start > limit) && delta > 0
+            raise " Requires start >= limit when delta < 0" if (start < limit) && delta < 0
+            cache_key = "range_#{start}_#{limit}_#{delta}_#{tensor.data_type}"
+            @context[:_cache][:_cl_buffers][cache_key] ||= begin
+              cur_step = floating ? start.to_f : start.to_i
+              r = []
+              Kernel.loop do
+                break if start == limit
+                break if (start < limit) && (cur_step >= limit)
+                break if (start > limit) && (cur_step <= limit)
+                r << cur_step
+                cur_step += delta
+              end
+              convert_to_opencl(r, [r.size], data_type: tensor.options[:output_type], name: tensor.name)
+            end
+          end
         end
       end
     end

data/lib/tensor_stream/opencl/images_ops.rb CHANGED Viewed

@@ -24,7 +24,7 @@ module TensorStream
               end
             end
-            output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
+            output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}", allocate_host: true)
             image.grayscale! if channels == 1
             image.pixels.each_with_index do |pixel, index|

data/lib/tensor_stream/opencl/kernels/gemm.cl CHANGED Viewed

@@ -6,8 +6,9 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
                       __global <%= c_dtype %>* C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int index = get_global_id(0);
+    const int globalRow = get_global_id(1); // Row ID of C (0..M)
+    const int globalCol = get_global_id(2); // Col ID of C (0..N)
     // Compute a single element (loop over K)
     <%= c_dtype %> acc = 0.0f;
@@ -16,9 +17,9 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
         int b_index = k*N + globalCol;
 <% if ta %>a_index = M*k + globalRow;<% end %>
 <% if tb %>b_index = globalCol*K + k;<% end %>
-        acc += A[a_index] * B[b_index];
+        acc += A[a_index + index * <%= n_a %>] * B[b_index + index * <%= n_b %>];
     }
     // Store the result
-    C[globalRow*N + globalCol] = acc;
+    C[index * <%= n %> + globalRow*N + globalCol] = acc;
 }

data/lib/tensor_stream/opencl/kernels/random_uniform.cl ADDED Viewed

@@ -0,0 +1,7 @@
+% c_dtype = dtype_to_c_type(dtype)
+// Fills C with table-driven pseudo-random values rescaled into the range given by min and max.
+// Each work item reads one entry of rand_table, at offset seed_ptr plus its global id wrapped
+// modulo the template value tsize, and writes entry * (max - min) + min to C at its global id.
+// NOTE(review): presumably rand_table holds values in [0, 1) and tsize equals its length - confirm against the host code.
+// NOTE(review): a negative seed_ptr would give a negative index after the modulo - confirm callers only pass non-negative offsets.
+__kernel void random_uniform_<%= dtype %>(const int seed_ptr, const float min, const float max, __global const <%= c_dtype %> *rand_table, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int id = get_global_id(0);
+    <%= c_dtype %> rand_value = rand_table[ (seed_ptr + id) % <%= tsize %>];
+    C[id] = rand_value * (max - min) + min;
+}

data/lib/tensor_stream/opencl/math_ops.rb CHANGED Viewed

@@ -54,26 +54,28 @@ module TensorStream
           register_op :mat_mul do |_context, tensor, inputs|
             a, b = inputs
-            m = a.shape[0]
-            n = b.shape[1]
-            v = b.shape[0]
-            k = a.shape[1]
+            a_matrix_shape = a.shape.dup
+            b_matrix_shape = b.shape.dup
+            k = a_matrix_shape.pop
+            m = a_matrix_shape.pop
+            n = b_matrix_shape.pop
+            v = b_matrix_shape.pop
             if tensor.options[:transpose_a]
-              m = a.shape[1]
-              k = a.shape[0]
+              m, k = k, m
             end
             if tensor.options[:transpose_b]
-              n = b.shape[0]
-              v = b.shape[1]
+              n, v = v, n
             end
-            result_shape = [m, n]
+            result_shape = [a_matrix_shape.first, m, n].compact
+            work_group = [a_matrix_shape.first || 1, m, n]
             raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
             raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
-            raise "#{tensor.inputs[0].name} unsupported rank" if b.shape.size != 2 || a.shape.size!=2
+            raise "#{tensor.inputs[0].name} unsupported rank" if b.shape.size > 3 || a.shape.size > 3
             raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
             dtype = tensor.data_type
@@ -85,7 +87,7 @@ module TensorStream
             cl_k = OpenCL::Int1.new(k)
             event_wait_list = build_event_wait_list([a, b])
-            output_buffer.op = _cl_program('gemm', ta: !!tensor.options[:transpose_a], tb: !!tensor.options[:transpose_b], dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+            output_buffer.op = _cl_program('gemm', ta: !!tensor.options[:transpose_a], tb: !!tensor.options[:transpose_b], n: m * n, n_a: m * k, n_b: n * v, dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_k, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
             output_buffer
           end

data/lib/tensor_stream/opencl/opencl_buffer.rb CHANGED Viewed

@@ -1,7 +1,48 @@
 module TensorStream
   # Buffer used by the OpenCL evaluator
   class OpenCLBuffer < Buffer
+    # Stand-in for a host-side buffer that has not been allocated yet.
+    # It only records the data type and the element count, and can report the
+    # per-element byte size for that data type.
+    class LazyBuffer
+      attr_reader :data_type, :size
+      # data_type - symbol naming the element type (e.g. :float32)
+      # size      - number of elements the eventual buffer will hold
+      def initialize(data_type, size)
+        @data_type = data_type
+        @size = size
+      end
+      # Byte size of a single element of this buffer's data type.
+      def element_size
+        buffer_size_for_type(data_type)
+      end
+      # Maps a data type symbol to its per-element byte size.
+      # Returns nil for :unknown and raises RuntimeError for unsupported types.
+      def buffer_size_for_type(data_type)
+        case data_type
+        when :float64 then 8
+        when :float, :float32, :float16 then 4
+        when :int, :int32, :int64, :uint64, :uint32 then 4 # NArray does not have 64 bit int types
+        when :int16, :uint16 then 2
+        when :uint8, :int8, :boolean, :string then 1
+        when :unknown then nil
+        else raise "unsupported type #{data_type}"
+        end
+      end
+    end
     include ArrayOpsHelper
+    include TensorStream::CLEventHelpers
     attr_accessor :shape, :buffer, :cl_buffer, :op, :owner
@@ -24,15 +65,33 @@ module TensorStream
     end
     def inspect
-      "CLBuffer(shape: #{shape || "?"} data_type: #{data_type}, cl_allocated: #{cl_buffer ? cl_buffer.size : 'unallocated'}) -> raw: #{buffer.to_a}"
+      "CLBuffer(name: #{name} shape: #{shape || "?"} data_type: #{data_type}, cl_allocated: #{cl_buffer ? cl_buffer.size : 'unallocated'}) -> raw: #{buffer.to_a}"
+    end
+    # Returns the host-side NArray for this buffer.
+    # An existing NArray is returned untouched. Otherwise a LazyBuffer placeholder is
+    # replaced by a freshly allocated NArray, and the device contents are copied into
+    # it with a blocking read that waits on this buffer's pending events.
+    def buffer!
+      host = buffer
+      return host if host.is_a?(NArray)
+      if host.is_a?(LazyBuffer)
+        @buffer = OpenCLBuffer.allocate_narray_for_type(host.data_type, host.size)
+      end
+      queue = command_queue
+      pending = build_event_wait_list([self])
+      queue.enqueue_read_buffer(cl_buffer, @buffer, blocking: true, event_wait_list: pending)
+      @buffer
+    end
+    def command_queue
+      @command_queue ||= begin
+        first_op = op.is_a?(Array) ? op.first : op
+        first_op.command_queue
+      end
     end
     def to_ruby
+      buffer! if buffer.is_a?(LazyBuffer)
       return [] if buffer.empty?
       if dirty
-        op.command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
-        op.command_queue.finish
+        command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
+        command_queue.finish
         self.dirty = false
       end
@@ -54,5 +113,28 @@ module TensorStream
     def self.nil_buffer(owner, name, data_type)
       OpenCLBuffer.new(owner, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
     end
+    # Allocates a host-side NArray with narray_size elements for the given data type.
+    # Several data types share one NArray constructor (for example every integer
+    # type from :int to :uint32 uses NArray.int, and :boolean and :string use NArray.byte).
+    # Returns nil for :unknown and raises RuntimeError for unsupported types.
+    def self.allocate_narray_for_type(data_type, narray_size)
+      case data_type
+      when :float, :float32, :float16 then NArray.sfloat(narray_size)
+      when :float64 then NArray.float(narray_size)
+      when :int, :int32, :int64, :uint64, :uint32 then NArray.int(narray_size) # NArray does not have 64 bit int types
+      when :int16, :uint16 then NArray.sint(narray_size)
+      when :uint8, :int8, :boolean, :string then NArray.byte(narray_size)
+      when :unknown then nil
+      else raise "unsupported type #{data_type}"
+      end
+    end
   end
 end

data/lib/tensor_stream/opencl/opencl_evaluator.rb CHANGED Viewed

@@ -13,6 +13,7 @@ require 'tensor_stream/opencl/math_ops'
 require 'tensor_stream/opencl/nn_ops'
 require 'tensor_stream/opencl/images_ops'
 require 'tensor_stream/opencl/array_ops'
+require 'tensor_stream/opencl/random_ops'
 require 'tensor_stream/helpers/op_helper'
 module TensorStream
@@ -49,6 +50,8 @@ module TensorStream
       include TensorStream::OpenCLHelpers::NNOps
       include TensorStream::OpenCLHelpers::ImagesOps
       include TensorStream::OpenCLHelpers::ArrayOps
+      include TensorStream::OpenCLHelpers::RandomOps
+      include TensorStream::CLEventHelpers
       def initialize(session, device, thread_pool: nil, log_intermediates: false)
         super
@@ -159,6 +162,9 @@ module TensorStream
           return [] if buffer.buffer.nil?
           return buffer if buffer.buffer.size.zero?
+          # lazy allocate
+          buffer.buffer = OpenCLBuffer.allocate_narray_for_type(buffer.buffer.data_type, buffer.buffer.size) if buffer.buffer.is_a?(OpenCLBuffer::LazyBuffer)
           buffer.op = _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
           buffer
         end
@@ -167,12 +173,19 @@ module TensorStream
       def complete_eval(tensor, context)
         return nil if tensor.nil?
-        buffer = enqueue_buffer_read(tensor, context)
-        events = build_event_wait_list([buffer])
+        buffers = if tensor.is_a?(Array)
+                    tensor.map { |t|
+                      enqueue_buffer_read(t, context)
+                    }
+                  else
+                    [enqueue_buffer_read(tensor, context)]
+                  end
+        events = build_event_wait_list(buffers)
         # puts "** wait #{tensor.name} **"
         OpenCL.wait_for_events(events) unless events.empty?
         # puts "** done #{tensor.name} **"
-        buffer
+        tensor.is_a?(Array) ? buffers : buffers.first
       end
       def self.query_devices_with_score
@@ -355,9 +368,13 @@ module TensorStream
       register_op :identity do |_context, tensor, inputs|
         value = inputs[0]
-        buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
-        buffer.op = build_event_wait_list(inputs)
-        buffer
+        if value.is_a?(OutputGroup)
+          value
+        else
+          buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
+          buffer.op = build_event_wait_list(inputs)
+          buffer
+        end
       end
       register_op :assign, noop: true do |context, tensor, inputs|
@@ -773,9 +790,9 @@ module TensorStream
                                  value
                                elsif data_type == :string && shape.empty?
                                  cl_buffer_size = value[0].bytesize
-                                 allocate_narray_for_type(data_type, value[0].bytesize)
+                                 OpenCLBuffer.allocate_narray_for_type(data_type, value[0].bytesize)
                                else
-                                 allocate_narray_for_type(data_type, narray_size)
+                                OpenCLBuffer.allocate_narray_for_type(data_type, narray_size)
                                end
                       return nil if buffer.nil?
@@ -818,39 +835,17 @@ module TensorStream
         cl_object
       end
-      def allocate_narray_for_type(data_type, narray_size)
-        case data_type
-        when :float, :float32, :float16
-          NArray.sfloat(narray_size)
-        when :float64
-          NArray.float(narray_size)
-        when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
-          NArray.int(narray_size)
-        when :int16, :uint16
-          NArray.sint(narray_size)
-        when :uint8, :int8
-          NArray.byte(narray_size)
-        when :boolean
-          NArray.byte(narray_size)
-        when :string
-          NArray.byte(narray_size)
-        when :unknown
-          nil
-        else
-          raise "unsupported type #{data_type}"
-        end
-      end
-      def _create_result_buffer(data_type, shape, name)
+      def _create_result_buffer(data_type, shape, name, allocate_host: false)
         return OpenCLBuffer.nil_buffer(self, name, data_type) if shape == [0]
         cache_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
         @context[:_cache][:_cl_buffers][cache_key] ||= begin
           # puts "create result buffer #{cache_key}"
           size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
-          buffer =  allocate_narray_for_type(data_type, size)
-          cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
-          OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
+          lazy_buffer = !allocate_host ? OpenCLBuffer::LazyBuffer.new(data_type, size) : OpenCLBuffer.allocate_narray_for_type(data_type, size)
+          cl_buffer = _opencl_context.create_buffer(size * lazy_buffer.element_size)
+          OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: lazy_buffer, cl_buffer: cl_buffer, name: name)
         end
       end
@@ -859,7 +854,7 @@ module TensorStream
         cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
         @context[:_cache][:_cl_buffers][cache_key] ||= begin
           size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
-          buffer = allocate_narray_for_type(data_type, size)
+          buffer = OpenCLBuffer.allocate_narray_for_type(data_type, size)
           if parent_buffer.cl_buffer.associated_memobject.nil?
             start = index * buffer.size * buffer.element_size
@@ -890,7 +885,7 @@ module TensorStream
         cache_key = "_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
         @context[:_cache][:_cl_buffers][cache_key] ||= begin
           size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
-          buffer = allocate_narray_for_type(data_type, size)
+          buffer = OpenCLBuffer.allocate_narray_for_type(data_type, size)
           if parent_buffer.cl_buffer.associated_memobject.nil?
             region = OpenCL::BufferRegion::new(start, region_size_in_bytes)
@@ -980,14 +975,6 @@ module TensorStream
         shape.is_a?(Array) ? shape.size : 0
       end
-      def build_event_wait_list(inputs)
-        if inputs.is_a?(Array)
-          inputs.flatten.compact.map(&:op).compact.uniq
-        else
-          inputs.op ? [inputs.op] : []
-        end
-      end
       def resolve_placeholder(placeholder, _execution_context = {})
         return nil if placeholder.nil?
         return placeholder unless placeholder.is_a?(Placeholder)

data/lib/tensor_stream/opencl/random_ops.rb ADDED Viewed

@@ -0,0 +1,54 @@
+module TensorStream
+  module OpenCLHelpers
+    # Collection of random number ops (currently random_uniform) for interfacing with OpenCL kernels
+    module RandomOps
+      RAND_TABLE_SIZE = 1024 * 1024 # number of precomputed random values in the cached table
+      def RandomOps.included(klass) # hook: registers the ops and helper below on the including class
+        klass.class_eval do
+          register_op :random_uniform do |context, tensor, inputs|
+            maxval = tensor.options.fetch(:maxval, 1) # upper bound option, defaults to 1
+            minval = tensor.options.fetch(:minval, 0) # lower bound option, defaults to 0
+            seed = tensor.options[:seed]
+            rand_buffer = @context[:_cache][:_cl_buffers]["_rand"] ||= begin # NOTE(review): key "_rand" ignores data type and seed, so the table built by the first call is reused by all later ones — confirm intended
+              @context[:_cache][:_cl_buffers]["_rand_seed_ptr"] = 0 # restart the table offset whenever the table is built
+              random = _get_randomizer(tensor, seed)
+              rand_table = RAND_TABLE_SIZE.times.map { random.rand } # Random#rand without arguments yields floats in [0, 1)
+              convert_to_opencl(rand_table, [RAND_TABLE_SIZE], data_type: tensor.data_type, name: "rand_#{tensor.data_type}")
+            end
+            @context[:_cache][:_cl_buffers]["_rand_seed_ptr"] ||= 0
+            seed_ptr = @context[:_cache][:_cl_buffers]["_rand_seed_ptr"] # presumably the kernel's starting offset into the table — TODO confirm against random_uniform.cl
+            shape = read_final_result(complete_eval(inputs[0], context)) # shape comes from the evaluated first input...
+            shape = shape || tensor.shape.shape # ...falling back to the tensor's own shape when that is nil
+            workgroup = [shape.reduce(:*) || 1 ] # element count; reduce returns nil for an empty shape, hence the 1
+            cl_seed_ptr = OpenCL::Int1.new(seed_ptr)
+            cl_min = OpenCL::Float1.new(minval) # NOTE(review): bounds are always passed as Float1, whatever tensor.data_type is — confirm
+            cl_max = OpenCL::Float1.new(maxval)
+            @context[:_cache][:_cl_buffers]["_rand_seed_ptr"] = (seed_ptr + (shape.reduce(:*) || 0) ) % RAND_TABLE_SIZE # advance the offset by the element count, wrapping at the table size
+            buffer = _create_result_buffer(tensor.data_type, shape, tensor.name) # its cl_buffer is passed as the kernel's last argument (presumably the output)
+            buffer.op = _cl_program("random_uniform", dtype: tensor.data_type, tsize: RAND_TABLE_SIZE).send(:"random_uniform_#{tensor.data_type}", _opencl_queue, workgroup, cl_seed_ptr, cl_min, cl_max, rand_buffer.cl_buffer, buffer.cl_buffer)
+            buffer
+          end
+          def _get_randomizer(tensor, seed) # picks the Random instance used to fill the table, based on graph-level and op-level seeds
+            if tensor.graph.random_seed && seed
+              Random.new(tensor.graph.random_seed ^ seed) # both seeds set: fresh generator seeded with their XOR (not memoized)
+            elsif tensor.graph.random_seed
+              @session.randomizer[tensor.graph.object_id] ||= Random.new(tensor.graph.random_seed) # graph seed only: one generator memoized per graph
+              @session.randomizer[tensor.graph.object_id]
+            elsif seed
+              @session.randomizer[tensor.operation] ||= Random.new(seed) # op seed only: one generator memoized per tensor.operation
+              @session.randomizer[tensor.operation]
+            else
+              Random.new # no seed at all: default-seeded generator
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/tensor_stream/opencl/utils.rb ADDED Viewed

@@ -0,0 +1,27 @@
+module TensorStream
+  class OpenCLUtil
+    ##
+    # Initializes an OpenCL helper bound to the given session
+    def initialize(session)
+      @session = session
+    end
+    ##
+    # Returns the summed #size of the distinct cl_buffers found in the last session context's buffer cache
+    def get_memory_usage
+      cl_buffer_uniq_set = Set.new # object_ids of cl_buffers already counted (Set needs 'set' loaded on Rubies before 3.2)
+      @session.last_session_context[:_cache][:_cl_buffers].inject(0) do |sum, elem| # NOTE(review): assumes every cached value responds to #cl_buffer; the random_uniform op stores an Integer under "_rand_seed_ptr" in a [:_cache][:_cl_buffers] hash — confirm it cannot reach this loop
+        cl_buffer_uniq_set.add?(elem[1].cl_buffer.object_id) ? sum + elem[1].cl_buffer.size : sum # Set#add? returns nil for an id seen before, so a shared cl_buffer is counted once
+      end
+    end
+  end
+  module CLEventHelpers # mixin providing build_event_wait_list
+    def build_event_wait_list(inputs) # returns the non-nil #op values of the input(s) as an array
+      if inputs.is_a?(Array)
+        inputs.flatten.compact.map(&:op).compact.uniq # nested arrays are flattened; nil inputs, nil ops and duplicates are dropped
+      else
+        inputs.op ? [inputs.op] : [] # single input: wrap its op, or return an empty list when it has none
+      end
+    end
+  end
+end

data/lib/tensor_stream/opencl/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module TensorStream
   module Opencl
-    VERSION = "0.3.0"
+    VERSION = "0.3.1"
   end
 end

data/samples/mnist_data_2.3.rb CHANGED Viewed

@@ -53,11 +53,16 @@ b5 = tf.variable(tf.zeros([10]))
 x_ = tf.reshape(x, [-1, 784])
 y1 = tf.nn.relu(tf.matmul(x_, w1) + b1)
-y2 = tf.nn.relu(tf.matmul(y1, w2) + b2)
-y3 = tf.nn.relu(tf.matmul(y2, w3) + b3)
-y4 = tf.nn.relu(tf.matmul(y3, w4) + b4)
-ylogits = tf.matmul(y4, w5) + b5
+Y1d = tf.nn.dropout(y1, pkeep)
+y2 = tf.nn.relu(tf.matmul(Y1d, w2) + b2)
+Y2d = tf.nn.dropout(y2, pkeep)
+y3 = tf.nn.relu(tf.matmul(Y2d, w3) + b3)
+Y3d = tf.nn.dropout(y3, pkeep)
+y4 = tf.nn.relu(tf.matmul(Y3d, w4) + b4)
+Y4d = tf.nn.dropout(y4, pkeep)
+ylogits = tf.matmul(Y4d, w5) + b5
 # model
 y = tf.nn.softmax(ylogits)

data/samples/mnist_data_3.0.rb CHANGED Viewed

@@ -85,8 +85,8 @@ y3 = tf.nn.relu(tf.nn.conv2d(y2, w3, [1, stride, stride, 1], 'SAME') + b3)
 # reshape the output from the third convolution for the fully connected layer
 yy = tf.reshape(y3, [-1, 7 * 7 * M])
 y4 = tf.nn.relu(tf.matmul(yy, w4) + b4)
-ylogits = tf.matmul(y4, w5) + b5
+YY4 = tf.nn.dropout(y4, pkeep)
+ylogits = tf.matmul(YY4, w5) + b5
 # model
 y = tf.nn.softmax(ylogits, name: 'out')

data/tensor_stream-opencl.gemspec CHANGED Viewed

@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "awesome_print"
   spec.add_development_dependency "mnist-learn"
   spec.add_development_dependency "simplecov"
-  spec.add_dependency "tensor_stream", "1.0.6"
+  spec.add_dependency "tensor_stream", "1.0.7"
   spec.add_dependency "opencl_ruby_ffi"
   spec.add_dependency "oily_png"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream-opencl
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Joseph Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-03-23 00:00:00.000000000 Z
+date: 2019-04-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -114,14 +114,14 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 1.0.6
+        version: 1.0.7
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 1.0.6
+        version: 1.0.7
 - !ruby/object:Gem::Dependency
   name: opencl_ruby_ffi
   requirement: !ruby/object:Gem::Requirement
@@ -167,6 +167,7 @@ files:
 - README.md
 - Rakefile
 - benchmark/benchmark.rb
+- benchmark_imac2015_iris.txt
 - benchmark_intel.txt
 - benchmark_ryzen.txt
 - benchmark_ryzen_nvidia.txt
@@ -219,6 +220,7 @@ files:
 - lib/tensor_stream/opencl/kernels/pack.cl
 - lib/tensor_stream/opencl/kernels/pow.cl
 - lib/tensor_stream/opencl/kernels/prod.cl
+- lib/tensor_stream/opencl/kernels/random_uniform.cl
 - lib/tensor_stream/opencl/kernels/real_div.cl
 - lib/tensor_stream/opencl/kernels/reciprocal.cl
 - lib/tensor_stream/opencl/kernels/reduce_axis.cl
@@ -250,6 +252,8 @@ files:
 - lib/tensor_stream/opencl/opencl_device.rb
 - lib/tensor_stream/opencl/opencl_evaluator.rb
 - lib/tensor_stream/opencl/opencl_template_helper.rb
+- lib/tensor_stream/opencl/random_ops.rb
+- lib/tensor_stream/opencl/utils.rb
 - lib/tensor_stream/opencl/version.rb
 - samples/build_mnist_model.rb
 - samples/classify.rb