tensor_stream-opencl 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +89 -0
- data/lib/tensor_stream/opencl/array_ops.rb +30 -6
- data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -2
- data/lib/tensor_stream/opencl/math_ops.rb +3 -1
- data/lib/tensor_stream/opencl/opencl_buffer.rb +3 -2
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +112 -61
- data/lib/tensor_stream/opencl/opencl_template_helper.rb +12 -2
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/iris.rb +0 -2
- data/samples/mnist_data_2.1.rb +99 -0
- data/samples/mnist_data_2.2.rb +98 -0
- data/samples/multigpu.rb +27 -13
- data/tensor_stream-opencl.gemspec +1 -1
- metadata +7 -6
- data/Gemfile.lock +0 -70
- data/samples/mnist_data.rb +0 -65
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2f7c2e06a5711e3efc8503de82f4c836af70c3b0dfd6ce0f4790f0bb6d3abcb9
+  data.tar.gz: c103f23ba5d27f3a6356ed28b10966b8333f9fb3fabc203924ce357c4c0523c8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 637ede65bf27b9ce06a755e344e58567c4d1e83e4831115e872d6f2ca0ff778f49f4d4e60af643a920fcf3a1b9033078b0c81f6e6e0f62f2e31f8f9ac4fee89b
+  data.tar.gz: af8482a75b98db484c074c2862d455709ed5563e596819ad445a0c502467c0b5189eef04abbf55060dcbc0640289ce839715b19b3332aa9c21217345726ac3f3
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
@@ -0,0 +1,89 @@
+AllCops:
+  Exclude:
+    - samples/*
+    - bin/*
+    - spec/**/*
+    - tensor_stream.gemspec
+    - Rakefile
+
+Naming/AccessorMethodName:
+  Exclude:
+    - lib/tensor_stream/utils.rb
+
+Style/StringLiterals:
+  Enabled: false
+
+Layout/TrailingBlankLines:
+  Enabled: false
+
+Metrics/LineLength:
+  Max: 200
+
+Metrics/AbcSize:
+  Enabled: false
+
+Metrics/PerceivedComplexity:
+  Enabled: false
+
+Metrics/MethodLength:
+  Enabled: false
+
+Metrics/CyclomaticComplexity:
+  Enabled: false
+
+Metrics/BlockLength:
+  Exclude:
+    - lib/tensor_stream/math_gradients.rb
+
+Naming/AccessorMethodName:
+  Exclude:
+    - lib/tensor_stream.rb
+    - lib/tensor_stream/control_flow.rb
+    - lib/tensor_stream/graph.rb
+    - lib/tensor_stream/operation.rb
+
+Style/Documentation:
+  Exclude:
+    - lib/tensor_stream/version.rb
+    - lib/tensor_stream/trainer.rb
+    - lib/tensor_stream/nn/nn_ops.rb
+    - lib/tensor_stream/evaluator/evaluator.rb
+
+Lint/UnusedMethodArgument:
+  Exclude:
+    - lib/tensor_stream/train/saver.rb
+    - lib/tensor_stream/ops.rb
+
+Metrics/ParameterLists:
+  Max: 8
+
+Style/PerlBackrefs:
+  Enabled: false
+
+Style/RegexpLiteral:
+  Enabled: false
+
+Naming/MemoizedInstanceVariableName:
+  Enabled: false
+
+Metrics/ModuleLength:
+  Max: 200
+
+Metrics/ClassLength:
+  Max: 250
+  Exclude:
+    - lib/tensor_stream/evaluator/ruby_evaluator.rb
+
+Naming/VariableNumber:
+  Enabled: false
+
+Style/DoubleNegation:
+  Enabled: false
+
+Style/TrailingCommaInHashLiteral:
+  Enabled: false
+
+Naming/UncommunicativeMethodParamName:
+  Exclude:
+    - lib/tensor_stream/evaluator/ruby_evaluator.rb
+    - lib/tensor_stream/ops.rb
data/lib/tensor_stream/opencl/array_ops.rb
CHANGED
@@ -4,6 +4,28 @@ module TensorStream
   module ArrayOps
     def ArrayOps.included(klass)
      klass.class_eval do
+
+        # fast cached 0/1 constant fill
+        register_op %i[zeros ones zeros_like ones_like] do |context, tensor, inputs|
+          shape = if %i[zeros_like ones_like].include?(tensor.operation)
+                    inputs[0].shape
+                  elsif !inputs[0].nil?
+                    read_final_result(complete_eval(inputs[0], context))
+                  else
+                    tensor.shape.shape
+                  end
+          cache_key = "cons_#{tensor.name}_#{tensor.data_type}_#{shape}"
+          @context[:_cache][:_cl_buffers][cache_key] ||= begin
+            buffer = allocate_narray_for_type(tensor.data_type, shape.reduce(:*) || 1)
+            if %i[zeros zeros_like].include?(tensor.operation)
+              buffer.fill!(0)
+            else
+              buffer.fill!(1)
+            end
+            convert_to_opencl(buffer, shape, data_type: tensor.data_type, name: tensor.name)
+          end
+        end
+
        register_op :expand_dims, buffer: true do |_context, tensor, inputs|
          axis = inputs[1].buffer[0]
          shape = inputs[0].shape.dup
@@ -17,8 +39,10 @@ module TensorStream
          shape = inputs[0]
          value = inputs[1]

-
-
+          fill_shape = shape.nil? ? tensor.shape.shape : shape.buffer.to_a
+          narray_size = fill_shape.reduce(:*) || 1
+
+          cl_buffer = get_cached_buffer(tensor.name, fill_shape)

          buffer = if cl_buffer
                     cl_buffer.buffer
@@ -27,7 +51,7 @@ module TensorStream
                   end

          buffer.fill!(value.buffer[0])
-          convert_to_opencl(buffer,
+          convert_to_opencl(buffer, fill_shape, data_type: tensor.data_type, name: tensor.name)
        end

        register_op :split do |context, tensor, inputs|
@@ -119,7 +143,7 @@ module TensorStream
          piece_size = shape.reduce(:*) || 1
          work_group = [piece_size]
          cl_offset = OpenCL::Int1.new(offset)
-
+
          _cl_program('split_n', axis: axis,
                                 div: divisors,
                                 mul: multipliers,
@@ -218,7 +242,7 @@ module TensorStream
            shape = shape.map { |s| s == 1 ? nil : s }
          end

-          OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type,
+          OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type,
                           shape: shape.compact, buffer: arr.buffer,
                           cl_buffer: arr.cl_buffer,
                           op: arr.op)
@@ -350,7 +374,7 @@ module TensorStream
            TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
          end

-          OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type,
+          OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type,
                           shape: shape, buffer: arr.buffer,
                           cl_buffer: arr.cl_buffer,
                           op: arr.op)
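Note: the new zeros/ones handler memoizes one constant buffer per (name, dtype, shape) in the evaluator's buffer cache, so repeated evaluations reuse the same device memory. A minimal pure-Ruby sketch of the memoization pattern — the `constant_buffer` helper and plain-Array backing are illustrative, not part of the gem:

    cache = {}

    def constant_buffer(cache, name, data_type, shape, value)
      key = "cons_#{name}_#{data_type}_#{shape}"             # same key scheme as above
      cache[key] ||= Array.new(shape.reduce(:*) || 1, value) # stands in for NArray + fill!
    end

    a = constant_buffer(cache, 'zeros_1', :float32, [2, 2], 0) # allocates and fills
    b = constant_buffer(cache, 'zeros_1', :float32, [2, 2], 0) # cache hit
    a.equal?(b) # => true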
data/lib/tensor_stream/opencl/kernels/apply_adam.cl
CHANGED
@@ -12,9 +12,9 @@
                         __global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
     // Get the index of the current element to be processed
     const int index = get_global_id(0);
-    <%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
+    <%= c_dtype %> alpha = learning_rate[0] * sqrt((<%= c_dtype %>)1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);

     momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);
     v[index] += (grad[index] * grad[index] - v[index]) * (1.0 - beta2[0]);
-    output[index] -= (momentum[index] * alpha) / ( sqrt(v[index]) + epsilon[0] );
+    output[index] -= (momentum[index] * alpha) / ( sqrt((<%= c_dtype %>)v[index]) + epsilon[0] );
 }
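Note on the two sqrt fixes: in OpenCL C, `sqrt(1.0 - x)` contains a double literal, so the expression is promoted to double and the kernel can fail to build on devices without 64-bit float support when `<%= c_dtype %>` is `float`; the cast keeps the math in the kernel's element type (this rationale is inferred, not stated in the release). The `.cl` kernels are ERB templates, so the rendered line looks like this — a sketch with made-up variable names:

    require 'erb'

    c_dtype = 'float' # what the dtype mapping later in this diff emits for :float32
    template = '<%= c_dtype %> alpha = lr[0] * sqrt((<%= c_dtype %>)1.0 - b2p[0]);'
    puts ERB.new(template).result(binding)
    # => float alpha = lr[0] * sqrt((float)1.0 - b2p[0]);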
data/lib/tensor_stream/opencl/math_ops.rb
CHANGED
@@ -80,8 +80,10 @@ module TensorStream

        transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
        transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
-        event_wait_list = build_event_wait_list(
+        event_wait_list = build_event_wait_list([a, b])
+
        output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+
        output_buffer
      end

data/lib/tensor_stream/opencl/opencl_buffer.rb
CHANGED
@@ -3,15 +3,16 @@ module TensorStream
  class OpenCLBuffer < Buffer
    include ArrayOpsHelper

-    attr_accessor :shape, :buffer, :cl_buffer, :op
+    attr_accessor :shape, :buffer, :cl_buffer, :op, :owner

-    def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
+    def initialize(owner, data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
      @data_type = data_type
      @shape = shape
      @buffer = buffer
      @cl_buffer = cl_buffer
      @name = name
      @op = op
+      @owner = owner
    end

    def total_elements
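Note: every `OpenCLBuffer.new` call site in this diff now passes the evaluator as the first positional argument, so a buffer can be traced back to the evaluator (and thus the device/queue) that allocated it — presumably in support of the cross-device transitions added below. A construction sketch; the `Object.new` stand-in and nil buffers are placeholders, not realistic values:

    require 'tensor_stream'
    require 'tensor_stream/opencl'

    evaluator = Object.new # stands in for the OpenclEvaluator passed as `self`
    buf = TensorStream::OpenCLBuffer.new(evaluator,
                                         name: 'const_1', data_type: :float32,
                                         shape: [2, 2], buffer: nil, cl_buffer: nil)
    buf.owner.equal?(evaluator) # => true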
data/lib/tensor_stream/opencl/opencl_evaluator.rb
CHANGED
@@ -38,7 +38,8 @@ module TensorStream
    # PURE ruby evaluator used for testing and development
    class OpenclEvaluator < BaseEvaluator
      attr_accessor :retain
-      attr_reader :opencl_device
+      attr_reader :opencl_device, :opencl_context
+      attr_writer :context

      include TensorStream::OpHelper
      include TensorStream::ArrayOpsHelper
@@ -50,14 +51,14 @@ module TensorStream

      def initialize(session, device, thread_pool: nil, log_intermediates: false)
        super
-        _create_opencl_context
+        _create_opencl_context
        @opencl_device = device.native_device
        create_command_queue
      end

      def self.query_supported_devices
        devices = query_devices_with_score
-        devices.sort { |a| a[1] }.
+        devices.sort { |a, b| a[1] <=> b[1] }.map do |d|
          opencl_to_device(d)
        end
      end
@@ -68,16 +69,16 @@ module TensorStream
        opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
      end

-      def self.opencl_to_device(
-        device =
-        index =
+      def self.opencl_to_device(dev)
+        device = dev[0]
+        index = dev[3]
        platform_name = device.platform.name.tr(' ', '_').downcase
        uri = [platform_name, index].join(':')

        device_type = device.type.to_s == 'GPU' ? :gpu : :cpu

-        OpenclDevice.new(uri, device_type, self).tap do |
-
+        OpenclDevice.new(uri, device_type, self).tap do |d|
+          d.native_device = device
        end
      end

@@ -85,14 +86,14 @@ module TensorStream
      # Select the best device available in the system for this evaluator
      def self.default_device
        devices = OpenclEvaluator.query_devices_with_score
-        device = devices.
+        device = devices.max { |a, b| a[1] <=> b[1] }
        opencl_to_device(device)
      end

      # opencl evaluator main entrypoint
      def run(tensor, execution_context)
-
-        #
+        result = complete_eval(tensor, execution_context)
+        # puts "-------------------wait finish------------------------"
        _opencl_queue.finish
        read_final_result(result)
      end
@@ -115,18 +116,22 @@ module TensorStream
      # buffer comes from non-opencl evaluator
      def convert_from_buffer(tensor, result)
        if result.buffer.is_a?(TensorStream::Evaluator::OutputGroup)
-          converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map
+          converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map do |output, data_type|
+            convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name)
+          end
          TensorStream::Evaluator::OutputGroup.new(converted_outputs, result.buffer.data_types)
        else
          convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
        end
      end

+      # Generate OpenCL instruction to read back from GPU memory to Host memory for a tensor
      def enqueue_buffer_read(tensor, context)
        buffer = _run(tensor, context)
        if buffer.is_a?(Array)
          buffer.collect do |b|
            next b if b.buffer.size.zero?
+
            b.op = _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
            b
          end
@@ -135,6 +140,7 @@ module TensorStream
        return buffer if buffer.nil?
        return [] if buffer.buffer.nil?
        return buffer if buffer.buffer.size.zero?
+
        buffer.op = _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
        buffer
      end
@@ -145,7 +151,7 @@ module TensorStream

        buffer = enqueue_buffer_read(tensor, context)
        events = build_event_wait_list([buffer])
-        # puts "wait #{tensor.name}"
+        # puts "** wait #{tensor.name} **"
        OpenCL.wait_for_events(events) unless events.empty?
        buffer
      end
@@ -154,6 +160,7 @@ module TensorStream
        OpenCL.platforms.flat_map do |p|
          p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
            score = 0
+
            if d.type.to_s == 'CPU'
              score += 1
            elsif d.type.to_s == 'GPU'
@@ -162,8 +169,7 @@ module TensorStream

            score += 1000 if d.platform.name == 'NVIDIA CUDA'

-            score += d.max_compute_units
-            score += d.max_clock_frequency
+            score += d.max_compute_units * d.max_clock_frequency

            [d, score, p.name, index]
          end
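Note: device ranking now multiplies compute units by clock frequency instead of summing them, which stops a high-clock CPU from outranking a wide GPU. An illustrative re-computation — the Struct and the GPU base bonus of 4 are assumptions; the elided branch above sets the actual base scores:

    Device = Struct.new(:type, :platform_name, :max_compute_units, :max_clock_frequency)

    def device_score(d)
      score = d.type == 'GPU' ? 4 : 1 # base bonus assumed; see the elided branch above
      score += 1000 if d.platform_name == 'NVIDIA CUDA'
      score + d.max_compute_units * d.max_clock_frequency
    end

    device_score(Device.new('GPU', 'NVIDIA CUDA', 20, 1500)) # => 31004
    device_score(Device.new('CPU', 'Intel', 8, 3500))        # => 28001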
@@ -172,6 +178,31 @@ module TensorStream

      protected

+      ##
+      # called when passing control to another evaluator
+      def perform_transition(tensor, input, next_evaluator, execution_context)
+        if next_evaluator.is_a?(OpenclEvaluator) # OpenCL but different device?
+          # create opencl buffer for this tensor
+          next_evaluator.context = @context
+
+          foreign_buffer = next_evaluator._run(input, execution_context)
+          event_list = build_event_wait_list([foreign_buffer])
+
+          output_buffer = _create_result_buffer(input.data_type, foreign_buffer.shape, "t_#{tensor.name}_#{input.name}")
+          output_buffer.op = if next_evaluator.opencl_context == @opencl_context
+                               _opencl_queue.enqueue_copy_buffer(foreign_buffer.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_list)
+                             else
+                               puts "wait finish transition ** #{input.name} **"
+                               read_event = next_evaluator._opencl_queue.enqueue_read_buffer(foreign_buffer.cl_buffer, output_buffer.buffer, event_wait_list: event_list)
+                               OpenCL.wait_for_events(read_event)
+                               _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
+                             end
+          output_buffer
+        else
+          super
+        end
+      end
+
      def prepare_input(tensor, context, options = {})
        return nil unless tensor

@@ -195,8 +226,19 @@ module TensorStream
        buffer.to_ruby
      end

-      def _create_opencl_context(
-
+      def _create_opencl_context(device = nil)
+        if device.nil?
+          @@global_opencl_context ||= begin
+            all_devices = OpenclEvaluator.query_supported_devices.map(&:native_device)
+            puts "global context created for #{all_devices}"
+            OpenCL.create_context(all_devices)
+          end
+
+          @opencl_context = @@global_opencl_context
+        else
+          puts "context created for #{device.native_device}"
+          @opencl_context = OpenCL.create_context(device.native_device)
+        end
      end

      def create_command_queue
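Note: evaluators created without an explicit device now share a single class-level OpenCL context spanning every supported device; this is what lets `perform_transition` above use `enqueue_copy_buffer` between two evaluators instead of a host round-trip. A pure-Ruby sketch of the sharing rule — `create_context` here is a stand-in for `OpenCL.create_context`:

    class ContextRegistry
      @@global_context = nil

      def self.create_context(devices)
        "context(#{Array(devices).join(',')})" # stand-in for OpenCL.create_context
      end

      def self.for(device, all_devices)
        return create_context(device) if device          # explicit device: private context
        @@global_context ||= create_context(all_devices) # default: one shared context
      end
    end

    shared = ContextRegistry.for(nil, %w[gpu0 gpu1]) # created once
    shared.equal?(ContextRegistry.for(nil, %w[gpu0 gpu1])) # => true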
@@ -205,6 +247,7 @@ module TensorStream
        properties = []
        properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
        properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
+        # puts "creating queue with properties #{supported_proprties}"
        @command_queue = _opencl_context.create_command_queue(opencl_device, properties: properties)
      end

@@ -222,28 +265,32 @@ module TensorStream

      def _cl_program(kernel, args = {})
        suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        kernel_cache_key = "_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"
+        @context[:_cache][kernel_cache_key] ||=
+          begin
+            # puts "building #{kernel_cache_key}"
+            file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
+            source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
+                       File.read(file_path)
+                     else
+                       filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+                       raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
+
+                       source = File.read(filename)
+                       source = OpenclTemplateHelper.new(source).generate(args)
+                       File.write(file_path, source) if ENV['TS_OPENCL_FILE_CACHE']
+                       source
+                     end
+            program = _opencl_context.create_program_with_source(source)
+            program.build
+          rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
+            puts "OpenCL Compile error: #{program.build_log}"
+            raise e
+          end
      end

      def escape_arg_content(value)
-        return value.tr(' ','_') if value.is_a?(String)
+        return value.tr(' ', '_') if value.is_a?(String)
        return value.join('-') if value.is_a?(Array)

        value
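Note: compiled programs are now memoized per process under `_opencl_kernel_<name>.<args>:<object_id>`, and the new `TS_OPENCL_FILE_CACHE` switch additionally persists rendered kernels to `/tmp/*.cl` across runs. A usage sketch, opting in before the evaluator compiles anything:

    ENV['TS_OPENCL_FILE_CACHE'] = '1' # any non-nil value enables the on-disk cache

    require 'tensor_stream'
    require 'tensor_stream/opencl'
    # First run renders the ERB templates and writes /tmp/<kernel>.<args>.cl;
    # later runs read the pre-rendered source instead of regenerating it.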
@@ -257,9 +304,8 @@ module TensorStream

        child_context = execution_context.dup
        res = if tensor.is_a?(Operation)
-                if !
-
-                  convert_from_buffer(tensor, result)
+                if !on_same_device?(tensor) # tensor is on another device or evaluator
+                  perform_transition(tensor, tensor, @context[:_cache][:placement][tensor.name][1], execution_context)
                else
                  eval_operation(tensor, child_context)
                end
@@ -295,7 +341,7 @@ module TensorStream

      register_op :identity do |context, tensor, inputs|
        value = inputs[0]
-        buffer = OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
+        buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
        buffer.op = build_event_wait_list(inputs)
        buffer
      end
@@ -375,6 +421,7 @@ module TensorStream

      register_op :flow_group do |_context, _tensor, inputs|
        events = build_event_wait_list(inputs)
+        # puts "** wait for event flow_group**"
        OpenCL.wait_for_events(events) unless events.empty?
        nil
      end
@@ -387,8 +434,10 @@ module TensorStream
        cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
        return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
        return @context[cache_key] if @context.key?(cache_key)
-
+
+        # puts "opencl eval #{object_id} #{tensor.name}"
        invoke(tensor, child_context).tap do |result|
+          # puts "result done opencl #{object_id}: #{tensor.name}"
          if tensor.breakpoint
            a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
            b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -603,6 +652,7 @@ module TensorStream
      end

      def convert_to_opencl(value, shape, data_type: nil, name: nil)
+        # puts "convert_to_opencl called for #{name}"
        value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)

        cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
@@ -630,7 +680,7 @@ module TensorStream
                        _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
                      end

-          @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+          @context[:_cache][cache_key] = OpenCLBuffer.new(self, name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
        end
        if data_type == :string
          value[0].each_byte.with_index do |c, index|
@@ -664,15 +714,15 @@ module TensorStream

      def allocate_narray_for_type(data_type, narray_size)
        case data_type
-        when :float, :float32
+        when :float, :float32, :float16
          NArray.sfloat(narray_size)
        when :float64
          NArray.float(narray_size)
-        when :int, :int32, :int64
+        when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
          NArray.int(narray_size)
-        when :int16
+        when :int16, :uint16
          NArray.sint(narray_size)
-        when :uint8
+        when :uint8, :int8
          NArray.byte(narray_size)
        when :boolean
          NArray.byte(narray_size)
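Note: the widened `case` maps the new dtypes onto existing NArray storage; as the inline comment says, NArray has no 64-bit integer type, so `:int64`/`:uint64` values are backed by 32-bit signed ints and out-of-range values will not round-trip. The effective mapping, summarized (illustrative constant, not part of the gem):

    NARRAY_STORAGE = {
      sfloat: %i[float float32 float16],         # 32-bit float (float16 promoted)
      float:  %i[float64],                       # 64-bit float
      int:    %i[int int32 int64 uint64 uint32], # 32-bit signed backing store
      sint:   %i[int16 uint16],                  # 16-bit
      byte:   %i[uint8 int8 boolean]             # 8-bit
    }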
@@ -686,12 +736,14 @@ module TensorStream
      end

      def _create_result_buffer(data_type, shape, name)
-        return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
-
+        return OpenCLBuffer.new(self, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
+        cache_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
+        @context[:_cache][:_cl_buffers][cache_key] ||= begin
+          # puts "create result buffer #{cache_key}"
          size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
          buffer = allocate_narray_for_type(data_type, size)
          cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
-        OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
+          OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
        end
      end

@@ -706,7 +758,7 @@ module TensorStream
          start = index * buffer.size * buffer.element_size
          region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
          cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
-          OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
+          OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
        else
          _create_result_buffer(tensor.data_type, shape, name)
        end
@@ -728,7 +780,7 @@ module TensorStream

      # create sub buffers of different sizes
      def _create_variable_result_sub_buffer(parent_buffer, index, start, region_size_in_bytes, data_type, shape, name)
-        cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
+        cache_key = "_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
        @context[:_cache][:_cl_buffers][cache_key] ||= begin
          size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
          buffer = allocate_narray_for_type(data_type, size)
@@ -736,7 +788,7 @@ module TensorStream
          if parent_buffer.cl_buffer.associated_memobject.nil?
            region = OpenCL::BufferRegion::new(start, region_size_in_bytes)
            cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
-            OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: "#{name}/sub")
+            OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: "#{name}/sub")
          else
            _create_result_buffer(tensor.data_type, shape, name)
          end
@@ -806,6 +858,7 @@ module TensorStream
          convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
        else
          return input if input.shape.empty?
+
          value = input.buffer.reshape(*input.shape.reverse)
          rank = input.shape.size - 1

@@ -862,17 +915,15 @@ module TensorStream

      def resolve_placeholder(placeholder, _execution_context = {})
        return nil if placeholder.nil?
+        return placeholder unless placeholder.is_a?(Placeholder)

-        var =
-
-          raise "missing placeholder #{placeholder.name}" if c.nil?
-        end
-        else
-          placeholder
-        end
+        var = @context[placeholder.name.to_sym]
+        raise "missing placeholder #{placeholder.name}" if var.nil?

-
-
+        cache_key = "#{placeholder.graph.object_id}_opencl_#{placeholder.name}_p:#{object_id}"
+        @context[cache_key] ||= begin
+          convert_to_opencl(var, shape_eval(var), data_type: placeholder.data_type, name: placeholder.name) unless var.is_a?(Tensor)
+        end
      end

      def all_true?(arr)
data/lib/tensor_stream/opencl/opencl_template_helper.rb
CHANGED
@@ -32,10 +32,18 @@ class OpenclTemplateHelper
    case dtype.to_s
    when 'float64'
      'double'
-    when 'float32', 'float'
+    when 'float32', 'float', 'float16'
      'float'
+    when 'uint32'
+      'uint'
+    when 'int64'
+      'int' # 'long' - NArray does not support 64bit int types
+    when 'uint64'
+      'uint' # 'ulong' - NArray does not support 64bit int types
    when 'int32', 'int'
      'int'
+    when 'uint16'
+      'ushort'
    when 'int16'
      'short'
    when 'uint8'
@@ -51,10 +59,12 @@ class OpenclTemplateHelper
    case dtype.to_s
    when 'float64'
      'DBL_MIN'
-    when 'float32', 'float'
+    when 'float32', 'float', 'float16'
      'FLT_MIN'
    when 'int32', 'int'
      'INT_MIN'
+    when 'uint32', 'uint16'
+      '0'
    when 'int16'
      'SHRT_MIN'
    when 'int8'
data/samples/iris.rb
CHANGED
data/samples/mnist_data_2.1.rb
ADDED
@@ -0,0 +1,99 @@
+# A ruby port of the example code discussed by Martin Gorner in
+# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
+#
+# https://www.youtube.com/watch?v=u4alGiomYP4
+#
+# Requirements:
+#   mnist-learn gem
+#   opencl_ruby_ffi gem
+require "bundler/setup"
+require 'tensor_stream'
+require 'mnist-learn'
+require 'pry-byebug'
+
+# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
+require 'tensor_stream/opencl'
+
+tf = TensorStream
+
+puts "Tensorstream version #{tf.__version__} with OpenCL lib #{TensorStream::Opencl::VERSION}"
+tf.set_random_seed(0)
+
+# Import MNIST data
+puts "downloading minst data"
+mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
+puts "downloading finished"
+
+x = tf.placeholder(:float32, shape: [nil, 784])
+
+K = 200
+L = 100
+M = 60
+N = 30
+
+
+w1 = tf.variable(tf.random_normal([784, K]))
+b1 = tf.variable(tf.zeros([K]))
+
+w2 = tf.variable(tf.random_normal([K, L]))
+b2 = tf.variable(tf.zeros([L]))
+
+w3 = tf.variable(tf.random_normal([L, M]))
+b3 = tf.variable(tf.zeros([M]))
+
+w4 = tf.variable(tf.random_normal([M, N]))
+b4 = tf.variable(tf.zeros([N]))
+
+w5 = tf.variable(tf.random_normal([N, 10]))
+b5 = tf.variable(tf.zeros([10]))
+
+x_ = tf.reshape(x, [-1, 784])
+
+y1 = tf.sigmoid(tf.matmul(x_, w1) + b1)
+y2 = tf.sigmoid(tf.matmul(y1, w2) + b2)
+y3 = tf.sigmoid(tf.matmul(y2, w3) + b3)
+y4 = tf.sigmoid(tf.matmul(y3, w4) + b4)
+ylogits = tf.matmul(y4, w5) + b5
+
+# model
+y = tf.nn.softmax(ylogits)
+
+y_ = tf.placeholder(:float32, shape: [nil, 10])
+
+# cross-entropy loss function (= -sum(Y_i * log(Yi)) ), normalised for batches of 100 images
+# TensorFlow provides the softmax_cross_entropy_with_logits function to avoid numerical stability
+# problems with log(0) which is NaN
+cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits: ylogits, labels: y_)
+cross_entropy = tf.reduce_mean(cross_entropy)*100
+
+is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
+
+# training step, learning rate = 0.003
+learning_rate = 0.003
+train_step = TensorStream::Train::AdamOptimizer.new(learning_rate).minimize(cross_entropy)
+
+sess = tf.session
+init = tf.global_variables_initializer
+sess.run(init)
+
+mnist_train = mnist.train
+test_data = { x => mnist.test.images, y_ => mnist.test.labels }
+
+(0..10000).each do |i|
+  # load batch of images and correct answers
+  batch_x, batch_y = mnist_train.next_batch(100)
+  train_data = { x => batch_x, y_ => batch_y }
+
+  # train
+  sess.run(train_step, feed_dict: train_data)
+  if (i % 50 == 0)
+    # success? add code to print it
+    a_train, c_train = sess.run([accuracy, cross_entropy], feed_dict: train_data)
+
+    # success on test data?
+    a_test, c_test = sess.run([accuracy, cross_entropy], feed_dict: test_data)
+    puts "#{i} train accuracy #{a_train}, error #{c_train} test accuracy #{a_test}, error #{c_test}"
+  end
+end
+
data/samples/mnist_data_2.2.rb
ADDED
@@ -0,0 +1,98 @@
+# A ruby port of the example code discussed by Martin Gorner in
+# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
+#
+# https://www.youtube.com/watch?v=u4alGiomYP4
+#
+# Requirements:
+#   mnist-learn gem
+#   opencl_ruby_ffi gem
+require "bundler/setup"
+require 'tensor_stream'
+require 'mnist-learn'
+require 'pry-byebug'
+
+# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
+require 'tensor_stream/opencl'
+
+tf = TensorStream
+
+# Import MNIST data
+puts "downloading minst data"
+mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
+puts "downloading finished"
+
+x = tf.placeholder(:float32, shape: [nil, 784])
+
+K = 200
+L = 100
+M = 60
+N = 30
+
+
+w1 = tf.variable(tf.random_normal([784, K]))
+b1 = tf.variable(tf.zeros([K]))
+
+w2 = tf.variable(tf.random_normal([K, L]))
+b2 = tf.variable(tf.zeros([L]))
+
+w3 = tf.variable(tf.random_normal([L, M]))
+b3 = tf.variable(tf.zeros([M]))
+
+w4 = tf.variable(tf.random_normal([M, N]))
+b4 = tf.variable(tf.zeros([N]))
+
+w5 = tf.variable(tf.random_normal([N, 10]))
+b5 = tf.variable(tf.zeros([10]))
+
+x_ = tf.reshape(x, [-1, 784])
+
+y1 = tf.nn.relu(tf.matmul(x_, w1) + b1)
+y2 = tf.nn.relu(tf.matmul(y1, w2) + b2)
+y3 = tf.nn.relu(tf.matmul(y2, w3) + b3)
+y4 = tf.nn.relu(tf.matmul(y3, w4) + b4)
+ylogits = tf.matmul(y4, w5) + b5
+
+# model
+y = tf.nn.softmax(ylogits)
+
+y_ = tf.placeholder(:float32, shape: [nil, 10])
+
+# cross-entropy loss function (= -sum(Y_i * log(Yi)) ), normalised for batches of 100 images
+# TensorFlow provides the softmax_cross_entropy_with_logits function to avoid numerical stability
+# problems with log(0) which is NaN
+cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits: ylogits, labels: y_)
+cross_entropy = tf.reduce_mean(cross_entropy)*100
+
+is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
+
+# training step, learning rate = 0.003
+learning_rate = 0.003
+train_step = TensorStream::Train::AdamOptimizer.new(learning_rate).minimize(cross_entropy)
+
+sess = tf.session
+# Add ops to save and restore all the variables.
+saver = tf::Train::Saver.new
+init = tf.global_variables_initializer
+
+sess.run(init)
+mnist_train = mnist.train
+test_data = { x => mnist.test.images, y_ => mnist.test.labels }
+
+(0..1000).each do |i|
+  # load batch of images and correct answers
+  batch_x, batch_y = mnist_train.next_batch(100)
+  train_data = { x => batch_x, y_ => batch_y }
+
+  # train
+  sess.run(train_step, feed_dict: train_data)
+  if (i % 50 == 0)
+    # success? add code to print it
+    a_train, c_train = sess.run([accuracy, cross_entropy], feed_dict: train_data)
+
+    # success on test data?
+    a_test, c_test = sess.run([accuracy, cross_entropy], feed_dict: test_data)
+    puts "#{i} train accuracy #{a_train}, error #{c_train} test accuracy #{a_test}, error #{c_test}"
+  end
+end
+
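Note: the two added samples are near-identical five-layer MNIST classifiers; mnist_data_2.1.rb uses sigmoid activations over 10000 steps, while mnist_data_2.2.rb switches to relu, runs 1000 steps, and instantiates a `tf::Train::Saver`. The structural difference in one line — a graph-construction-only sketch, runnable against tensor_stream:

    require 'tensor_stream'
    tf = TensorStream

    x  = tf.placeholder(:float32, shape: [nil, 784])
    w1 = tf.variable(tf.random_normal([784, 200]))
    b1 = tf.variable(tf.zeros([200]))
    x_ = tf.reshape(x, [-1, 784])

    y1_sigmoid = tf.sigmoid(tf.matmul(x_, w1) + b1) # mnist_data_2.1.rb, layer 1
    y1_relu    = tf.nn.relu(tf.matmul(x_, w1) + b1) # mnist_data_2.2.rb, layer 1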
data/samples/multigpu.rb
CHANGED
@@ -11,7 +11,6 @@ DIMEN = 1024
 A = ts.random_uniform([DIMEN, DIMEN]).eval
 B = ts.random_uniform([DIMEN, DIMEN]).eval

-
 # Create a graph to store results
 c1 = []
 c2 = []
@@ -35,17 +34,24 @@ sum = ts.device('/device:GPU:0') do
   ts.add_n(c1)
 end

-t1_1 =
+t1_1 = nil
 t2_1 = nil
-
-ts.session(log_device_placement: true) do |sess|
+puts "===================== starting single GPU test ================"
+ts.session(log_device_placement: true, profile_enabled: true) do |sess|
+  puts "-- warmup ---"
+  sess.run(sum, feed_dict: { a => A, b => B}) # warmup
+  puts "-- warmup ---"
+  time = Time.now
+  t1_1 = time.to_i * (10 ** 9) + time.nsec
   sess.run(sum, feed_dict: { a => A, b => B})
-
+  time = Time.now
+  t2_1 = time.to_i * (10 ** 9) + time.nsec
 end
-
+puts "===================== end single GPU test ================"
+puts "===================== MULTI GPU text ================"
 # Multi GPU computing
 # GPU:0 computes A^n
-ts.device('/device:GPU:
+ts.device('/device:GPU:0') do
   a = ts.placeholder(:float32, shape: [DIMEN, DIMEN])
   c2 << matpow(a, n)
 end
@@ -56,18 +62,26 @@ ts.device('/device:GPU:1') do
   c2 << matpow(b, n)
 end

-ts.device('/device:GPU:
+ts.device('/device:GPU:0') do
   sum = ts.add_n(c2) #Addition of all elements in c2, i.e. A^n + B^n
 end

-t1_2 =
+t1_2 = nil
 t2_2 = nil
-
+
+ts.session(log_device_placement: true, profile_enabled: true) do |sess|
   # Run the op.
+  puts "-- warmup ---"
+  sess.run(sum, feed_dict: {a => A, b => B}) # warm up
+  puts "-- warmup ---"
+  time = Time.now
+  t1_2 = time.to_i * (10 ** 9) + time.nsec
+  puts "================ starting multiGPU test ==============="
   sess.run(sum, feed_dict: {a => A, b => B})
-
+  time = Time.now
+  t2_2 = time.to_i * (10 ** 9) + time.nsec
 end


-
-
+puts("Single GPU computation time: " + ((t2_1-t1_1)/ 1000000.to_f).to_s)
+puts("Multi GPU computation time: " + ((t2_2-t1_2)/ 1000000.to_f).to_s)
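Note: the timing added above is a plain wall-clock nanosecond stamp, and the final division by 1000000.0 reports milliseconds. An equivalent helper for the repeated pattern — the `now_ns` name is hypothetical, not part of the sample:

    def now_ns
      t = Time.now
      t.to_i * (10**9) + t.nsec
    end

    t1 = now_ns
    # sess.run(sum, feed_dict: { a => A, b => B }) would go here
    t2 = now_ns
    puts "computation time: #{(t2 - t1) / 1_000_000.0} ms"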
data/tensor_stream-opencl.gemspec
CHANGED
@@ -38,7 +38,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "pry-byebug"
   spec.add_development_dependency "awesome_print"
   spec.add_development_dependency "mnist-learn"
-  spec.add_dependency "tensor_stream", "~> 0.9.
+  spec.add_dependency "tensor_stream", "~> 0.9.2"
   spec.add_dependency "opencl_ruby_ffi"
   spec.add_dependency "oily_png"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream-opencl
 version: !ruby/object:Gem::Version
-  version: 0.2.
+  version: 0.2.2
 platform: ruby
 authors:
 - Joseph Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-10-
+date: 2018-10-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -100,14 +100,14 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version: 0.9.
+      version: 0.9.2
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-      version: 0.9.
+      version: 0.9.2
 - !ruby/object:Gem::Dependency
   name: opencl_ruby_ffi
   requirement: !ruby/object:Gem::Requirement
@@ -145,10 +145,10 @@ extra_rdoc_files: []
 files:
 - ".gitignore"
 - ".rspec"
+- ".rubocop.yml"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
 - Gemfile
-- Gemfile.lock
 - LICENSE.txt
 - README.md
 - Rakefile
@@ -226,7 +226,8 @@ files:
 - lib/tensor_stream/opencl/version.rb
 - samples/iris.data
 - samples/iris.rb
-- samples/
+- samples/mnist_data_2.1.rb
+- samples/mnist_data_2.2.rb
 - samples/multigpu.rb
 - samples/nearest_neighbor.rb
 - samples/rnn.rb
data/Gemfile.lock
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
tensor_stream-opencl (0.2.1)
|
5
|
-
oily_png
|
6
|
-
opencl_ruby_ffi
|
7
|
-
tensor_stream (~> 0.9.0)
|
8
|
-
|
9
|
-
GEM
|
10
|
-
remote: https://rubygems.org/
|
11
|
-
specs:
|
12
|
-
awesome_print (1.8.0)
|
13
|
-
byebug (10.0.2)
|
14
|
-
chunky_png (1.3.10)
|
15
|
-
coderay (1.1.2)
|
16
|
-
concurrent-ruby (1.0.5)
|
17
|
-
deep_merge (1.2.1)
|
18
|
-
diff-lcs (1.3)
|
19
|
-
ffi (1.9.25)
|
20
|
-
method_source (0.9.0)
|
21
|
-
mnist-learn (0.1.1)
|
22
|
-
narray (0.6.1.2)
|
23
|
-
narray_ffi (1.4.4)
|
24
|
-
ffi (~> 1.9, >= 1.9.3)
|
25
|
-
narray (~> 0.6, >= 0.6.0.8)
|
26
|
-
oily_png (1.2.1)
|
27
|
-
chunky_png (~> 1.3.7)
|
28
|
-
opencl_ruby_ffi (1.3.4)
|
29
|
-
ffi (~> 1.9, >= 1.9.3)
|
30
|
-
narray (~> 0.6, >= 0.6.0.8)
|
31
|
-
narray_ffi (~> 1.0, >= 1.0.0)
|
32
|
-
pry (0.11.3)
|
33
|
-
coderay (~> 1.1.0)
|
34
|
-
method_source (~> 0.9.0)
|
35
|
-
pry-byebug (3.6.0)
|
36
|
-
byebug (~> 10.0)
|
37
|
-
pry (~> 0.10)
|
38
|
-
rake (10.5.0)
|
39
|
-
rspec (3.8.0)
|
40
|
-
rspec-core (~> 3.8.0)
|
41
|
-
rspec-expectations (~> 3.8.0)
|
42
|
-
rspec-mocks (~> 3.8.0)
|
43
|
-
rspec-core (3.8.0)
|
44
|
-
rspec-support (~> 3.8.0)
|
45
|
-
rspec-expectations (3.8.1)
|
46
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
47
|
-
rspec-support (~> 3.8.0)
|
48
|
-
rspec-mocks (3.8.0)
|
49
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
50
|
-
rspec-support (~> 3.8.0)
|
51
|
-
rspec-support (3.8.0)
|
52
|
-
tensor_stream (0.9.0)
|
53
|
-
chunky_png
|
54
|
-
concurrent-ruby
|
55
|
-
deep_merge
|
56
|
-
|
57
|
-
PLATFORMS
|
58
|
-
ruby
|
59
|
-
|
60
|
-
DEPENDENCIES
|
61
|
-
awesome_print
|
62
|
-
bundler (~> 1.16)
|
63
|
-
mnist-learn
|
64
|
-
pry-byebug
|
65
|
-
rake (~> 10.0)
|
66
|
-
rspec (~> 3.0)
|
67
|
-
tensor_stream-opencl!
|
68
|
-
|
69
|
-
BUNDLED WITH
|
70
|
-
1.16.2
|
data/samples/mnist_data.rb
DELETED
@@ -1,65 +0,0 @@
-# A ruby port of the example code discussed by Martin Gorner in
-# "TensorFlow and Deep Learning without a PhD, Part 1 (Google Cloud Next '17)""
-#
-# https://www.youtube.com/watch?v=u4alGiomYP4
-#
-# Requirements:
-#   mnist-learn gem
-#   opencl_ruby_ffi gem
-require "bundler/setup"
-require 'tensor_stream'
-require 'mnist-learn'
-
-# Enable OpenCL hardware accelerated computation, not using OpenCL can be very slow
-# require 'tensor_stream/opencl'
-
-tf = TensorStream
-
-# Import MNIST data
-puts "downloading minst data"
-mnist = Mnist.read_data_sets('/tmp/data', one_hot: true)
-puts "downloading finished"
-
-x = tf.placeholder(:float32, shape: [nil, 784])
-w = tf.variable(tf.zeros([784, 10]))
-b = tf.variable(tf.zeros([10]))
-
-
-
-# model
-y = tf.nn.softmax(tf.matmul(tf.reshape(x, [-1, 784]), w) + b)
-
-y_ = tf.placeholder(:float32, shape: [nil, 10])
-
-# loss function
-cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
-
-is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
-accuracy = tf.reduce_mean(tf.cast(is_correct, :float32))
-
-optimizer = TensorStream::Train::AdamOptimizer.new
-train_step = optimizer.minimize(cross_entropy)
-
-sess = tf.session
-init = tf.global_variables_initializer
-sess.run(init)
-
-(0...1000).each do |i|
-  # load batch of images and correct answers
-  batch_x, batch_y = mnist.train.next_batch(100)
-  train_data = { x => batch_x, y_ => batch_y }
-
-  # train
-  sess.run(train_step, feed_dict: train_data)
-  if (i % 10 == 0)
-    # success? add code to print it
-    a, c = sess.run([accuracy, cross_entropy], feed_dict: train_data)
-    puts "#{i} train accuracy #{a}, error #{c}"
-
-    # success on test data?
-    test_data = { x => mnist.test.images, y_ => mnist.test.labels }
-    a, c = sess.run([accuracy, cross_entropy], feed_dict: test_data)
-    puts " test accuracy #{a}, error #{c}"
-  end
-end
-