tensor_stream 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +5 -5
  2. data/.rubocop.yml +6 -1
  3. data/CHANGELOG.md +10 -0
  4. data/README.md +35 -0
  5. data/lib/tensor_stream.rb +2 -2
  6. data/lib/tensor_stream/debugging/debugging.rb +2 -1
  7. data/lib/tensor_stream/dynamic_stitch.rb +23 -24
  8. data/lib/tensor_stream/evaluator/base_evaluator.rb +27 -18
  9. data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +16 -0
  10. data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +24 -0
  11. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +6 -1
  12. data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +6 -6
  13. data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +237 -107
  14. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +97 -7
  15. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +230 -123
  16. data/lib/tensor_stream/exceptions.rb +1 -0
  17. data/lib/tensor_stream/graph_builder.rb +2 -3
  18. data/lib/tensor_stream/graph_deserializers/protobuf.rb +22 -23
  19. data/lib/tensor_stream/graph_serializers/graphml.rb +26 -29
  20. data/lib/tensor_stream/graph_serializers/pbtext.rb +22 -19
  21. data/lib/tensor_stream/helpers/string_helper.rb +4 -5
  22. data/lib/tensor_stream/math_gradients.rb +141 -77
  23. data/lib/tensor_stream/nn/nn_ops.rb +4 -6
  24. data/lib/tensor_stream/operation.rb +139 -120
  25. data/lib/tensor_stream/ops.rb +36 -3
  26. data/lib/tensor_stream/session.rb +7 -11
  27. data/lib/tensor_stream/tensor.rb +3 -3
  28. data/lib/tensor_stream/tensor_shape.rb +5 -0
  29. data/lib/tensor_stream/train/gradient_descent_optimizer.rb +4 -37
  30. data/lib/tensor_stream/train/momentum_optimizer.rb +48 -0
  31. data/lib/tensor_stream/train/optimizer.rb +129 -0
  32. data/lib/tensor_stream/train/saver.rb +0 -1
  33. data/lib/tensor_stream/train/slot_creator.rb +62 -0
  34. data/lib/tensor_stream/train/utils.rb +11 -12
  35. data/lib/tensor_stream/trainer.rb +3 -0
  36. data/lib/tensor_stream/utils.rb +18 -11
  37. data/lib/tensor_stream/variable.rb +19 -12
  38. data/lib/tensor_stream/variable_scope.rb +1 -1
  39. data/lib/tensor_stream/version.rb +1 -1
  40. data/samples/iris.rb +2 -1
  41. data/samples/linear_regression.rb +3 -1
  42. data/samples/nearest_neighbor.rb +2 -0
  43. data/test_samples/neural_network_raw.py +101 -0
  44. data/test_samples/raw_neural_net_sample.rb +6 -4
  45. data/test_samples/test2.py +73 -27
  46. metadata +9 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA256:
-   metadata.gz: d42a81e850271f080d408c52f2bea15a07c6d41ee3c6790dc04e48f2ab485364
-   data.tar.gz: a4aedfd3c9a532f31ea195c58124644fcae143726d37daa7a4a6afc6b39f439b
+ SHA1:
+   metadata.gz: f84c2b9852fcf4931c47c0130b67497a50a87b0f
+   data.tar.gz: 524e1105da4e06e3472cbcfa0e6f764ae4512d37
  SHA512:
-   metadata.gz: 7d9fff1a8af14878c50469cfcee7942d2800906fe388504261290958628861b29d973ea65a3cf986cd1657acf490d2bc7164ec9ec2c14dc7c5d6c25121c6737f
-   data.tar.gz: 42ef6af8fafd1a7f7f069e03f8c344bec87bd15217def07d859d4b33374a038e1b7ed54ac685b901d3ed9b51fd351300cc61553ca45f0591fc4a7c5e50bcee53
+   metadata.gz: 420e2675ab67d4c8462534bdf8c703671656f7852d984579e22ee57f1425dd5740fcb64a1e52363bf337cd7c691d87a75c76bd868b13c8a7f06d78e0eb00aa73
+   data.tar.gz: 24fe1022741883d46cdd5af51309da33d421d72874f0cc84bf2e0ed14a62602f1830c6060bd86e42359b7962b4a57727c9a48ce13d5950d5ba02f6a9cdfd719f
data/.rubocop.yml CHANGED
@@ -6,6 +6,10 @@ AllCops:
    - tensor_stream.gemspec
    - Rakefile
 
+ Naming/AccessorMethodName:
+   Exclude:
+     - lib/tensor_stream/utils.rb
+
  Style/StringLiterals:
    Enabled: false
 
@@ -81,4 +85,5 @@ Style/TrailingCommaInHashLiteral:
 
  Naming/UncommunicativeMethodParamName:
    Exclude:
-   - lib/tensor_stream/evaluator/ruby_evaluator.rb
+   - lib/tensor_stream/evaluator/ruby_evaluator.rb
+   - lib/tensor_stream/ops.rb
data/CHANGELOG.md CHANGED
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+ ## [0.8.0] - 2018-08-29
+ ### Added
+ - [TRAINING] Added a new supported optimizer, MomentumOptimizer, loosely based on TensorFlow's implementation (with Nesterov support)
+ - [NEW OP] fill, stack, atan, cumprod, gather, invert_permutation, setdiff1d
+
+ ### Fixes
+ - Fixed the device delegator not picking the correct evaluator to use in some cases
+ - [GRADIENTS] Properly implemented gradient computation for prod, tile, transpose
+ - Fixed gradient computation for softmax_cross_entropy_with_logits_v2 (now based on TensorFlow's implementation)
+
  ## [0.7.0] - 2018-08-08
  ### Added
  - [NEW OP] expand_dims, min, acos, asin, add_n
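
For context on the MomentumOptimizer entry above, a minimal usage sketch against the 0.8.0 API. This is illustrative only: the toy graph is made up, and helper names such as `tf.square` and `global_variables_initializer` follow the TensorFlow-style API the project advertises rather than anything shown in this diff.

```ruby
require 'tensor_stream'

tf = TensorStream

# Minimize a toy quadratic f(x) = (x - 3)^2 with momentum updates.
x = tf.variable(0.0, dtype: :float32, name: 'x')
loss = tf.square(x - 3.0)

# learning_rate = 0.1, momentum = 0.9; Nesterov support per the changelog.
optimizer = TensorStream::Train::MomentumOptimizer.new(0.1, 0.9, use_nesterov: true)
train_op = optimizer.minimize(loss)

sess = tf.session
sess.run(tf.global_variables_initializer)
50.times { sess.run(train_op) }
puts sess.run(x) # should approach 3.0
```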
data/README.md CHANGED
@@ -324,6 +324,41 @@ result = a + b
  File.write("model.pbtext", result.graph.as_graph_def)
  ```
 
+ ## Performance notes
+
+ Comparative benchmarks against other Ruby libraries have not yet been performed. However, it is
+ notable that TruffleRuby and ruby 2.6.0-preview2 with the --jit flag perform considerably better
+ than previous versions of Ruby (< 2.6).
+
+ Benchmarks from running samples/linear_regression.rb on an Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz:
+
+ ruby 2.4
+
+ ```
+ $ ruby -v
+ ruby 2.4.0p0 (2016-12-24 revision 57164) [x86_64-linux]
+ $ ruby samples/linear_regression.rb
+ 495 seconds 1000 epochs
+ ```
+
+ ruby 2.6.0-preview2
+
+ ```
+ $ ruby -v
+ ruby 2.6.0preview2 (2018-05-31 trunk 63539) [x86_64-linux]
+ $ ruby --jit samples/linear_regression.rb
+ 394 seconds 10000 epochs
+ ```
+
+ truffleruby
+
+ ```
+ $ ruby -v
+ truffleruby 1.0.0-rc5, like ruby 2.4.4, GraalVM CE Native [x86_64-linux]
+ 219 seconds 10000 epochs
+ ```
+
+ For training large networks that work on images, the OpenCL evaluator is the only way to go.
+
  ## Roadmap
 
  - Docs
data/lib/tensor_stream.rb CHANGED
@@ -20,8 +20,6 @@ require 'tensor_stream/operation'
  require 'tensor_stream/placeholder'
  require 'tensor_stream/control_flow'
  require 'tensor_stream/dynamic_stitch'
- require 'tensor_stream/train/utils'
- require 'tensor_stream/trainer'
  require 'tensor_stream/nn/nn_ops'
  require 'tensor_stream/evaluator/evaluator'
  require 'tensor_stream/graph_serializers/serializer'
@@ -31,6 +29,8 @@ require 'tensor_stream/graph_serializers/graphml'
  require 'tensor_stream/math_gradients'
  require "tensor_stream/debugging/debugging"
  require 'tensor_stream/utils'
+ require 'tensor_stream/train/utils'
+ require 'tensor_stream/trainer'
 
  # require 'tensor_stream/libraries/layers'
  require 'tensor_stream/monkey_patches/integer'
data/lib/tensor_stream/debugging/debugging.rb CHANGED
@@ -9,8 +9,9 @@ module TensorStream
        nodes_to_process.each do |node|
          node.inputs = node.inputs.collect do |input|
            next if input.nil?
+           next input if input.is_a?(Variable)
 
-           if TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
+           if input.is_a?(Tensor) && TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
              TensorStream.check_numerics(input, "#{node.name}/#{input.name}", name: "check/#{node.name}/#{input.name}")
            else
              input
data/lib/tensor_stream/dynamic_stitch.rb CHANGED
@@ -1,28 +1,27 @@
  module TensorStream
-   # Defines a TensorStream controlflow op
-   class DynamicStitch < Operation
-     attr_accessor :ops
-
-     def initialize(flow_type, inputs, ops = nil, options = {})
-       setup_initial_state(options)
-
-       @operation = :"flow_#{flow_type}"
-       @inputs = inputs
+   # Defines a TensorStream controlflow op
+   class DynamicStitch < Operation
+     attr_accessor :ops
 
-       @data_type = Tensor.detect_type(inputs[1])
-       @name = [@graph.get_name_scope, options[:name] || set_name].compact.join('/')
-       @ops = ops
-       @shape = TensorShape.new([inputs.size])
-       @graph.add_node(self)
-     end
-
-     def set_data_type(_passed_data_type)
-       :unknown
-     end
-
-     def run
-       eval
-     end
+     def initialize(flow_type, inputs, ops = nil, options = {})
+       setup_initial_state(options)
+
+       @operation = :"flow_#{flow_type}"
+       @inputs = inputs
+
+       @data_type = Tensor.detect_type(inputs[1])
+       @name = [@graph.get_name_scope, options[:name] || set_name].compact.join('/')
+       @ops = ops
+       @shape = TensorShape.new([inputs.size])
+       @graph.add_node(self)
+     end
+
+     def set_data_type(_passed_data_type)
+       :unknown
+     end
+
+     def run
+       eval
      end
    end
-
+ end
data/lib/tensor_stream/evaluator/base_evaluator.rb CHANGED
@@ -2,13 +2,14 @@ module TensorStream
    # Evaluator base module
    module Evaluator
      class OutputGroup
-       attr_accessor :outputs
-       def initialize(outputs = [])
+       attr_accessor :outputs, :data_types
+       def initialize(outputs = [], data_types = [])
          @outputs = outputs
+         @data_types = data_types
        end
      end
 
-     class UnsupportedOp < Exception
+     class UnsupportedOp < RuntimeError
        def initialize(tensor)
          @tensor = tensor
        end
@@ -111,22 +112,13 @@ module TensorStream
 
        resolved_inputs = tensor.inputs.map do |i|
          next if i.nil?
+         next i if op_options[:noop]
 
          if i.is_a?(Array)
-           next i.collect { |sub_item| sub_item.is_a?(Tensor) ? invoke(sub_item, execution_context) : sub_item }
+           next i.collect { |sub_item| sub_item.is_a?(Tensor) ? global_eval(tensor, sub_item, execution_context) : sub_item }
          end
 
-         if !op_options[:noop] && @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
-           cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
-           next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
-
-           result = @session.delegate_to_evaluator(i, @context, execution_context)
-           convert_from_buffer(i, result).tap do |buffer|
-             @context[:_cache][cache_key] = buffer if i.is_const
-           end
-         else
-           prepare_input(i, execution_context, op_options)
-         end
+         global_eval(tensor, i, execution_context, op_options)
        end
 
        instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
@@ -134,6 +126,23 @@ module TensorStream
 
      protected
 
+     def global_eval(tensor, input, execution_context, op_options = {})
+       return nil unless input
+       return input unless input.is_a?(Tensor)
+
+       if object_id != @context[:_cache][:placement][input.name][1].object_id # tensor is on another device or evaluator
+         cache_key = "#{tensor.graph.object_id}_#{input.name}:#{object_id}"
+         return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
+
+         result = @session.delegate_to_evaluator(input, @context, execution_context)
+         convert_from_buffer(input, result).tap do |buffer|
+           @context[:_cache][cache_key] = buffer if input.is_const
+         end
+       else
+         prepare_input(input, execution_context, op_options)
+       end
+     end
+
      def get_broadcast_gradient_args(input_a, input_b)
        return [[], []] if input_a == input_b
 
@@ -153,16 +162,16 @@ module TensorStream
        end
      end
 
-       [input_a_args.reverse, input_b_args.reverse]
+       [input_a_args.reverse, input_b_args.reverse]
      end
 
      ##
      # converts from a ruby Buffer object to the evaluator's native buffer format
-     def convert_from_buffer(tensor, result)
+     def convert_from_buffer(_tensor, _result)
        raise "need implementation"
      end
 
-     def prepare_input(tensor, context, options = {})
+     def prepare_input(_tensor, _context, _options = {})
        raise "need implementation"
      end
    end
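
A note on the OutputGroup change above: multi-output ops (such as the reworked softmax_cross_entropy_with_logits_v2 later in this diff) now carry a per-output list of data types alongside the outputs themselves, so `convert_from_buffer` implementations can convert each output with its own dtype when a result crosses evaluators. A minimal sketch, where `loss_buf` and `backprop_buf` are placeholders standing in for evaluator-native buffers:

```ruby
group = TensorStream::Evaluator::OutputGroup.new([loss_buf, backprop_buf], [:float32, :float32])

group.outputs.zip(group.data_types).each do |output, data_type|
  # convert `output` using its own `data_type` instead of assuming the op's dtype
end
```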
data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl ADDED
@@ -0,0 +1,16 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ // same dimension add floating point op
+ __kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
+                                           __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+     const int index = globalRow * N + globalCol;
+     <%= c_dtype %> acc_m = acc[index];
+     acc[index] = acc_m * momentum[0] + grad[index];
+     <% if nesterov %>
+       output[index] -= grad[index] * learning_rate[0] + acc_m * momentum[0] * learning_rate[0];
+     <% else %>
+       output[index] -= acc_m * learning_rate[0];
+     <% end %>
+ }
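
In update-rule form, writing g for the gradient, eta for the learning rate, mu for the momentum coefficient, theta for the variable and a for the accumulator slot, the kernel above computes the following (note that both variable updates use the accumulator value read before the in-place update):

```latex
a'      = \mu a + g
\theta' = \theta - \eta a               % standard momentum
\theta' = \theta - \eta g - \eta \mu a  % with use_nesterov
```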
data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl ADDED
@@ -0,0 +1,24 @@
+ % ctype = dtype_to_c_type(data_type)
+
+ __kernel void pack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
+     // Get the index of the current element to be processed
+     const int globalCol = get_global_id(0); // Col ID of C (0..N)
+
+     int start = index * <%= divisors[0] %>;
+     int ptr = start + globalCol;
+     int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
+
+     // compute effective coordinates
+     <% divisors.each_with_index do |div, index| %>
+     index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1 %>ptr = ptr % <%= div %>;<% end %><% end %>
+
+     // Apply axis translation if needed
+     <% if axis > 0 %>
+     int first = index_map[0];
+     <% axis.times do |i| %>
+     index_map[<%= i %>] = index_map[<%= (i + 1) %>];<% end %>
+     index_map[<%= axis %>] = first;
+     <% end %>
+
+     C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
+ }
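
The `divisors`, `multipliers` and `axis` bindings here are supplied by the new :stack op in opencl_evaluator.rb further down: `divisors` are row-major strides of the pre-rotation shape [inputs.size, *input_shape], and `multipliers` are strides of the final (axis-rotated) output shape. A hypothetical CPU-side mirror of the same index arithmetic, handy for sanity-checking the kernel logic (the helper name `pack_offset` is made up for illustration):

```ruby
# Map element `col` of input number `input_index` to its flat offset
# in the stacked output buffer.
def pack_offset(input_index, col, divisors, multipliers, axis)
  ptr = input_index * divisors[0] + col
  coords = divisors.map { |d| c = ptr / d; ptr %= d; c }  # flat index -> coordinates
  # rotate the leading (stack) coordinate into position `axis`, as the kernel does
  coords = coords[1..axis] + [coords[0]] + coords[(axis + 1)..-1] if axis > 0
  coords.zip(multipliers).sum { |c, m| c * m }            # coordinates -> output offset
end
```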
data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl CHANGED
@@ -1,9 +1,10 @@
+
  // First naive implementation
  % c_dtype = dtype_to_c_type(dtype)
  __kernel void softmax_cross_<%= dtype %>(const int N,
                                           const __global <%= c_dtype %>* A,
                                           const __global <%= c_dtype %>* L,
-                                          __global <%= c_dtype %>* C) {
+                                          __global <%= c_dtype %>* C, __global <%= c_dtype %>* P) {
 
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
@@ -24,4 +25,8 @@ __kernel void softmax_cross_<%= dtype %>(const int N,
    for (int k=0; k < N; k++) {
      C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
    }
+
+   for (int k=0; k < N; k++) {
+     P[globalRow*N + k] = (exp(A[globalRow*N + k] - max) / acc) - L[globalRow*N + k];
+   }
  }
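
With logits a_k, labels l_k, row max m and acc = sum_j exp(a_j - m), the two loops above compute, per row element:

```latex
s_k = \frac{e^{a_k - m}}{\sum_j e^{a_j - m}}, \qquad
C_k = \Bigl(\log\sum_j e^{a_j - m} - (a_k - m)\Bigr)\, l_k = -\log(s_k)\, l_k, \qquad
P_k = s_k - l_k
```

The new P buffer is exactly the softmax-minus-labels backprop term that the fixed softmax_cross_entropy_with_logits_v2 gradient (see the evaluator changes below) consumes.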
data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb CHANGED
@@ -16,10 +16,6 @@ module TensorStream
 
      def to_ruby
        return [] if buffer.empty?
-       if shape.empty?
-         return buffer[0] != 0 if data_type == :boolean
-         return buffer[0]
-       end
 
        if dirty
          op.command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
@@ -27,9 +23,13 @@ module TensorStream
          self.dirty = false
        end
 
+       if shape.empty?
+         return buffer[0] != 0 if data_type == :boolean
+         return buffer[0]
+       end
+
        result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
-       result = process_function_op(result, ->(a, _b) { a != 0 }) if data_type == :boolean
-       result
+       data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
      end
    end
  end
data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb CHANGED
@@ -30,6 +30,7 @@ module TensorStream
    ## PURE ruby evaluator used for testing and development
    class OpenclEvaluator < BaseEvaluator
      attr_accessor :retain
+     attr_reader :opencl_device
 
      include TensorStream::OpHelper
      include TensorStream::ArrayOpsHelper
@@ -51,20 +52,20 @@ module TensorStream
 
      def self.fetch_device(query = [])
        devices = query_devices_with_score
-       platform_devices = devices.select { |d| d[0].platform.to_s.gsub(' ','_').downcase =~ /#{query[0].downcase}/ }
+       platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
        opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
      end
 
      def self.opencl_to_device(d)
        device = d[0]
        index = d[3]
-       platform_name = device.platform.name.gsub(' ', '_').downcase
+       platform_name = device.platform.name.tr(' ', '_').downcase
        uri = [platform_name, index].join(':')
 
        device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
 
-       OpenclDevice.new(uri, device_type, self).tap do |d|
-         d.native_device = device
+       OpenclDevice.new(uri, device_type, self).tap do |devide|
+         devide.native_device = device
        end
      end
 
@@ -96,8 +97,14 @@ module TensorStream
        end
      end
 
+     # buffer comes from non-opencl evaluator
      def convert_from_buffer(tensor, result)
-       convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
+       if result.buffer.is_a?(TensorStream::Evaluator::OutputGroup)
+         converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map { |output, data_type| convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name) }
+         TensorStream::Evaluator::OutputGroup.new(converted_outputs, result.buffer.data_types)
+       else
+         convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
+       end
      end
 
      def complete_eval(tensor, context)
@@ -106,7 +113,7 @@ module TensorStream
        if buffer.is_a?(Array)
          buffer = buffer.collect do |b|
            next b if b.buffer.size.zero?
-           _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: [b.op].compact)
+           _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
            b
          end
        else
@@ -114,14 +121,30 @@ module TensorStream
          return buffer if buffer.nil?
          return [] if buffer.buffer.nil?
          return buffer if buffer.buffer.size.zero?
-         _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
+         _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
        end
        _opencl_queue.finish
        buffer
      end
 
-     def opencl_device
-       @opencl_device
+     def self.query_devices_with_score
+       OpenCL.platforms.flat_map do |p|
+         p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
+           score = 0
+           if d.type.to_s == 'CPU'
+             score += 1
+           elsif d.type.to_s == 'GPU'
+             score += 4
+           end
+
+           score += 1000 if d.platform.name == 'NVIDIA CUDA'
+
+           score += d.max_compute_units
+           score += d.max_clock_frequency
+
+           [d, score, p.name, index]
+         end
+       end
      end
 
      protected
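
Since `query_devices_with_score` now sits with `fetch_device` above the protected section, it can be used directly to inspect what the device delegator will choose from. A hedged sketch (actual platform and device names depend on the machine):

```ruby
# Each entry is [device, score, platform_name, device_index].
TensorStream::Evaluator::OpenclEvaluator.query_devices_with_score.each do |device, score, platform, index|
  puts format('%s #%d (%s) score=%d', platform, index, device.type, score)
end

# Query format: [platform_substring, device_index], spaces written as underscores.
device = TensorStream::Evaluator::OpenclEvaluator.fetch_device(['nvidia_cuda', 0])
```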
@@ -152,31 +175,9 @@ module TensorStream
        @opencl_context = OpenCL.create_context(opencl_device)
      end
 
-     def self.query_devices_with_score
-       OpenCL.platforms.flat_map do |p|
-
-         p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
-           score = 0
-           if d.type.to_s == 'CPU'
-             score += 1
-           elsif d.type.to_s == 'GPU'
-             score += 4
-           end
-
-           if d.platform.name == 'NVIDIA CUDA'
-             score += 1000
-           end
-
-           score += d.max_compute_units
-           score += d.max_clock_frequency
-
-           [d, score, p.name, index]
-         end
-       end
-     end
-
      def create_command_queue
        supported_proprties = opencl_device.queue_properties.names
+
        properties = []
        properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
        properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
@@ -196,7 +197,7 @@ module TensorStream
      end
 
      def _cl_program(kernel, args = {})
-       suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
+       suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
        @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
          filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
          raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
@@ -211,6 +212,13 @@ module TensorStream
        end
      end
 
+     def escape_arg_content(value)
+       return value.tr(' ', '_') if value.is_a?(String)
+       return value.join('-') if value.is_a?(Array)
+
+       value
+     end
+
      def _run(tensor, execution_context)
        return tensor if tensor.is_a?(OpenCLBuffer)
        return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array) && !tensor.size.empty? && tensor[0].is_a?(Tensor)
@@ -236,7 +244,7 @@ module TensorStream
        res
      end
 
-     def eval_variable(tensor, child_context)
+     def eval_variable(tensor, _child_context)
        raise "variable #{tensor.name} not initalized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
        tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
        tensor.buffer
@@ -259,7 +267,10 @@ module TensorStream
        end
      end
 
-     register_op :identity do |_context, _tensor, inputs|
+     register_op :identity do |context, tensor, inputs|
+       if tensor.inputs.size > 1
+         tensor.inputs[1..inputs.size].each { |input| complete_eval(input, context) }
+       end
        inputs[0]
      end
 
@@ -277,18 +288,19 @@ module TensorStream
        assign_var(tensor, value, context)
      end
 
+     register_op :variable, noop: true do |context, tensor, inputs|
+       variable = tensor.inputs[0]
+       raise "variable #{tensor.name} not initalized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
+       variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
+       variable.buffer
+     end
+
      # Fast in place multiply subtract assign
      register_op :apply_gradient_descent do |_context, tensor, inputs|
        _target_var, learning_rate, delta = inputs
 
        assign = tensor.inputs[0] || tensor
 
-       unless assign.buffer
-         value = read_final_result(buffer)
-         assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
-         assign.value = value
-       end
-
        assign.buffer.dirty = true # force buffer copy when variable is read externally
        output_buffer = assign.buffer
 
@@ -297,13 +309,39 @@ module TensorStream
        cl_m = OpenCL::Int1.new(m || 1)
        cl_n = OpenCL::Int1.new(n || 1)
 
-       event_wait_list = [assign.buffer.op, learning_rate.op, delta.op].compact # add dependency wait list
+       event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
        method_call = :"apply_gradient_#{output_buffer.data_type}"
        event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
        output_buffer.op = event
        output_buffer
      end
 
+     # Fast in place multiply subtract assign
+     register_op :apply_momentum do |_context, tensor, inputs|
+       target_var, momentum_var, learning_rate, grad, momentum = inputs
+
+       assign = tensor.inputs[0] || tensor
+       assign_acc = tensor.inputs[1]
+       assign.buffer.dirty = true # force buffer copy when variable is read externally
+       assign_acc.buffer.dirty = true # force buffer copy when variable is read externally
+
+       output_buffer = assign.buffer
+
+       m, n = output_buffer.shape
+       work_group = [m || 1, n || 1]
+       cl_m = OpenCL::Int1.new(m || 1)
+       cl_n = OpenCL::Int1.new(n || 1)
+
+       event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
+       method_call = :"apply_momentum_#{output_buffer.data_type}"
+       event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
+               send(method_call, _opencl_queue, work_group, cl_m, cl_n, grad.cl_buffer,
+                    learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
+                    assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
+       output_buffer.op = event
+       output_buffer
+     end
+
      %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
        register_op op, noop: true do |context, tensor, inputs|
          execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
@@ -332,7 +370,7 @@ module TensorStream
        a = inputs_queue.pop
        until inputs_queue.empty?
          b = inputs_queue.pop
-         event_wait_list = [a.op, b.op].compact
+         event_wait_list = build_event_wait_list([a, b])
          method_call = :"add_#{a.data_type}_#{b.data_type}"
          event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
          a = output_buffer
@@ -353,6 +391,23 @@ module TensorStream
        convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
      end
 
+     register_op :fill, buffer: true do |_context, tensor, inputs|
+       shape = inputs[0]
+       value = inputs[1]
+
+       narray_size = shape.buffer.to_a.reduce(:*) || 1
+       cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
+
+       buffer = if cl_buffer
+                  cl_buffer.buffer
+                else
+                  allocate_narray_for_type(tensor.data_type, narray_size)
+                end
+
+       buffer.fill!(value.buffer[0])
+       convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
+     end
+
      register_op :floor_div, noop: true do |context, tensor, inputs|
        if fp_type?(tensor.data_type)
          execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
@@ -374,8 +429,15 @@ module TensorStream
        v = b.shape[0]
        k = a.shape[1]
 
-       m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
-       n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]
+       if tensor.options[:transpose_a]
+         m = a.shape[1]
+         k = a.shape[0]
+       end
+
+       if tensor.options[:transpose_b]
+         n = b.shape[0]
+         v = b.shape[1]
+       end
 
        result_shape = [m, n]
 
@@ -393,8 +455,8 @@ module TensorStream
 
        transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
        transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
-
-       output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
+       event_wait_list = build_event_wait_list(inputs)
+       output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
        output_buffer
      end
 
@@ -406,14 +468,47 @@ module TensorStream
        cl_m = OpenCL::Int1.new(m || 1)
        cl_n = OpenCL::Int1.new(n || 1)
        work_group = [m || 1, n || 1]
-
-       buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
+       event_wait_list = build_event_wait_list(inputs)
+       buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
        buffer
      else
        a
      end
    end
 
+     register_op :stack do |_context, tensor, inputs|
+       axis = tensor.options[:axis] || 0
+       shape = inputs[0].shape
+       rank = shape.size + 1
+       elem_size = shape.empty? ? 1 : shape.reduce(:*)
+
+       new_shape = [inputs.size]
+       shape.inject(new_shape) { |ns, s| ns << s }
+
+       divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
+         a << s * a.last
+       end.reverse
+
+       axis = rank + axis if axis < 0
+       rotated_shape = Array.new(axis + 1) { new_shape.shift }
+       new_shape = rotated_shape.rotate! + new_shape
+
+       output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
+       multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
+         a << s * a.last
+       end.reverse
+
+       cl_n = OpenCL::Int1.new(elem_size)
+       work_group = [elem_size]
+       event_wait_list = build_event_wait_list(inputs)
+       ops = inputs.each_with_index.map do |input, index|
+         cl_index = OpenCL::Int1.new(index)
+         _cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+       end
+       output_buffer.op = ops
+       output_buffer
+     end
+
      %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil].each do |op|
        register_op op, noop: true do |context, tensor, inputs|
          execute_func(op.to_s, tensor, inputs[0], context)
@@ -422,7 +517,7 @@ module TensorStream
 
      register_op :softmax do |_context, tensor, inputs|
        a = inputs[0]
-       event_wait_list = [a.op].compact
+       event_wait_list = build_event_wait_list(inputs)
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -438,7 +533,7 @@ module TensorStream
 
      register_op :log_softmax do |_context, tensor, inputs|
        a = inputs[0] # logits
-       event_wait_list = [a.op].compact
+       event_wait_list = build_event_wait_list(inputs)
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -452,28 +547,33 @@ module TensorStream
        output_buffer
      end
 
-     register_op :softmax_cross_entropy_with_logits_v2 do |_context, tensor, inputs|
+     register_op :softmax_cross_entropy_with_logits_v2 do |context, tensor, inputs|
        a = inputs[0] # logits
        b = inputs[1] # labels
-       event_wait_list = [a.op, b.op].compact
+       event_wait_list = build_event_wait_list(inputs)
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
-
+       output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
+       rank = a.shape.size - 1
        m, n = a.shape
        work_group = [m]
        n = m if n.nil?
        cl_n = OpenCL::Int1.new(n || 1)
 
-       event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+       event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer,
+                                                               output_buffer.cl_buffer, output_buffer_backprop.cl_buffer, event_wait_list: event_wait_list)
        output_buffer.op = event
-       output_buffer
+       output_buffer_backprop.op = event
+
+       loss = reduction(context, tensor, output_buffer, rank, :sum)
+       OutputGroup.new([loss, output_buffer_backprop], [tensor.inputs[0].data_type, tensor.inputs[0].data_type])
      end
 
      register_op :softmax_cross_entropy_with_logits_v2_grad do |_context, tensor, inputs|
        a = inputs[0] # logits
        b = inputs[1] # labels
        c = inputs[2] # grads
-       event_wait_list = [a.op, b.op, c.op].compact
+       event_wait_list = build_event_wait_list(inputs)
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -490,7 +590,7 @@ module TensorStream
      register_op :softmax_grad do |_context, tensor, inputs|
        a, grad = inputs
 
-       event_wait_list = [a.op].compact
+       event_wait_list = build_event_wait_list(inputs)
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -508,7 +608,7 @@ module TensorStream
        name = tensor.options[:name]
 
        a.buffer.each do |input|
-         raise "#{name} Invalid Argument" if input.nan? || input.infinite?
+         raise TensorStream::InvalidArgumentError, "#{name} Invalid Argument" if input.nan? || input.infinite?
        end
        a
      end
@@ -522,8 +622,8 @@ module TensorStream
        input_a = read_final_result(complete_eval(a, context))
        input_b = read_final_result(complete_eval(b, context))
        b_a, b_b = broadcast(input_a, input_b)
-       [ wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
-         wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
+       [wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
+        wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
      end
    end
 
@@ -557,8 +657,22 @@ module TensorStream
 
      register_op :transpose, buffer: true do |_context, tensor, inputs|
        t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
-       transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
-       convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
+
+       if inputs[0].shape.size == 2 && inputs[1].nil?
+         transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
+         res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
+         res
+       else
+         rank = inputs[0].shape.size
+         perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
+         new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
+         output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
+         transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
+
+         write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
+         output_buffer.op = write_op
+         output_buffer
+       end
      end
 
      register_op :index, noop: true do |context, tensor, inputs|
@@ -567,39 +681,36 @@ module TensorStream
 
        if a.is_a?(OutputGroup)
          a.outputs[index]
+       elsif a.is_a?(Array)
+         a[index]
        else
-         if a.is_a?(Array)
-           a[index]
-         else
-           new_shape = a.shape.dup
-           new_shape.shift
-           input_a = read_final_result(a)
-           convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
-         end
+         new_shape = a.shape.dup
+         new_shape.shift
+         input_a = read_final_result(a)
+         convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
        end
      end
 
      register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
        rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
-       OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: "#{tensor.name}"), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")])
+       OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: tensor.name), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")], tensor.inputs.map(&:data_type))
      end
 
      register_op :shape do |_context, tensor, inputs|
        wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.data_type)
      end
 
-     register_op :reshape, buffer: true do |_context, _tensor, inputs|
+     register_op :reshape, buffer: true do |_context, tensor, inputs|
        arr = inputs[0]
        new_shape = read_final_result(inputs[1])
 
-       if new_shape.size.zero? && arr.buffer.size == 1
-         arr.shape = new_shape
-         arr
-       else
-         new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
-         arr.shape = new_shape
-         arr
-       end
+       shape = if new_shape.size.zero? && arr.buffer.size == 1
+                 new_shape
+               else
+                 TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
+               end
+
+       convert_to_opencl(arr.buffer, shape, data_type: arr.data_type, name: tensor.name)
      end
 
      register_op :flow_group do |_context, _tensor, inputs|
@@ -618,6 +729,7 @@ module TensorStream
 
      register_op :prod, noop: true do |context, tensor, inputs|
        input_a = complete_eval(inputs[0], context)
+
        if input_a.buffer.empty?
          convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
        else
@@ -646,13 +758,11 @@ module TensorStream
      end
 
      def eval_operation(tensor, child_context)
-
        cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
        return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
        return @context[cache_key] if @context.key?(cache_key)
-       # puts tensor.name
+       # puts "opencl: #{tensor.name}"
        invoke(tensor, child_context).tap do |result|
-         # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
          if tensor.breakpoint
            a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
            b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -676,9 +786,11 @@ module TensorStream
          @context[:_cache][cache_key] = result if tensor.is_const
        end
      rescue EvaluatorExcecutionException => e
-       raise e
+       _opencl_queue.finish # dump queue
+       raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
      rescue TensorStreamError => e
-       raise e
+       _opencl_queue.finish # dump queue
+       raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
      rescue StandardError => e
        _opencl_queue.finish # dump queue
        puts e.message
@@ -698,7 +810,7 @@ module TensorStream
        # File.write('/home/jedld/workspace/tensor_stream/samples/error.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
 
        # File.write('/Users/josephemmanueldayo/workspace/gradients.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
-       raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true,1)} defined at #{tensor.source}"
+       raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
      end
 
      def eval_tensor(tensor, child_context)
@@ -724,8 +836,9 @@ module TensorStream
 
        if assign.buffer
          # buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
+         event_wait_list = build_event_wait_list([buffer, assign.buffer])
          assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
-                              _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: [buffer.op, assign.buffer.op])
+                              _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: event_wait_list)
                             else
                               buffer.op
                             end
@@ -745,7 +858,6 @@ module TensorStream
        dtype = tensor.data_type
        result_shape = TensorShape.infer_shape(a.shape, b.shape)
        return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]
-
        output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
        a, b, prog, switch_operands = select_program(a, b, op_name)
        m, n = result_shape
@@ -754,21 +866,26 @@ module TensorStream
        cl_n = OpenCL::Int1.new(n || 1)
        cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition
 
-       event_wait_list = [a.op, b.op].compact # add dependency wait list
+       event_wait_list = build_event_wait_list([a, b]) # add dependency wait list
 
        method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
+       prog_name ||= op_name
        event = if prog == "#{op_name}_b"
-                 cl_m_b, cl_n_b = if b.shape.size == 2
-                   [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
-                 elsif b.shape.size == 1
-                   [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
-                 else
-                   raise "rank > 2 not supported!"
-                 end
-                 _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
-               else
-                 _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
-               end
+                 cl_m_b, cl_n_b = if b.shape.size == 2
+                                    [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
+                                  elsif b.shape.size == 1
+                                    [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
+                                  else
+                                    raise "rank > 2 not supported!"
+                                  end
+                 _cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype).
+                   send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b,
+                        cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+               else
+                 _cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype).
+                   send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch,
+                        a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+               end
 
        output_buffer.op = event
        output_buffer
@@ -789,14 +906,14 @@ module TensorStream
        cl_m = OpenCL::Int1.new(m || 1)
        cl_n = OpenCL::Int1.new(n || 1)
 
-       event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
+       event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
        output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
        output_buffer
      end
 
      def execute_func(op_name, tensor, a, child_context)
        a = _run(a, child_context)
-       event_wait_list = [a.op].compact
+       event_wait_list = build_event_wait_list([a])
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -814,7 +931,7 @@ module TensorStream
        return [a, b] if a.data_type == b.data_type
        m, n = b.shape
        work_group = [m || 1, n || 1]
-       event_wait_list = [b.op].compact
+       event_wait_list = build_event_wait_list([b])
        buffer = _create_result_buffer(b.data_type, b.shape, name)
 
        cl_m = OpenCL::Int1.new(m || 1)
@@ -848,6 +965,11 @@ module TensorStream
        convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
      end
 
+     def get_cached_buffer(name, shape)
+       cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
+       @context[:_cache][cache_key]
+     end
+
      def convert_to_opencl(value, shape, data_type: nil, name: nil)
        value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)
 
@@ -863,6 +985,8 @@ module TensorStream
          allocate_narray_for_type(data_type, narray_size)
        end
 
+       return nil if buffer.nil?
+
        cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
 
        cl_buffer = unless value.flatten.empty?
@@ -908,18 +1032,20 @@ module TensorStream
          NArray.sint(narray_size)
        when :boolean
          NArray.sint(narray_size)
+       when :unknown
+         nil
        else
          raise "unsupported type #{data_type}"
        end
      end
 
      def _create_result_buffer(data_type, shape, name)
-       return OpenCLBuffer.new(data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
+       return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
        @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}:#{object_id}"] ||= begin
          size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
          buffer = allocate_narray_for_type(data_type, size)
          cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
-         OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+         OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
        end
      end
 
@@ -969,7 +1095,7 @@ module TensorStream
 
      def reduction(child_context, tensor, a, b, func)
        input = complete_eval(a, child_context)
-       axis = read_final_result(complete_eval(b, child_context))
+       axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b
        if axis.nil?
          red = input.buffer.send(func)
          convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
@@ -1021,6 +1147,10 @@ module TensorStream
        shape.is_a?(Array) ? shape.size : 0
      end
 
+     def build_event_wait_list(inputs)
+       inputs.compact.map(&:op).flatten
+     end
+
      def resolve_placeholder(placeholder, _execution_context = {})
        return nil if placeholder.nil?