tensor_stream 0.3.0 → 0.4.0

This diff shows the changes between publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +7 -7
  3. data/CHANGELOG.md +13 -0
  4. data/Dockerfile +25 -0
  5. data/Rakefile +6 -0
  6. data/benchmark/benchmark.rb +16 -57
  7. data/benchmark_intel.txt +21 -0
  8. data/benchmark_nvidia.txt +33 -0
  9. data/lib/tensor_stream.rb +4 -173
  10. data/lib/tensor_stream/debugging/debugging.rb +20 -0
  11. data/lib/tensor_stream/evaluator/kernels/abs.cl +9 -5
  12. data/lib/tensor_stream/evaluator/kernels/add.cl +2 -4
  13. data/lib/tensor_stream/evaluator/kernels/argmax.cl +2 -9
  14. data/lib/tensor_stream/evaluator/kernels/argmin.cl +2 -9
  15. data/lib/tensor_stream/evaluator/kernels/cast.cl +3 -8
  16. data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +1 -1
  17. data/lib/tensor_stream/evaluator/kernels/cos.cl +2 -1
  18. data/lib/tensor_stream/evaluator/kernels/div.cl.erb +2 -4
  19. data/lib/tensor_stream/evaluator/kernels/exp.cl +2 -1
  20. data/lib/tensor_stream/evaluator/kernels/gemm.cl +8 -39
  21. data/lib/tensor_stream/evaluator/kernels/log.cl +2 -1
  22. data/lib/tensor_stream/evaluator/kernels/log1p.cl +2 -1
  23. data/lib/tensor_stream/evaluator/kernels/max.cl +4 -49
  24. data/lib/tensor_stream/evaluator/kernels/mul.cl +2 -4
  25. data/lib/tensor_stream/evaluator/kernels/negate.cl +2 -9
  26. data/lib/tensor_stream/evaluator/kernels/pow.cl +4 -88
  27. data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +2 -9
  28. data/lib/tensor_stream/evaluator/kernels/round.cl +2 -1
  29. data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +2 -1
  30. data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +6 -5
  31. data/lib/tensor_stream/evaluator/kernels/sign.cl +12 -14
  32. data/lib/tensor_stream/evaluator/kernels/sin.cl +2 -1
  33. data/lib/tensor_stream/evaluator/kernels/softmax.cl +26 -0
  34. data/lib/tensor_stream/evaluator/kernels/softmax_grad.cl +46 -0
  35. data/lib/tensor_stream/evaluator/kernels/sqrt.cl +2 -1
  36. data/lib/tensor_stream/evaluator/kernels/square.cl +2 -8
  37. data/lib/tensor_stream/evaluator/kernels/sub.cl +2 -4
  38. data/lib/tensor_stream/evaluator/kernels/tan.cl +2 -1
  39. data/lib/tensor_stream/evaluator/kernels/tanh.cl +2 -1
  40. data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +2 -1
  41. data/lib/tensor_stream/evaluator/kernels/where.cl +2 -9
  42. data/lib/tensor_stream/evaluator/opencl_evaluator.rb +108 -58
  43. data/lib/tensor_stream/evaluator/opencl_template_helper.rb +40 -5
  44. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +35 -0
  45. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +30 -9
  46. data/lib/tensor_stream/graph_serializers/graphml.rb +1 -1
  47. data/lib/tensor_stream/graph_serializers/pbtext.rb +4 -0
  48. data/lib/tensor_stream/math_gradients.rb +6 -5
  49. data/lib/tensor_stream/nn/nn_ops.rb +18 -2
  50. data/lib/tensor_stream/ops.rb +237 -44
  51. data/lib/tensor_stream/tensor.rb +16 -2
  52. data/lib/tensor_stream/utils.rb +205 -0
  53. data/lib/tensor_stream/variable.rb +2 -1
  54. data/lib/tensor_stream/version.rb +1 -1
  55. data/samples/error.graphml +2755 -0
  56. data/{test_samples → samples}/iris.rb +18 -24
  57. data/samples/logistic_regression.rb +0 -1
  58. data/test_samples/raw_neural_net_sample.rb +80 -23
  59. metadata +11 -3
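
Note: most of the kernel (.cl) changes that follow share one pattern: the separate hand-written *_fp and *_int kernels are replaced by a single ERB template (lines starting with % plus <%= %> tags) that the evaluator renders once per requested data type via dtype_to_c_type. A rough sketch of that rendering step, for illustration only; the helper mapping and the kernel body here are assumptions, not the gem's exact code:

    require 'erb'

    # Illustrative stand-in for the gem's OpenclTemplateHelper; the exact
    # dtype -> C type mapping here is an assumption.
    def dtype_to_c_type(dtype)
      { float32: 'float', float64: 'double', int32: 'int' }.fetch(dtype.to_sym)
    end

    template = <<~CL
      % c_dtype = dtype_to_c_type(dtype)
      __kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
        // kernel body omitted
      }
    CL

    dtype = :float32
    # trim_mode '%' lets lines that start with % run as plain Ruby,
    # matching how the .cl.erb templates below are written.
    puts ERB.new(template, trim_mode: '%').result(binding)
    # renders: __kernel void sin_float32(... __global const float *A, __global float *C) { ... }
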
@@ -1,4 +1,5 @@
- __kernel void round_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,5 +1,6 @@
+ % c_dtype = dtype_to_c_type(dtype)
 
- __kernel void sigmoid_fp(const int M, const int N, __global const float *A, __global float *C) {
+ __kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,14 +1,15 @@
+ % c_dtype = dtype_to_c_type(dtype)
 
- float sigmoid(float x) {
+ float sigmoid(<%= c_dtype %> x) {
  return 1.0f/(1.0f + exp(-x));
  }
 
- float sigmoid_grad(float x, float g) {
+ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
  return g * sigmoid(x) * ( 1.0f - sigmoid(x));
  }
 
  // same dimension add floating point op
- __kernel void sigmoid_grad_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void sigmoid_grad_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -17,7 +18,7 @@ float sigmoid_grad(float x, float g) {
  }
 
  // 1D + Scalar floating point add op
- __kernel void sigmoid_grad_c_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void sigmoid_grad_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -30,7 +31,7 @@ float sigmoid_grad(float x, float g) {
  }
 
  // 1D + Scalar floating point add op broadcast
- __kernel void sigmoid_grad_b_fp(const int M, const int N, const int M2, const int N2, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void sigmoid_grad_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,23 +1,21 @@
- __kernel void sign_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
- float value = A[globalRow * N + globalCol];
+ <%= c_dtype %> value = A[globalRow * N + globalCol];
+ % if is_floating_point?(dtype)
  if (isnan(value) || value == 0.0f) {
  C[globalRow * N + globalCol] = 0.0;
  } else {
  C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
  }
- }
-
- __kernel void sign_int(const int M, const int N, __global const int *A, __global int *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
- float value = A[globalRow * N + globalCol];
- if (isnan(value) || value == 0) {
- C[globalRow * N + globalCol] = 0;
- } else {
- C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
- }
+ % else
+ if (value == 0) {
+ C[globalRow * N + globalCol] = 0;
+ } else {
+ C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
+ }
+ % end
  }
@@ -1,5 +1,6 @@
 
- __kernel void sin_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -0,0 +1,26 @@
+ // First naive implementation
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ <%= c_dtype %> acc = 0.0f;
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ C[globalRow*N + k] = exp(A[globalRow*N + k] - max) / acc;
+ }
+ }
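
The new softmax kernel is the usual numerically stable formulation: each row is shifted by its row maximum before exponentiation so exp never overflows, then normalized by the sum of exponentials. The same per-row computation in plain Ruby, for reference only:

    # Row-wise softmax with the same max-shift trick the kernel uses.
    def softmax(row)
      max = row.max
      exps = row.map { |x| Math.exp(x - max) }
      sum = exps.sum
      exps.map { |e| e / sum }
    end

    softmax([1.0, 2.0, 3.0]) # => [0.0900..., 0.2447..., 0.6652...]
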
@@ -0,0 +1,46 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_grad_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ const __global <%= c_dtype %>* G,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ float acc = 0.0f;
+ float max = FLT_MIN;
+ float row[<%= size %>];
+ float grads[<%= size %>][<%= size %>];
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ row[k] = exp(A[globalRow*N + k] - max) / acc;
+ }
+
+ for (int a=0; a < N; a++) {
+ for(int b=0; b < N; b++) {
+ if (a != b) {
+ grads[a][b] = -row[a] * row[b];
+ } else {
+ grads[a][b] = row[a] * (1.0f - row[a]);
+ }
+ }
+ }
+
+ for (int k=0; k < N; k++) {
+ float total_grad = 0.0f;
+ for (int a = 0; a < N; a++) {
+ total_grad += grads[a][k] * G[globalRow*N + a];
+ }
+ C[globalRow*N + k] = total_grad;
+ }
+ }
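
The gradient kernel materializes the full per-row softmax Jacobian (grads[a][b] is -row[a]*row[b] off the diagonal and row[a]*(1 - row[a]) on it) and then folds the incoming gradient G through it. A compact Ruby rendering of the same math, for reference only:

    # Given a row's softmax output s and upstream gradient g,
    # C[k] = sum over a of J[a][k] * g[a], where J[a][b] = s[a] * ((a == b ? 1.0 : 0.0) - s[b]).
    def softmax_grad(s, g)
      n = s.size
      (0...n).map do |k|
        (0...n).reduce(0.0) { |acc, a| acc + s[a] * ((a == k ? 1.0 : 0.0) - s[k]) * g[a] }
      end
    end
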
@@ -1,5 +1,6 @@
+ % c_dtype = dtype_to_c_type(dtype)
 
- __kernel void sqrt_fp(const int M, const int N, __global const float *A, __global float *C) {
+ __kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,12 +1,6 @@
- __kernel void square_fp(const int M, const int N, __global const float *A, __global float *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
- }
+ % c_dtype = dtype_to_c_type(dtype)
 
- __kernel void square_int(const int M, const int N, __global const int *A, __global int *C) {
+ __kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,5 +1,3 @@
- % %w[fp int].product(%w[sub]).each do |dtype, fname|
  % c_dtype = dtype_to_c_type(dtype)
- % op = operator_to_c(fname)
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: c_dtype %>
- % end
+ % op = operator_to_c('sub')
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: dtype, result_t: c_dtype %>
@@ -1,4 +1,5 @@
- __kernel void tan_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,4 +1,5 @@
- __kernel void tanh_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,4 +1,5 @@
- __kernel void tanh_grad_fp(const int M, const int N, __global const float *A, __global float *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,12 +1,5 @@
- __kernel void where_fp(const int M, const int N, __global const int *PRED, __global const float *A, __global const float *B, __global float *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
- C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
- }
-
- __kernel void where_int(const int M, const int N, __global const int *PRED, __global const int *A, __global const int *B, __global int *C) {
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void where_<%= dtype %>(const int M, const int N, __global const int *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -41,20 +41,18 @@ module TensorStream
  @preferred_device = preferred_device
  @retain = context[:retain] || []
  @thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
-
+ @context[:_cache][:_cl_buffers] ||= {} if @context[:_cache]
  @context[:compute_history] = [] if log_intermediates
  end
 
  # opencl evaluator main entrypoint
  def run(tensor, execution_context)
  _create_opencl_context
- # _prepare_kernels
-
+ create_command_queue
  read_final_result(complete_eval(tensor, execution_context))
  end
 
  def complete_eval(tensor, context)
- create_command_queue
  buffer = _run(tensor, context)
  if buffer.is_a?(Array)
  buffer = buffer.collect do |b|
@@ -66,7 +64,6 @@ module TensorStream
  return buffer if buffer.nil? || buffer.buffer.size.zero?
  _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
  end
-
  _opencl_queue.finish
  buffer
  end
@@ -91,15 +88,18 @@ module TensorStream
  @preferred_device
  else
  device, _score, _platform, _index = choose_best_device
+ # puts "using #{device.name}"
  device
  end
  end
+ @context[:cl_device] = opencl_device
  @context[:_cache][:_opencl_context] ||= OpenCL.create_context(opencl_device)
  end
 
  def choose_best_device
  @best_device ||= begin
  devices = OpenCL.platforms.flat_map do |p|
+
  p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
  score = 0
  if d.type.to_s == 'CPU'
@@ -108,13 +108,18 @@ module TensorStream
  score += 4
  end
 
+ if d.platform.name == 'NVIDIA CUDA'
+ score += 1000
+ end
+
  score += d.max_compute_units
+ score += d.max_clock_frequency
 
  [d, score, p.name, index]
  end
  end
+ devices.sort { |a| a[1] }.reverse.first
  end
- devices.max { |a| a[1] }
  end
 
  def create_command_queue
@@ -137,11 +142,13 @@ module TensorStream
  File.join(File.dirname(__FILE__), 'kernels', "#{kernel}.#{extension}")
  end
 
- def _cl_program(kernel)
- @context[:_cache]["_opencl_kernel_#{kernel}"] ||= begin
+ def _cl_program(kernel, args = {})
+ suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
+ @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}"] ||= begin
  filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
  source = File.read(filename)
- source = OpenclTemplateHelper.new(source).generate
+ source = OpenclTemplateHelper.new(source).generate(args)
+ File.write("/tmp/#{kernel}.#{suffix}.cl", source)
  program = _opencl_context.create_program_with_source(source)
  program.build
  rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
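
Because each data type now produces different OpenCL source, compiled programs are cached per kernel name plus a suffix derived from the template arguments, so for example _cl_program('gemm', dtype: :float32) and _cl_program('gemm', dtype: :int32) are compiled and cached separately. The suffix is simply the args hash flattened into a dotted string, as a small illustration:

    # Mirrors the cache-key construction shown in the hunk above.
    args = { dtype: :float32 }
    suffix = args.collect { |k, v| "#{k}.#{v}" }.join('.')
    cache_key = "_opencl_kernel_gemm.#{suffix}"
    # => "_opencl_kernel_gemm.dtype.float32"
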
@@ -152,7 +159,9 @@ module TensorStream
 
  def _run(tensor, execution_context)
  return tensor if tensor.is_a?(OpenCLBuffer)
- return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array)
+ if tensor.is_a?(Array) && tensor.size > 0 && tensor[0].is_a?(Tensor)
+ return tensor.map { |t| _run(t, execution_context) }
+ end
 
  return tensor if retain.include?(tensor) # if var is in retain don't eval to value
 
@@ -180,10 +189,11 @@ module TensorStream
 
  def eval_operation(tensor, child_context)
  return @context[tensor.name] if @context.key?(tensor.name)
-
+ cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
+ return @context[cache_key] if @context.key?(cache_key)
  a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
  b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]
-
+ # puts tensor.name
  case tensor.operation
  when :concat
  input_a = read_final_result(complete_eval(a, child_context))
@@ -238,7 +248,6 @@ module TensorStream
  when :assign_add
  a = _run(a, child_context)
  b = _run(b, child_context)
-
  value = execute_2_operand_func('add', tensor, a, b, child_context)
  assign_var(tensor, value, child_context)
  when :assign_sub
@@ -290,8 +299,8 @@ module TensorStream
  raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
  raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
 
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
- a, b = type_cast(a, b)
+ dtype = tensor.data_type
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
 
  cl_m = OpenCL::Int1.new(m)
@@ -301,7 +310,7 @@ module TensorStream
  transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
  transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
 
- output_buffer.op = _cl_program('gemm').send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
+ output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
  output_buffer
  when :mul
  execute_2_operand_func('mul', tensor, a, b, child_context)
@@ -311,14 +320,12 @@ module TensorStream
  a = _run(a, child_context)
  if a.data_type != tensor.data_type
  buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
- s_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type) ? 'fp' : 'int'
- t_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
  m, n = a.shape
  cl_m = OpenCL::Int1.new(m || 1)
  cl_n = OpenCL::Int1.new(n || 1)
  work_group = [m || 1, n || 1]
 
- buffer.op = _cl_program("cast").send(:"cast_#{s_dtype}_#{t_dtype}",_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
+ buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
  buffer
  else
  a
@@ -355,6 +362,34 @@ module TensorStream
  execute_func('log1p', tensor, a, child_context)
  when :round
  execute_func('round', tensor, a, child_context)
+ when :softmax
+ a = _run(a, child_context)
+ event_wait_list = [a.op].compact
+ dtype = tensor.data_type
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+ m, n = a.shape
+ work_group = [m]
+ n = m if n.nil?
+ cl_n = OpenCL::Int1.new(n || 1)
+
+ event = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = event
+ output_buffer
+ when :softmax_grad
+ a = _run(a, child_context)
+ grad = _run(b, child_context)
+ event_wait_list = [a.op].compact
+ dtype = tensor.data_type
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+ m, n = a.shape
+ work_group = [m]
+ n = m if n.nil?
+ cl_n = OpenCL::Int1.new(n || 1)
+ event = _cl_program("softmax_grad", dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = event
+ output_buffer
  when :sigmoid_grad
  execute_2_operand_func('sigmoid_grad', tensor, a, b, child_context)
  when :truncate
@@ -381,6 +416,14 @@ module TensorStream
  end
  end
  end
+ when :check_numerics
+ a = complete_eval(a, child_context)
+ name = tensor.options[:name]
+
+ a.buffer.each do |item|
+ raise "#{name} Invalid Argument" if item.nan? || item.infinite?
+ end
+ a
  when :zeros, :ones, :zeros_like, :ones_like
  shape = if %i[zeros_like ones_like].include?(tensor.operation)
  _run(a, child_context).shape
@@ -551,6 +594,7 @@ module TensorStream
  else
  raise "unknown op #{tensor.operation}"
  end.tap do |result|
+ # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
  if tensor.breakpoint
  a = read_final_result(complete_eval(a, child_context))
  b = read_final_result(complete_eval(b, child_context))
@@ -568,11 +612,13 @@ module TensorStream
  value: result
  }
  end
+ @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
  @context[tensor.name] = result
  end
  rescue EvaluatorExcecutionException => e
  raise e
  rescue StandardError => e
+ _opencl_queue.finish # dump queue
  puts e.message
  puts e.backtrace.join("\n")
 
@@ -612,8 +658,12 @@ module TensorStream
  def assign_var(tensor, b, child_context)
  assign = tensor.items[0] || tensor
  buffer = complete_eval(b, child_context)
+
  if assign.buffer
- assign.buffer.op = _opencl_queue.enqueue_write_buffer(assign.buffer.cl_buffer, buffer.buffer)
+ buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
+ if assign.buffer.cl_buffer != buffer.cl_buffer
+ assign.buffer.op = _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: [buffer.op, assign.buffer.op])
+ end
  else
  assign.buffer = convert_to_opencl(read_final_result(buffer), buffer.shape, data_type: tensor.data_type, name: tensor.name)
  end
@@ -624,8 +674,8 @@ module TensorStream
  def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
  a = _run(input_a, child_context)
  b = _run(input_b, child_context)
- a, b = type_cast(a, b)
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
+ dtype = tensor.data_type
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
 
  output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
@@ -646,9 +696,9 @@ module TensorStream
  else
  raise "rank > 2 not supported!"
  end
- _cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  else
- _cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  end
 
  output_buffer.op = event
@@ -660,8 +710,8 @@ module TensorStream
  a = _run(input_a, child_context)
  b = _run(input_b, child_context)
 
- a, b = type_cast(a, b)
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
+ dtype = tensor.data_type
 
  output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)
 
@@ -671,14 +721,14 @@ module TensorStream
  cl_n = OpenCL::Int1.new(n || 1)
 
  event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
- output_buffer.op = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer
  end
 
  def execute_func(op_name, tensor, a, child_context)
  a = _run(a, child_context)
- event_wait_list = [a.op].compact
- dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+ event_wait_list = [a.op].compact
+ dtype = tensor.data_type
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
  m, n = a.shape
@@ -686,43 +736,37 @@ module TensorStream
  cl_m = OpenCL::Int1.new(m || 1)
  cl_n = OpenCL::Int1.new(n || 1)
 
- event = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ event = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer.op = event
  output_buffer
  end
 
- def type_cast(a, b)
+ def auto_type_cast(a, b, name: nil)
  return [a, b] if a.data_type == b.data_type
  m, n = b.shape
  work_group = [m || 1, n || 1]
- buffer = buffer_for(b.shape, b.data_type)
- if (TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type.to_sym))
- if TensorStream::Ops::INTEGER_TYPES.include?(b.data_type.to_sym)
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
+ event_wait_list = [b.op].compact
+ buffer = _create_result_buffer(b.data_type, b.shape, name)
 
- _cl_program("cast").cast_int_fp(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
- return [a, buffer]
- end
- elsif TensorStream::Ops::INTEGER_TYPES.include?(a.data_type.to_sym)
- if TensorStream::Ops::FLOATING_POINT_TYPES.include?(b.data_type.to_sym)
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
- _cl_program("cast").cast_fp_int(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
- return [a, buffer]
- end
- end
+ cl_m = OpenCL::Int1.new(m || 1)
+ cl_n = OpenCL::Int1.new(n || 1)
 
- [a, b]
+ buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: b.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
+ [a, buffer]
  end
 
- def buffer_for(shape, data_type)
- size = shape.empty? ? 1 : shape.reduce(:*)
+ def type_cast(source, data_type, name: nil)
+ return source if source.data_type == data_type
+ m, n = source.shape
+ work_group = [m || 1, n || 1]
+ event_wait_list = [source.op].compact
+ buffer = _create_result_buffer(data_type, source.shape, name)
 
- buffer = allocate_narray_for_type(data_type, size)
+ cl_m = OpenCL::Int1.new(m || 1)
+ cl_n = OpenCL::Int1.new(n || 1)
 
- cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
- OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+ buffer.op = _cl_program("cast", source_dt: source.data_type, target_dt: data_type).cast(_opencl_queue, work_group, cl_m, cl_n, source.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
+ buffer
  end
 
  end
@@ -786,11 +830,16 @@ module TensorStream
  end
 
  def allocate_narray_for_type(data_type, narray_size)
- if TensorStream::Ops::FLOATING_POINT_TYPES.include?(data_type.to_sym) || TensorStream::Ops::FLOATING_POINT_TYPES.include?(data_type.to_sym)
+ case data_type
+ when :float, :float32
  NArray.sfloat(narray_size)
- elsif TensorStream::Ops::INTEGER_TYPES.include?(data_type.to_sym) || TensorStream::Ops::INTEGER_TYPES.include?(data_type.to_sym)
+ when :float64
+ NArray.float(narray_size)
+ when :int, :int32, :int64
  NArray.int(narray_size)
- elsif data_type.to_sym == :boolean
+ when :int16
+ NArray.sint(narray_size)
+ when :boolean
  NArray.int(narray_size)
  else
  raise "unsupported type #{data_type}"
@@ -798,7 +847,7 @@ module TensorStream
  end
 
  def _create_result_buffer(data_type, shape, name)
- @context[:_cache]["_result_#{name}_#{shape.join('_')}"] ||= begin
+ @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}"] ||= begin
  size = shape.empty? ? 1 : shape.reduce(:*)
  buffer = allocate_narray_for_type(data_type, size)
  cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
@@ -843,7 +892,8 @@ module TensorStream
  input = complete_eval(a, child_context)
  axis = read_final_result(complete_eval(b, child_context))
  if axis.nil?
- convert_to_opencl(input.buffer.send(func), [], data_type: tensor.data_type, name: tensor.name)
+ red = input.buffer.send(func)
+ convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
  else
  return input if input.shape.empty?
  value = input.buffer.reshape(*input.shape.reverse)