RubyGems - tensor_stream-opencl - Versions diffs - 0.1.3 → 0.2.0 - Mend

tensor_stream-opencl 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)

checksums.yaml +4 -4
data/Gemfile.lock +11 -4
data/benchmark/benchmark.rb +91 -0
data/benchmark_intel.txt +36 -0
data/lib/tensor_stream/opencl/array_ops.rb +395 -0
data/lib/tensor_stream/opencl/images_ops.rb +62 -0
data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
data/lib/tensor_stream/opencl/version.rb +1 -1
data/samples/iris.data +150 -0
data/samples/iris.rb +110 -0
data/samples/mnist_data.rb +65 -0
data/samples/multigpu.rb +73 -0
data/samples/nearest_neighbor.rb +56 -0
data/samples/rnn.rb +108 -0
data/tensor_stream-opencl.gemspec +4 -1
metadata +62 -3

data/lib/tensor_stream/opencl/kernels/split_n.cl ADDED Viewed

@@ -0,0 +1,18 @@
+% ctype = dtype_to_c_type(data_type)
+% mul_str = mul.each_with_index.collect { |mul, index| "#{mul} * index_map_#{index}" }
+__kernel void split(const int offset, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalCol = get_global_id(0); // Col ID of C (0..N)
+    // compute effective coordinates
+    int ptr = globalCol;
+<% div.each_with_index do |div, index| %>
+    <% if index == axis %>
+    int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>) + <%= step %>;
+    <% else %>
+    int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>);
+    <% end %>
+    <% if index < div.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
+    C[offset + globalCol] =  A[<%= mul_str.join(" + ") %>];
+}

data/lib/tensor_stream/opencl/kernels/sqrt.cl CHANGED Viewed

@@ -1,9 +1,8 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void sqrt_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0);
-    C[globalRow * N + globalCol] = sqrt(A[globalRow * N + globalCol]);
+    C[id] = sqrt(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/square.cl CHANGED Viewed

@@ -1,9 +1,8 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void square_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0);
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
+    C[id] = A[id] * A[id];
 }

data/lib/tensor_stream/opencl/kernels/tan.cl CHANGED Viewed

@@ -1,8 +1,7 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void tan_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0); // Row ID of C (0..M)
-    C[globalRow * N + globalCol] = tan(A[globalRow * N + globalCol]);
+    C[id] = tan(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/tanh.cl CHANGED Viewed

@@ -1,8 +1,7 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void tanh_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0); // Row ID of C (0..M)
-    C[globalRow * N + globalCol] = tanh(A[globalRow * N + globalCol]);
+    C[id] = tanh(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/tanh_grad.cl CHANGED Viewed

@@ -1,7 +1,6 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void tanh_grad_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
+    const int id = get_global_id(0);
+    C[id] = 1 - tanh(A[id]) * tanh(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/unpack.cl ADDED Viewed

@@ -0,0 +1,23 @@
+% ctype = dtype_to_c_type(data_type)
+__kernel void unpack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalCol = get_global_id(0); // Col ID of C (0..N)
+    int start = index * <%= divisors[0] %>;
+    int ptr = start + globalCol;
+    int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
+    // compute effective coordinates
+<% divisors.each_with_index do |div, index| %>
+    index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
+    // Apply axis translation if needed
+<% if axis > 0 %>
+    int last = index_map[<%= axis %>];
+<% axis.downto(1) do |i| %> index_map[<%= i %>] = index_map[<%= (i - 1) %>];<% end %>
+    index_map[0] = last;
+<% end%>
+    C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
+}

data/lib/tensor_stream/opencl/nn_ops.rb CHANGED Viewed

@@ -14,14 +14,15 @@ module TensorStream
             assign.buffer.dirty = true # force buffer copy when variable is read externally
             output_buffer = assign.buffer
-            m, n = output_buffer.shape
-            work_group = [m || 1, n || 1]
-            cl_m = OpenCL::Int1.new(m || 1)
-            cl_n = OpenCL::Int1.new(n || 1)
+            work_group = [output_buffer.total_elements]
             event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
-            method_call = :"apply_gradient_#{output_buffer.data_type}"
-            event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+            event = call_program("apply_gradient", output_buffer.data_type,
+                           work_group,
+                           delta.cl_buffer,
+                           learning_rate.cl_buffer,
+                           output_buffer.cl_buffer, event_wait_list: event_wait_list)
             output_buffer.op = event
             output_buffer
           end
@@ -37,15 +38,12 @@ module TensorStream
             output_buffer = assign.buffer
-            m, n = output_buffer.shape
-            work_group = [m || 1, n || 1]
-            cl_m = OpenCL::Int1.new(m || 1)
-            cl_n = OpenCL::Int1.new(n || 1)
+            work_group = [output_buffer.total_elements]
             event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
             method_call = :"apply_momentum_#{output_buffer.data_type}"
             event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
-                        send(method_call, _opencl_queue, work_group, cl_m, cl_n, grad.cl_buffer,
+                        send(method_call, _opencl_queue, work_group, grad.cl_buffer,
                             learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
                             assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
             output_buffer.op = event
@@ -66,15 +64,11 @@ module TensorStream
             output_buffer = assign.buffer
-            m, n = output_buffer.shape
-            work_group = [m || 1, n || 1]
-            cl_m = OpenCL::Int1.new(m || 1)
-            cl_n = OpenCL::Int1.new(n || 1)
+            work_group = [output_buffer.total_elements]
             event_wait_list = build_event_wait_list(inputs)
-            method_call = :"apply_adadelta_#{output_buffer.data_type}"
-            event = _cl_program('apply_adadelta', dtype: output_buffer.data_type)
-                                .send(method_call, _opencl_queue, work_group, cl_m, cl_n,
+            event = call_program('apply_adadelta', output_buffer.data_type,
+                                      work_group,
                                       lr.cl_buffer,
                                       rho.cl_buffer,
                                       epsilon.cl_buffer,
@@ -104,15 +98,11 @@ module TensorStream
             output_buffer = assign.buffer
-            m, n = output_buffer.shape
-            work_group = [m || 1, n || 1]
-            cl_m = OpenCL::Int1.new(m || 1)
-            cl_n = OpenCL::Int1.new(n || 1)
+            work_group = [output_buffer.total_elements]
             event_wait_list = build_event_wait_list(inputs)
-            method_call = :"apply_adam_#{output_buffer.data_type}"
-            event = _cl_program("apply_adam", dtype: output_buffer.data_type)
-                                .send(method_call, _opencl_queue, work_group, cl_m, cl_n,
+            event = call_program("apply_adam", output_buffer.data_type,
+                                      work_group,
                                       grad.cl_buffer,
                                       lr_t.cl_buffer,
                                       beta1_power.cl_buffer,
@@ -130,6 +120,99 @@ module TensorStream
             output_buffer
           end
+          register_op :apply_adagrad do |context, tensor, inputs|
+            target_var, accum, lr, grad = inputs
+            assign = tensor.inputs[0] || tensor
+            assign_acc = tensor.inputs[1]
+            assign.buffer.dirty = true
+            assign_acc.buffer.dirty = true
+            output_buffer = assign.buffer
+            work_group = [output_buffer.total_elements]
+            event_wait_list = build_event_wait_list(inputs)
+            event = call_program('apply_adagrad',
+                                      output_buffer.data_type,
+                                      work_group,
+                                      lr.cl_buffer,
+                                      grad.cl_buffer,
+                                      assign.buffer.cl_buffer,
+                                      assign_acc.buffer.cl_buffer,
+                                      event_wait_list: event_wait_list)
+            output_buffer.op = event
+            assign_acc.buffer.op = event
+            output_buffer
+          end
+          register_op :apply_centered_rms_prop do |context, tensor, inputs|
+            var, mg, ms, mom, lr, rho, momentum, epsilon, grad = inputs
+            assign = tensor.inputs[0]
+            assign_mg = tensor.inputs[1]
+            assign_ms = tensor.inputs[2]
+            assign_mom = tensor.inputs[3]
+            assign.buffer.dirty = true
+            assign_mg.buffer.dirty = true
+            assign_ms.buffer.dirty = true
+            assign_mom.buffer.dirty = true
+            output_buffer = assign.buffer
+            event_wait_list = build_event_wait_list(inputs)
+            work_group = [output_buffer.total_elements]
+            event = call_program('apply_centered_rms_prop', output_buffer.data_type, work_group,
+                            lr.cl_buffer,
+                            rho.cl_buffer,
+                            momentum.cl_buffer,
+                            epsilon.cl_buffer,
+                            grad.cl_buffer,
+                            assign.buffer.cl_buffer,
+                            assign_ms.buffer.cl_buffer,
+                            assign_mg.buffer.cl_buffer,
+                            assign_mom.buffer.cl_buffer,
+                            event_wait_list: event_wait_list)
+            output_buffer.op = event
+            assign_mg.buffer.op = event
+            assign_ms.buffer.op = event
+            assign_mom.buffer.op = event
+            output_buffer
+          end
+          register_op :apply_rms_prop do |context, tensor, inputs|
+            var, ms, mom, lr, rho, momentum, epsilon, grad = inputs
+            assign = tensor.inputs[0]
+            assign_ms = tensor.inputs[1]
+            assign_mom = tensor.inputs[2]
+            assign.buffer.dirty = true
+            assign_ms.buffer.dirty = true
+            assign_mom.buffer.dirty = true
+            output_buffer = assign.buffer
+            event_wait_list = build_event_wait_list(inputs)
+            work_group = [output_buffer.total_elements]
+            event = call_program('apply_rms_prop', output_buffer.data_type,
+                            work_group,
+                            lr.cl_buffer,
+                            rho.cl_buffer,
+                            momentum.cl_buffer,
+                            epsilon.cl_buffer,
+                            grad.cl_buffer,
+                            assign.buffer.cl_buffer,
+                            assign_ms.buffer.cl_buffer,
+                            assign_mom.buffer.cl_buffer,
+                            event_wait_list: event_wait_list)
+            output_buffer.op = event
+            assign_ms.buffer.op = event
+            assign_mom.buffer.op = event
+            output_buffer
+          end
           register_op :softmax do |_context, tensor, inputs|
             a = inputs[0]
             event_wait_list = build_event_wait_list(inputs)
@@ -213,7 +296,9 @@ module TensorStream
             work_group = [m]
             n = m if n.nil?
             cl_n = OpenCL::Int1.new(n || 1)
-            event = _cl_program('softmax_grad', dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+            event = _cl_program('softmax_grad', dtype: dtype, size: n).
+                        send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer,
+                             grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
             output_buffer.op = event
             output_buffer
           end

data/lib/tensor_stream/opencl/opencl_buffer.rb CHANGED Viewed

@@ -14,6 +14,14 @@ module TensorStream
       @op = op
     end
+    def total_elements
+      shape.reduce(:*) || 1
+    end
+    def empty_value?
+      @shape == [0]
+    end
     def to_ruby
       return [] if buffer.empty?
@@ -24,6 +32,7 @@ module TensorStream
       end
       if shape.empty?
+        return buffer.to_s if data_type == :string
         return buffer[0] != 0 if data_type == :boolean
         return buffer[0]
       end

data/lib/tensor_stream/opencl/opencl_evaluator.rb CHANGED Viewed

@@ -11,6 +11,8 @@ require 'narray_ffi'
 require 'tensor_stream/evaluator/base_evaluator'
 require 'tensor_stream/opencl/math_ops'
 require 'tensor_stream/opencl/nn_ops'
+require 'tensor_stream/opencl/images_ops'
+require 'tensor_stream/opencl/array_ops'
 require 'tensor_stream/helpers/op_helper'
 module TensorStream
@@ -32,7 +34,8 @@ module TensorStream
       end
     end
-    ## PURE ruby evaluator used for testing and development
+    ##
+    # PURE ruby evaluator used for testing and development
     class OpenclEvaluator < BaseEvaluator
       attr_accessor :retain
       attr_reader :opencl_device
@@ -42,6 +45,8 @@ module TensorStream
       include TensorStream::MathHelper
       include TensorStream::OpenCLHelpers::MathOps
       include TensorStream::OpenCLHelpers::NNOps
+      include TensorStream::OpenCLHelpers::ImagesOps
+      include TensorStream::OpenCLHelpers::ArrayOps
       def initialize(session, device, thread_pool: nil, log_intermediates: false)
         super
@@ -86,7 +91,10 @@ module TensorStream
       # opencl evaluator main entrypoint
       def run(tensor, execution_context)
-        read_final_result(complete_eval(tensor, execution_context))
+         result = complete_eval(tensor, execution_context)
+        #  puts "wait finish"
+        _opencl_queue.finish
+        read_final_result(result)
       end
       def run_with_buffer(tensor, context, execution_context)
@@ -117,9 +125,9 @@ module TensorStream
       def enqueue_buffer_read(tensor, context)
         buffer = _run(tensor, context)
         if buffer.is_a?(Array)
-          buffer = buffer.collect do |b|
+          buffer.collect do |b|
             next b if b.buffer.size.zero?
-            _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
+            b.op = _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
             b
           end
         else
@@ -127,14 +135,18 @@ module TensorStream
           return buffer if buffer.nil?
           return [] if buffer.buffer.nil?
           return buffer if buffer.buffer.size.zero?
-          _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
+          buffer.op = _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
           buffer
         end
       end
       def complete_eval(tensor, context)
+        return nil if tensor.nil?
         buffer = enqueue_buffer_read(tensor, context)
-        _opencl_queue.finish
+        events = build_event_wait_list([buffer])
+        # puts "wait #{tensor.name}"
+        OpenCL.wait_for_events(events) unless events.empty?
         buffer
       end
@@ -162,6 +174,7 @@ module TensorStream
       def prepare_input(tensor, context, options = {})
         return nil unless tensor
         tensor = resolve_placeholder(tensor)
         if options[:noop]
           tensor
@@ -210,11 +223,17 @@ module TensorStream
       def _cl_program(kernel, args = {})
         suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
         @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
-          filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
-          raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
-          source = File.read(filename)
-          source = OpenclTemplateHelper.new(source).generate(args)
-          # File.write("/tmp/#{kernel}.#{suffix}.cl", source)
+          file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
+          source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
+                     File.read(file_path)
+                   else
+                     filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+                     raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
+                     source = File.read(filename)
+                     source = OpenclTemplateHelper.new(source).generate(args)
+                     File.write(file_path, source) if ENV['TS_OPENCL_FILE_CACHE']
+                     source
+                   end
           program = _opencl_context.create_program_with_source(source)
           program.build
         rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
@@ -275,10 +294,10 @@ module TensorStream
       end
       register_op :identity do |context, tensor, inputs|
-        if tensor.inputs.size > 1
-          tensor.inputs[1..inputs.size].each { |input| complete_eval(input, context) }
-        end
-        inputs[0]
+        value = inputs[0]
+        buffer = OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
+        buffer.op = build_event_wait_list(inputs)
+        buffer
       end
       register_op :assign, noop: true do |context, tensor, inputs|
@@ -308,86 +327,11 @@ module TensorStream
         end
       end
-      register_op :expand_dims, buffer: true do |_context, tensor, inputs|
-        axis = inputs[1].buffer[0]
-        shape = inputs[0].shape.dup
-        axis = -axis if axis == shape.size
-        new_shape = shape.insert(axis, 1).compact
-        new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
-        convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
-      end
-      register_op :fill, buffer: true do |_context, tensor, inputs|
-        shape = inputs[0]
-        value = inputs[1]
-        narray_size = shape.buffer.to_a.reduce(:*) || 1
-        cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
-        buffer = if cl_buffer
-                   cl_buffer.buffer
-                 else
-                   allocate_narray_for_type(tensor.data_type, narray_size)
-                 end
-        buffer.fill!(value.buffer[0])
-        convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
-      end
       register_op :where, noop: true do |context, tensor, inputs|
         pred = tensor.options[:pred]
         execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
       end
-      register_op :cast do |_context, tensor, inputs|
-        a = inputs[0]
-        if a.data_type != tensor.data_type
-          buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
-          m, n = a.shape
-          cl_m = OpenCL::Int1.new(m || 1)
-          cl_n = OpenCL::Int1.new(n || 1)
-          work_group = [m || 1, n || 1]
-          event_wait_list = build_event_wait_list(inputs)
-          buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
-          buffer
-        else
-          a
-        end
-      end
-      register_op :stack do |_context, tensor, inputs|
-        axis = tensor.options[:axis] || 0
-        shape = inputs[0].shape
-        rank = shape.size + 1
-        elem_size = shape.empty? ? 1 : shape.reduce(:*)
-        new_shape = [inputs.size]
-        shape.inject(new_shape) { |ns, s| ns << s }
-        divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
-          a << s * a.last
-        end.reverse
-        axis = rank + axis if axis < 0
-        rotated_shape = Array.new(axis + 1) { new_shape.shift }
-        new_shape = rotated_shape.rotate! + new_shape
-        output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
-        multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
-          a << s * a.last
-        end.reverse
-        cl_n = OpenCL::Int1.new(elem_size)
-        work_group = [elem_size]
-        event_wait_list = build_event_wait_list(inputs)
-        ops = inputs.each_with_index.map do |input, index|
-          cl_index = OpenCL::Int1.new(index)
-          _cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
-        end
-        output_buffer.op = ops
-        output_buffer
-      end
       register_op :check_numerics, noop: true do |context, tensor, inputs|
         a = complete_eval(inputs[0], context)
         name = tensor.options[:name]
@@ -420,86 +364,18 @@ module TensorStream
         a
       end
-      register_op :rank do |_context, tensor, inputs|
-        wrap_opencl(inputs[0].shape.size, data_type: tensor.data_type, name: tensor.name)
-      end
       register_op :stop_gradient do |_context, _tensor, inputs|
         inputs[0]
       end
-      register_op :slice, noop: true do |context, tensor, inputs|
-        input_a = complete_eval(inputs[0], context)
-        input_b = read_final_result(complete_eval(inputs[1], context))
-        size = tensor.options[:size]
-        slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse
-        new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
-        sliced = new_buf.slice[*slice_param]
-        convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
-      end
-      register_op :transpose, buffer: true do |_context, tensor, inputs|
-        t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
-        if inputs[0].shape.size == 2 && inputs[1].nil?
-          transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
-          res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
-          res
-        else
-          rank = inputs[0].shape.size
-          perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
-          new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
-          output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
-          transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
-          write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
-          output_buffer.op = write_op
-          output_buffer
-        end
-      end
-      register_op :index, noop: true do |context, tensor, inputs|
-        a = _run(inputs[0], context)
-        index = read_final_result(_run(inputs[1], context))
-        if a.is_a?(OutputGroup)
-          a.outputs[index]
-        elsif a.is_a?(Array)
-          a[index]
-        else
-          new_shape = a.shape.dup
-          new_shape.shift
-          input_a = read_final_result(a)
-          convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
-        end
-      end
       register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
         rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
         OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: tensor.name), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")], tensor.inputs.map(&:data_type))
       end
-      register_op :shape do |_context, tensor, inputs|
-        wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.data_type)
-      end
-      register_op :reshape, buffer: true do |_context, tensor, inputs|
-        arr = inputs[0]
-        new_shape = read_final_result(inputs[1])
-        shape = if new_shape.size.zero? && arr.buffer.size == 1
-                  new_shape
-                else
-                  TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
-                end
-        convert_to_opencl(arr.buffer, shape, data_type: arr.data_type, name: tensor.name)
-      end
-      register_op :flow_group do |context, _tensor, inputs|
-        _opencl_queue.finish
+      register_op :flow_group do |_context, _tensor, inputs|
+        events = build_event_wait_list(inputs)
+        OpenCL.wait_for_events(events) unless events.empty?
         nil
       end
@@ -657,7 +533,10 @@ module TensorStream
         cl_n = OpenCL::Int1.new(n || 1)
         event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
-        output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
+                                        send(:"#{op_name}_#{dtype}", _opencl_queue, work_group,
+                                              cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer,
+                                              output_buffer.cl_buffer, event_wait_list: event_wait_list)
         output_buffer
       end
@@ -667,16 +546,17 @@ module TensorStream
         dtype = tensor.data_type
         output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
-        m, n = a.shape
-        work_group = [m || 1, n || 1]
-        cl_m = OpenCL::Int1.new(m || 1)
-        cl_n = OpenCL::Int1.new(n || 1)
+        work_group = [a.total_elements]
-        event = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        event = call_program(op_name, dtype, work_group, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
         output_buffer.op = event
         output_buffer
       end
+      def call_program(name, dtype, work_group, *args)
+        _cl_program(name.to_s, dtype: dtype).send(:"#{name}_#{dtype}", _opencl_queue, work_group, *args)
+      end
       def auto_type_cast(a, b, name: nil)
         return [a, b] if a.data_type == b.data_type
         m, n = b.shape
@@ -728,16 +608,20 @@ module TensorStream
                       @context[:_cache][cache_key]
                     else
                       narray_size = shape.reduce(:*) || 1
+                      cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
                       buffer = if value.is_a?(NArray)
                                  value
+                               elsif data_type == :string && shape.empty?
+                                 cl_buffer_size = value[0].bytesize
+                                 allocate_narray_for_type(data_type, value[0].bytesize)
                                else
                                  allocate_narray_for_type(data_type, narray_size)
                                end
                       return nil if buffer.nil?
-                      cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
                       cl_buffer = unless value.flatten.empty?
                                     cl_buffer_size = 1 if cl_buffer_size.zero?
@@ -746,8 +630,11 @@ module TensorStream
                       @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
                     end
-        if value.is_a?(Array)
+        if data_type == :string
+          value[0].each_byte.with_index do |c, index|
+            cl_object.buffer[index] = c
+          end
+        elsif value.is_a?(Array)
           value.flatten.each_with_index do |element, index|
             cl_object.buffer[index] = if element.is_a?(Tensor)
                                         read_final_result(complete_eval(element, {}))
@@ -765,7 +652,10 @@ module TensorStream
           cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
         end
-        write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer) if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
+        if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
+          write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
+        end
         cl_object.op = write_op
         cl_object
       end
@@ -780,8 +670,12 @@ module TensorStream
           NArray.int(narray_size)
         when :int16
           NArray.sint(narray_size)
+        when :uint8
+          NArray.byte(narray_size)
         when :boolean
           NArray.byte(narray_size)
+        when :string
+          NArray.byte(narray_size)
         when :unknown
           nil
         else
@@ -799,6 +693,65 @@ module TensorStream
         end
       end
+      # Returns the cached buffer for slice number +index+ of +parent_buffer+,
+      # using an OpenCL sub-buffer of the parent when one can be created.
+      #
+      # The buffer object is built once per cache key (parent object id, name,
+      # index and this object's id) and reused afterwards; its +op+ is
+      # refreshed on every call.
+      #
+      # @param parent_buffer buffer the slice comes from; must respond to
+      #   +cl_buffer+ and +op+ (presumably an OpenCLBuffer - confirm with callers)
+      # @param index [Integer] slice position; the byte offset is
+      #   index * slice element count * element size
+      # @param data_type [Symbol] element type passed to allocate_narray_for_type
+      # @param shape [Array<Integer>] slice shape; [] and [0] are sized as one element
+      # @param name [String] name given to the resulting buffer
+      # @return the slice buffer with its +op+ set
+      def _create_result_sub_buffer(parent_buffer, index, data_type, shape, name)
+        cache_key = "_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
+        @context[:_cache][:_cl_buffers][cache_key] ||= begin
+          size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
+          buffer = allocate_narray_for_type(data_type, size)
+          # NOTE(review): a nil associated_memobject presumably means the parent is
+          # not itself a sub-buffer, so a region can be carved out of it - confirm.
+          if parent_buffer.cl_buffer.associated_memobject.nil?
+            start = index * buffer.size * buffer.element_size
+            region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
+            cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
+            OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
+          else
+            # Fallback: standalone result buffer. Uses the data_type argument; the
+            # previous code referenced `tensor`, which is not defined in this method.
+            _create_result_buffer(data_type, shape, name)
+          end
+        end
+        sub_buffer = @context[:_cache][:_cl_buffers][cache_key]
+        if sub_buffer.cl_buffer.associated_memobject
+          # Sub-buffer of the parent: it only needs to wait on the parent's op.
+          sub_buffer.op = parent_buffer.op
+        else
+          # Standalone buffer: enqueue a copy of the slice's bytes from the parent.
+          region_size_in_bytes = sub_buffer.buffer.size * sub_buffer.buffer.element_size
+          start = index * region_size_in_bytes
+          region = [region_size_in_bytes, 1, 1]
+          sub_buffer.op = _opencl_queue.enqueue_copy_buffer_rect(parent_buffer.cl_buffer, sub_buffer.cl_buffer, region, src_origin: [start, 0, 0], event_wait_list: parent_buffer.op)
+        end
+        sub_buffer
+      end
+      # Returns the cached buffer for a slice of +parent_buffer+ whose byte
+      # offset and byte length are supplied by the caller, so slices may differ
+      # in size. Uses an OpenCL sub-buffer of the parent when one can be created.
+      #
+      # The buffer object is built once per cache key (parent object id, name,
+      # index and this object's id) and reused afterwards; its +op+ is
+      # refreshed on every call.
+      #
+      # @param parent_buffer buffer the slice comes from; must respond to
+      #   +cl_buffer+ and +op+ (presumably an OpenCLBuffer - confirm with callers)
+      # @param index [Integer] slice number; only used in the cache key
+      # @param start [Integer] byte offset of the slice inside the parent
+      # @param region_size_in_bytes [Integer] byte length of the slice
+      # @param data_type [Symbol] element type passed to allocate_narray_for_type
+      # @param shape [Array<Integer>] slice shape; [] and [0] are sized as one element
+      # @param name [String] base name; the sub-buffer variant is named "<name>/sub"
+      # @return the slice buffer with its +op+ set
+      def _create_variable_result_sub_buffer(parent_buffer, index, start, region_size_in_bytes, data_type, shape, name)
+        cache_key = "_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
+        @context[:_cache][:_cl_buffers][cache_key] ||= begin
+          size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
+          buffer = allocate_narray_for_type(data_type, size)
+          # NOTE(review): a nil associated_memobject presumably means the parent is
+          # not itself a sub-buffer, so a region can be carved out of it - confirm.
+          if parent_buffer.cl_buffer.associated_memobject.nil?
+            region = OpenCL::BufferRegion::new(start, region_size_in_bytes)
+            cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
+            OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: "#{name}/sub")
+          else
+            # Fallback: standalone result buffer. Uses the data_type argument; the
+            # previous code referenced `tensor`, which is not defined in this method.
+            _create_result_buffer(data_type, shape, name)
+          end
+        end
+        sub_buffer = @context[:_cache][:_cl_buffers][cache_key]
+        if sub_buffer.cl_buffer.associated_memobject
+          # Sub-buffer of the parent: it only needs to wait on the parent's op.
+          sub_buffer.op = parent_buffer.op
+        else
+          # Standalone buffer: enqueue a copy of the requested byte range from the parent.
+          region = [region_size_in_bytes, 1, 1]
+          sub_buffer.op = _opencl_queue.enqueue_copy_buffer_rect(parent_buffer.cl_buffer, sub_buffer.cl_buffer, region, src_origin: [start, 0, 0], event_wait_list: parent_buffer.op)
+        end
+        sub_buffer
+      end
       def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
         if target_axis == current_axis
           if a[0].is_a?(Array)
@@ -898,7 +851,11 @@ module TensorStream
       end
       def build_event_wait_list(inputs)
-        inputs.compact.map(&:op).flatten
+        if inputs.is_a?(Array) # a list of inputs, possibly nested
+          inputs.flatten.compact.map(&:op).compact.uniq # flatten nesting, drop nil inputs and nil ops, remove duplicate ops
+        else
+          inputs.op ? [inputs.op] : [] # single input: wrap its op in an array, or return [] when it has none
+        end
       end
       def resolve_placeholder(placeholder, _execution_context = {})