RubyGems - tensor_stream-opencl - Versions diffs - 0.1.3 → 0.2.0 - Mend

tensor_stream-opencl 0.1.3 → 0.2.0

Files changed (48) hide show

checksums.yaml +4 -4
data/Gemfile.lock +11 -4
data/benchmark/benchmark.rb +91 -0
data/benchmark_intel.txt +36 -0
data/lib/tensor_stream/opencl/array_ops.rb +395 -0
data/lib/tensor_stream/opencl/images_ops.rb +62 -0
data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
data/lib/tensor_stream/opencl/version.rb +1 -1
data/samples/iris.data +150 -0
data/samples/iris.rb +110 -0
data/samples/mnist_data.rb +65 -0
data/samples/multigpu.rb +73 -0
data/samples/nearest_neighbor.rb +56 -0
data/samples/rnn.rb +108 -0
data/tensor_stream-opencl.gemspec +4 -1
metadata +62 -3

data/lib/tensor_stream/opencl/images_ops.rb ADDED Viewed

@@ -0,0 +1,62 @@
+# require 'oily_png'
+module TensorStream
+  module OpenCLHelpers
+    module ImagesOps
+      def ImagesOps.included(klass)
+        klass.class_eval do
+          register_op :decode_png do |context, tensor, inputs|
+            content = _run(inputs[0], context)
+            channels = tensor.options[:channels]
+            channels = 4 if channels.zero?
+            image = ChunkyPNG::Image.from_blob(content.buffer.to_a.pack('C*'))
+            output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
+            image.grayscale! if channels == 1
+            image.pixels.each_with_index do |pixel, index|
+              start_index = index * channels
+              if channels == 4
+                output_buffer.buffer[start_index] = ChunkyPNG::Color.r(pixel)
+                output_buffer.buffer[start_index + 1] = ChunkyPNG::Color.g(pixel)
+                output_buffer.buffer[start_index + 2] = ChunkyPNG::Color.b(pixel)
+                output_buffer.buffer[start_index + 3] = ChunkyPNG::Color.a(pixel)
+              elsif channels == 3
+                output_buffer.buffer[start_index] = ChunkyPNG::Color.r(pixel)
+                output_buffer.buffer[start_index + 1] = ChunkyPNG::Color.g(pixel)
+                output_buffer.buffer[start_index + 2] = ChunkyPNG::Color.b(pixel)
+              elsif channels == 1
+                output_buffer.buffer[start_index] = ChunkyPNG::Color.r(pixel)
+              else
+                raise "Invalid channel value #{channels}"
+              end
+            end
+            write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
+            output_buffer.op = write_op
+            output_buffer
+          end
+          register_op :encode_png do |_context, tensor, inputs|
+            image_data = inputs[0]
+            height, width, channels = image_data.shape
+            image_buffer = image_data.buffer.reshape(*image_data.shape.reverse).to_a
+\
+            png = ChunkyPNG::Image.new(width, height)
+            image_buffer.each_with_index do |rows, h_index|
+              rows.each_with_index do |p_data, w_index|
+                if channels == 4
+                  png[w_index, h_index] = ChunkyPNG::Color.rgba(p_data[0], p_data[1], p_data[2], p_data[3])
+                elsif channels == 3
+                  png[w_index, h_index] = ChunkyPNG::Color.rgb(p_data[0], p_data[1], p_data[2])
+                elsif channels == 1
+                  png[w_index, h_index] = ChunkyPNG::Color.rgb(p_data[0], p_data[0], p_data[0])
+                end
+              end
+            end
+            convert_to_opencl(png.to_s, [], data_type: :string, name: tensor.name)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/tensor_stream/opencl/kernels/abs.cl CHANGED Viewed

@@ -1,20 +1,18 @@
 % c_dtype = dtype_to_c_type(dtype)
 % if TensorStream::Ops::FLOATING_POINT_TYPES.include?(dtype)
-__kernel void abs_<%= dtype%>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void abs_<%= dtype%>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0); // Row ID of C (0..M)
-    C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
+    C[id] = fabs(A[id]);
 }
 % else
 % %w[int int32].each do |dt|
-__kernel void abs_<%= dt %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void abs_<%= dt %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0); // Row ID of C (0..M)
-    C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
+    C[id] = fabs((float)A[id]);
 }
 % end
 %end

data/lib/tensor_stream/opencl/kernels/acos.cl CHANGED Viewed

@@ -1,8 +1,7 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void acos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void acos_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0);
-    C[globalRow * N + globalCol] = acos(A[globalRow * N + globalCol]);
+    C[id] = acos(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl CHANGED Viewed

@@ -1,6 +1,6 @@
 % c_dtype = dtype_to_c_type(dtype)
  // same dimension add floating point op
- __kernel void apply_adadelta_<%= dtype %>(const int M, const int N,
+ __kernel void apply_adadelta_<%= dtype %>(
                                        __global const <%= c_dtype %> *lr,
                                        __global const <%= c_dtype %> *rho,
                                        __global const <%= c_dtype %> *epsilon,
@@ -10,9 +10,7 @@
                                        __global <%= c_dtype %> *acc_update
                                        ) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    const int index = globalRow * N + globalCol;
+    const int index = get_global_id(0);
     acc[index] = acc[index] * rho[0] + (grad[index] * grad[index]) * ((<%= c_dtype %>)1 - rho[0]);
     const <%= c_dtype %> update = sqrt(acc_update[index] + epsilon[0]) * rsqrt(acc[index] + epsilon[0]) * grad[index];

data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl ADDED Viewed

@@ -0,0 +1,12 @@
+% c_dtype = dtype_to_c_type(dtype)
+ // Adagrad step, one work-item per element: output -= grad * lr[0] * rsqrt(acc).
+ // NOTE(review): acc is writable but only read here; presumably it is accumulated
+ // elsewhere before this kernel runs - confirm against the caller.
+ __kernel void apply_adagrad_<%= dtype %>(
+                                        __global const <%= c_dtype %> *lr,
+                                       __global const <%= c_dtype %> *grad,
+                                       __global <%= c_dtype %> *output,
+                                       __global <%= c_dtype %> *acc
+                                       ) {
+    // Element handled by this work-item (global id along dimension 0)
+    const int i = get_global_id(0);
+    const <%= c_dtype %> scaled_grad = grad[i] * lr[0] * rsqrt(acc[i]);
+    output[i] -= scaled_grad;
+ }

data/lib/tensor_stream/opencl/kernels/apply_adam.cl CHANGED Viewed

@@ -1,6 +1,6 @@
 % c_dtype = dtype_to_c_type(dtype)
  // same dimension add floating point op
- __kernel void apply_adam_<%= dtype %>(const int M, const int N,
+ __kernel void apply_adam_<%= dtype %>(
                                        __global const <%= c_dtype %> *grad,
                                        __global const <%= c_dtype %> *learning_rate,
                                        __global const <%= c_dtype %> *beta1_power,
@@ -11,10 +11,7 @@
                                        __global <%= c_dtype %> *momentum,
                                        __global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    const int index = globalRow * N + globalCol;
+    const int index = get_global_id(0);
     <%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
     momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);

data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl ADDED Viewed

@@ -0,0 +1,19 @@
+% c_dtype = dtype_to_c_type(dtype)
+ // Centered RMSProp step, one work-item per element.
+ // ms and mg are running means of grad^2 and grad; the update is scaled by
+ // 1 / sqrt(ms - mg^2 + epsilon), accumulated into mom and subtracted from output.
+ // lr, rho, momentum and epsilon are read from element 0 of their buffers.
+ __kernel void apply_centered_rms_prop_<%= dtype %>(__global const <%= c_dtype %> *lr,
+                                           __global const <%= c_dtype %> *rho,
+                                           __global const <%= c_dtype %> *momentum,
+                                           __global const <%= c_dtype %> *epsilon,
+                                           __global const <%= c_dtype %> *grad,
+                                           __global <%= c_dtype %> *output,
+                                           __global <%= c_dtype %> *ms,
+                                           __global <%= c_dtype %> *mg,
+                                           __global <%= c_dtype %> *mom) {
+    // Get the index of the current element to be processed
+    const int id = get_global_id(0);
+    ms[id] += (grad[id] * grad[id] - ms[id]) * (1.0 - rho[0]);
+    // Accumulate into mg (+=) so the running mean keeps its history, and do it
+    // before denom is formed so the variance estimate uses the refreshed mean.
+    mg[id] += (grad[id] - mg[id]) * (1.0 - rho[0]);
+    <%= c_dtype %> denom = ms[id] - mg[id] * mg[id] + epsilon[0];
+    mom[id] = mom[id] * momentum[0] + (grad[id] * lr[0]) / sqrt(denom);
+    output[id] -= mom[id];
+ }

data/lib/tensor_stream/opencl/kernels/apply_gradient.cl CHANGED Viewed

@@ -1,9 +1,8 @@
 % c_dtype = dtype_to_c_type(dtype)
  // same dimension add floating point op
- __kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ __kernel void apply_gradient_<%= dtype %>(__global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0);
-    C[globalRow * N + globalCol] -= A[globalRow * N + globalCol] * B[0];
+    C[id] -= A[id] * B[0];
 }

data/lib/tensor_stream/opencl/kernels/apply_momentum.cl CHANGED Viewed

@@ -1,11 +1,9 @@
 % c_dtype = dtype_to_c_type(dtype)
  // same dimension add floating point op
- __kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
+ __kernel void apply_momentum_<%= dtype %>(__global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
                                           __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    const int index = globalRow * N + globalCol;
+    const int index = get_global_id(0);
     <%= c_dtype %> acc_m = acc[index];
     acc[index] = acc_m * momentum[0] + grad[index];
 <% if nesterov %>

data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl ADDED Viewed

@@ -0,0 +1,16 @@
+% c_dtype = dtype_to_c_type(dtype)
+ // RMSProp step, one work-item per element: refreshes the running mean of squared
+ // gradients (ms), folds the scaled gradient into mom, then subtracts mom from output.
+ // lr, rho, momentum and epsilon are read from element 0 of their buffers.
+ __kernel void apply_rms_prop_<%= dtype %>(__global const <%= c_dtype %> *lr,
+                                           __global const <%= c_dtype %> *rho,
+                                           __global const <%= c_dtype %> *momentum,
+                                           __global const <%= c_dtype %> *epsilon,
+                                           __global const <%= c_dtype %> *grad,
+                                           __global <%= c_dtype %> *output,
+                                           __global <%= c_dtype %> *ms,
+                                           __global <%= c_dtype %> *mom) {
+    const int i = get_global_id(0);
+    const <%= c_dtype %> g = grad[i];
+    // ms moves toward g^2 by a factor of (1 - rho)
+    ms[i] += (g * g - ms[i]) * (1.0 - rho[0]);
+    // the square root below sees the ms value written just above
+    mom[i] = mom[i] * momentum[0] + (g * lr[0]) / sqrt(ms[i] + epsilon[0]);
+    output[i] -= mom[i];
+ }

data/lib/tensor_stream/opencl/kernels/asin.cl CHANGED Viewed

@@ -1,9 +1,8 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void asin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0);
-    C[globalRow * N + globalCol] = asin(A[globalRow * N + globalCol]);
+    C[id] = asin(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/ceil.cl CHANGED Viewed

@@ -1,8 +1,7 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void ceil_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void ceil_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0);
-    C[globalRow * N + globalCol] = ceil(A[globalRow * N + globalCol]);
+    C[id] = ceil(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/concat.cl ADDED Viewed

@@ -0,0 +1,21 @@
+% ctype = dtype_to_c_type(data_type)
+// Copies every element of A into C at the position it occupies once A is placed
+// at offset `step` along the template's `axis`. N and index are not read by the body.
+__kernel void concat(const int N, const int index, const int step, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
+    // Flat index into A of the element handled by this work-item
+    const int globalCol = get_global_id(0);
+    int ptr = globalCol;
+    // Peel ptr into one coordinate per dimension. ptr is never negative, so plain
+    // integer division already floors and, unlike a float divide, stays exact for
+    // indices above 2^24.
+<% divisors.each_with_index do |div, index| %>
+    <% if axis == index %>
+        int index_map_<%= index %> = ptr / <%= div %> + step;
+    <% else %>
+        int index_map_<%= index %> = ptr / <%= div %>;
+    <% end %>
+    <% if index < divisors.size - 1%>
+        ptr = ptr % <%= div %>;
+    <% end %>
+<% end %>
+    C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map_#{idx}" }.join(' + ') %>] = A[globalCol];
+}

data/lib/tensor_stream/opencl/kernels/cos.cl CHANGED Viewed

@@ -1,8 +1,6 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void cos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void cos_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = cos(A[globalRow * N + globalCol]);
+    const int id = get_global_id(0);
+    C[id] = cos(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/exp.cl CHANGED Viewed

@@ -1,8 +1,6 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void exp_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void exp_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = exp(A[globalRow * N + globalCol]);
+    const int id = get_global_id(0);
+    C[id] = exp(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/floor.cl CHANGED Viewed

@@ -1,8 +1,7 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void floor_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void floor_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0);
-    C[globalRow * N + globalCol] = floor(A[globalRow * N + globalCol]);
+    C[id] = floor(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/log.cl CHANGED Viewed

@@ -1,8 +1,7 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void log_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void log_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0);
-    C[globalRow * N + globalCol] = log(A[globalRow * N + globalCol]);
+    C[id] = log(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/log1p.cl CHANGED Viewed

@@ -1,8 +1,7 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void log1p_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void log1p_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0); // Row ID of C (0..M)
-    C[globalRow * N + globalCol] = log1p(A[globalRow * N + globalCol]);
+    C[id] = log1p(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/negate.cl CHANGED Viewed

@@ -1,8 +1,7 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void negate_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void negate_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0); // Row ID of C (0..M)
-    C[globalRow * N + globalCol] = -A[globalRow * N + globalCol];
+    C[id] = -A[id];
 }

data/lib/tensor_stream/opencl/kernels/reciprocal.cl CHANGED Viewed

@@ -1,8 +1,7 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void reciprocal_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void reciprocal_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0);
-    C[globalRow * N + globalCol] = 1 / A[globalRow * N + globalCol];
+    C[id] = 1 / A[id];
 }

data/lib/tensor_stream/opencl/kernels/sigmoid.cl CHANGED Viewed

@@ -1,9 +1,8 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void sigmoid_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0);
-    C[globalRow * N + globalCol] = 1.0f/(1.0f + exp(-A[globalRow * N + globalCol]));
+    C[id] = 1.0f/(1.0f + exp(-A[id]));
 }

data/lib/tensor_stream/opencl/kernels/sign.cl CHANGED Viewed

@@ -1,21 +1,20 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void sign_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    <%= c_dtype %> value = A[globalRow * N + globalCol];
+    const int id = get_global_id(0);
+    <%= c_dtype %> value = A[id];
 % if floating_point?(dtype)
     if (isnan(value) || value == 0.0f) {
-      C[globalRow * N + globalCol] = 0.0;
+      C[id] = 0.0;
     } else {
-      C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
+      C[id] = value < 0 ? -1.0 : 1.0;
     }
 % else
   if (value == 0) {
-    C[globalRow * N + globalCol] = 0;
+    C[id] = 0;
   } else {
-    C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
+    C[id] = value < 0 ? -1 : 1;
   }
 % end
 }

data/lib/tensor_stream/opencl/kernels/sin.cl CHANGED Viewed

@@ -1,9 +1,8 @@
 % c_dtype = dtype_to_c_type(dtype)
-__kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+__kernel void sin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    const int id = get_global_id(0); // Row ID of C (0..M)
-    C[globalRow * N + globalCol] = sin(A[globalRow * N + globalCol]);
+    C[id] = sin(A[id]);
 }

data/lib/tensor_stream/opencl/kernels/split.cl ADDED Viewed

@@ -0,0 +1,17 @@
+% ctype = dtype_to_c_type(data_type)
+% mul_str = mul.each_with_index.map { |m, idx| "#{m} * index_map_#{idx}" }
+// Gathers slices of A into C: global id 0 selects the slice, global id 1 the element
+// inside it, and each slice occupies N consecutive elements of C.
+__kernel void split(const int N, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalCol = get_global_id(0); // which slice
+    const int localCol = get_global_id(1);  // element within the slice
+    // Compute effective coordinates in A. ptr is never negative, so plain integer
+    // division already floors and, unlike a float divide, stays exact above 2^24.
+    int ptr = localCol;
+<% dest.each_with_index do |div, index| %>
+    <% if index == axis %>
+    int index_map_<%= index %> = ptr / <%= div %> + globalCol * <%= step %>;
+    <% else %>
+    int index_map_<%= index %> = ptr / <%= div %>;
+    <% end %>
+    <% if index < dest.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
+    C[N*globalCol + localCol] =  A[<%= mul_str.join(" + ") %>];
+}