tensor_stream-opencl 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +11 -4
  3. data/benchmark/benchmark.rb +91 -0
  4. data/benchmark_intel.txt +36 -0
  5. data/lib/tensor_stream/opencl/array_ops.rb +395 -0
  6. data/lib/tensor_stream/opencl/images_ops.rb +62 -0
  7. data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
  8. data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
  9. data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
  10. data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
  11. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
  12. data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
  13. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
  14. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
  15. data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
  16. data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
  17. data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
  18. data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
  19. data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
  20. data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
  21. data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
  22. data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
  23. data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
  24. data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
  25. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
  26. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
  27. data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
  28. data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
  29. data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
  30. data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
  31. data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
  32. data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
  33. data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
  34. data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
  35. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
  36. data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
  37. data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
  38. data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
  39. data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
  40. data/lib/tensor_stream/opencl/version.rb +1 -1
  41. data/samples/iris.data +150 -0
  42. data/samples/iris.rb +110 -0
  43. data/samples/mnist_data.rb +65 -0
  44. data/samples/multigpu.rb +73 -0
  45. data/samples/nearest_neighbor.rb +56 -0
  46. data/samples/rnn.rb +108 -0
  47. data/tensor_stream-opencl.gemspec +4 -1
  48. metadata +62 -3
@@ -0,0 +1,62 @@
1
+ # require 'oily_png'
2
+ module TensorStream
3
+ module OpenCLHelpers
4
+ module ImagesOps
5
+ def ImagesOps.included(klass)
6
+ klass.class_eval do
7
+ register_op :decode_png do |context, tensor, inputs|
8
+ content = _run(inputs[0], context)
9
+ channels = tensor.options[:channels]
10
+ channels = 4 if channels.zero?
11
+
12
+ image = ChunkyPNG::Image.from_blob(content.buffer.to_a.pack('C*'))
13
+ output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
14
+
15
+ image.grayscale! if channels == 1
16
+ image.pixels.each_with_index do |pixel, index|
17
+ start_index = index * channels
18
+ if channels == 4
19
+ output_buffer.buffer[start_index] = ChunkyPNG::Color.r(pixel)
20
+ output_buffer.buffer[start_index + 1] = ChunkyPNG::Color.g(pixel)
21
+ output_buffer.buffer[start_index + 2] = ChunkyPNG::Color.b(pixel)
22
+ output_buffer.buffer[start_index + 3] = ChunkyPNG::Color.a(pixel)
23
+ elsif channels == 3
24
+ output_buffer.buffer[start_index] = ChunkyPNG::Color.r(pixel)
25
+ output_buffer.buffer[start_index + 1] = ChunkyPNG::Color.g(pixel)
26
+ output_buffer.buffer[start_index + 2] = ChunkyPNG::Color.b(pixel)
27
+ elsif channels == 1
28
+ output_buffer.buffer[start_index] = ChunkyPNG::Color.r(pixel)
29
+ else
30
+ raise "Invalid channel value #{channels}"
31
+ end
32
+ end
33
+
34
+ write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
35
+ output_buffer.op = write_op
36
+ output_buffer
37
+ end
38
+
39
+ register_op :encode_png do |_context, tensor, inputs|
40
+ image_data = inputs[0]
41
+ height, width, channels = image_data.shape
42
+ image_buffer = image_data.buffer.reshape(*image_data.shape.reverse).to_a
43
+ \
44
+ png = ChunkyPNG::Image.new(width, height)
45
+ image_buffer.each_with_index do |rows, h_index|
46
+ rows.each_with_index do |p_data, w_index|
47
+ if channels == 4
48
+ png[w_index, h_index] = ChunkyPNG::Color.rgba(p_data[0], p_data[1], p_data[2], p_data[3])
49
+ elsif channels == 3
50
+ png[w_index, h_index] = ChunkyPNG::Color.rgb(p_data[0], p_data[1], p_data[2])
51
+ elsif channels == 1
52
+ png[w_index, h_index] = ChunkyPNG::Color.rgb(p_data[0], p_data[0], p_data[0])
53
+ end
54
+ end
55
+ end
56
+ convert_to_opencl(png.to_s, [], data_type: :string, name: tensor.name)
57
+ end
58
+ end
59
+ end
60
+ end
61
+ end
62
+ end
@@ -1,20 +1,18 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  % if TensorStream::Ops::FLOATING_POINT_TYPES.include?(dtype)
3
- __kernel void abs_<%= dtype%>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void abs_<%= dtype%>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0); // flat element index shared by A and C
7
6
 
8
- C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
7
+ C[id] = fabs(A[id]);
9
8
  }
10
9
  % else
11
10
  % %w[int int32].each do |dt|
12
- __kernel void abs_<%= dt %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
11
+ __kernel void abs_<%= dt %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
13
12
  // Get the index of the current element to be processed
14
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
13
+ const int id = get_global_id(0); // flat element index shared by A and C
16
14
 
17
- C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
15
+ C[id] = abs(A[id]); // integer abs: a round-trip through float is inexact above 2^24
18
16
  }
19
17
  % end
20
18
  %end
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void acos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void acos_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0);
6
5
 
7
- C[globalRow * N + globalCol] = acos(A[globalRow * N + globalCol]);
6
+ C[id] = acos(A[id]);
8
7
  }
@@ -1,6 +1,6 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  // same dimension add floating point op
3
- __kernel void apply_adadelta_<%= dtype %>(const int M, const int N,
3
+ __kernel void apply_adadelta_<%= dtype %>(
4
4
  __global const <%= c_dtype %> *lr,
5
5
  __global const <%= c_dtype %> *rho,
6
6
  __global const <%= c_dtype %> *epsilon,
@@ -10,9 +10,7 @@
10
10
  __global <%= c_dtype %> *acc_update
11
11
  ) {
12
12
  // Get the index of the current element to be processed
13
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
14
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
15
- const int index = globalRow * N + globalCol;
13
+ const int index = get_global_id(0);
16
14
 
17
15
  acc[index] = acc[index] * rho[0] + (grad[index] * grad[index]) * ((<%= c_dtype %>)1 - rho[0]);
18
16
  const <%= c_dtype %> update = sqrt(acc_update[index] + epsilon[0]) * rsqrt(acc[index] + epsilon[0]) * grad[index];
@@ -0,0 +1,12 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ // Adagrad-style step: output -= grad * lr * rsqrt(acc), one element per work-item
3
+ __kernel void apply_adagrad_<%= dtype %>(
4
+ __global const <%= c_dtype %> *lr, // only lr[0] is read
5
+ __global const <%= c_dtype %> *grad,
6
+ __global <%= c_dtype %> *output, // updated in place
7
+ __global <%= c_dtype %> *acc // NOTE(review): only read here; confirm the caller accumulates grad^2 into acc
8
+ ) {
9
+ // Flat index of the element handled by this work-item
10
+ const int index = get_global_id(0);
11
+ output[index] -= grad[index] * lr[0] * rsqrt(acc[index]);
12
+ }
@@ -1,6 +1,6 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  // same dimension add floating point op
3
- __kernel void apply_adam_<%= dtype %>(const int M, const int N,
3
+ __kernel void apply_adam_<%= dtype %>(
4
4
  __global const <%= c_dtype %> *grad,
5
5
  __global const <%= c_dtype %> *learning_rate,
6
6
  __global const <%= c_dtype %> *beta1_power,
@@ -11,10 +11,7 @@
11
11
  __global <%= c_dtype %> *momentum,
12
12
  __global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
13
13
  // Get the index of the current element to be processed
14
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
- const int index = globalRow * N + globalCol;
17
-
14
+ const int index = get_global_id(0);
18
15
  <%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
19
16
 
20
17
  momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);
@@ -0,0 +1,19 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ // Centered RMSProp step, one element per work-item; lr, rho, momentum and epsilon are read from index 0
3
+ __kernel void apply_centered_rms_prop_<%= dtype %>(__global const <%= c_dtype %> *lr,
4
+ __global const <%= c_dtype %> *rho,
5
+ __global const <%= c_dtype %> *momentum,
6
+ __global const <%= c_dtype %> *epsilon,
7
+ __global const <%= c_dtype %> *grad,
8
+ __global <%= c_dtype %> *output,
9
+ __global <%= c_dtype %> *ms,
10
+ __global <%= c_dtype %> *mg,
11
+ __global <%= c_dtype %> *mom) {
12
+ // Flat index of the element handled by this work-item
13
+ const int id = get_global_id(0);
14
+ ms[id] += (grad[id] * grad[id] - ms[id]) * (1.0 - rho[0]); // running mean of squared gradients
15
+ mg[id] += (grad[id] - mg[id]) * (1.0 - rho[0]); // running mean of gradients: accumulate, do not overwrite
16
+ <%= c_dtype %> denom = ms[id] - mg[id] * mg[id] + epsilon[0]; // centered second moment, using the refreshed mg
17
+ mom[id] = mom[id] * momentum[0] + (grad[id] * lr[0]) / sqrt(denom);
18
+ output[id] -= mom[id];
19
+ }
@@ -1,9 +1,8 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  // same dimension add floating point op
3
- __kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
3
+ __kernel void apply_gradient_<%= dtype %>(__global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0);
7
6
 
8
- C[globalRow * N + globalCol] -= A[globalRow * N + globalCol] * B[0];
7
+ C[id] -= A[id] * B[0];
9
8
  }
@@ -1,11 +1,9 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  // same dimension add floating point op
3
- __kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
3
+ __kernel void apply_momentum_<%= dtype %>(__global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
4
4
  __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
5
5
  // Get the index of the current element to be processed
6
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
7
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
8
- const int index = globalRow * N + globalCol;
6
+ const int index = get_global_id(0);
9
7
  <%= c_dtype %> acc_m = acc[index];
10
8
  acc[index] = acc_m * momentum[0] + grad[index];
11
9
  <% if nesterov %>
@@ -0,0 +1,16 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ // RMSProp step, one element per work-item; lr, rho, momentum and epsilon are read from index 0
3
+ __kernel void apply_rms_prop_<%= dtype %>(__global const <%= c_dtype %> *lr,
4
+ __global const <%= c_dtype %> *rho,
5
+ __global const <%= c_dtype %> *momentum,
6
+ __global const <%= c_dtype %> *epsilon,
7
+ __global const <%= c_dtype %> *grad,
8
+ __global <%= c_dtype %> *output,
9
+ __global <%= c_dtype %> *ms,
10
+ __global <%= c_dtype %> *mom) {
11
+ // Flat index of the element handled by this work-item
12
+ const int id = get_global_id(0);
13
+ ms[id] += (grad[id] * grad[id] - ms[id]) * (1.0 - rho[0]); // running mean of squared gradients
14
+ mom[id] = mom[id] * momentum[0] + (grad[id] * lr[0]) / sqrt(ms[id] + epsilon[0]); // momentum-smoothed scaled gradient
15
+ output[id] -= mom[id]; // apply the step in place
16
+ }
@@ -1,9 +1,8 @@
1
1
 
2
2
  % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void asin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0);
7
6
 
8
- C[globalRow * N + globalCol] = asin(A[globalRow * N + globalCol]);
7
+ C[id] = asin(A[id]);
9
8
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void ceil_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void ceil_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0);
6
5
 
7
- C[globalRow * N + globalCol] = ceil(A[globalRow * N + globalCol]);
6
+ C[id] = ceil(A[id]);
8
7
  }
@@ -0,0 +1,21 @@
1
+ % ctype = dtype_to_c_type(data_type)
2
+
3
+ __kernel void concat(const int N, const int index, const int step, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
4
+ // Copies one element of the source A into its slot in the concatenated output C
5
+ const int globalCol = get_global_id(0); // flat index into A
6
+ int ptr = globalCol;
7
+
8
+ // Decompose the flat source index into per-dimension coordinates; the concat axis is shifted by step
9
+ <% divisors.each_with_index do |div, index| %>
10
+ <% if axis == index %>
11
+ int index_map_<%= index %> = ptr / <%= div %> + step; // exact integer division (ptr is non-negative)
12
+ <% else %>
13
+ int index_map_<%= index %> = ptr / <%= div %>; // exact integer division (ptr is non-negative)
14
+ <% end %>
15
+ <% if index < divisors.size - 1%>
16
+ ptr = ptr % <%= div %>;
17
+ <% end %>
18
+ <% end %>
19
+
20
+ C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map_#{idx}" }.join(' + ') %>] = A[globalCol];
21
+ }
@@ -1,8 +1,6 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void cos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void cos_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = cos(A[globalRow * N + globalCol]);
4
+ const int id = get_global_id(0);
5
+ C[id] = cos(A[id]);
8
6
  }
@@ -1,8 +1,6 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void exp_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void exp_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = exp(A[globalRow * N + globalCol]);
4
+ const int id = get_global_id(0);
5
+ C[id] = exp(A[id]);
8
6
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void floor_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void floor_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0);
6
5
 
7
- C[globalRow * N + globalCol] = floor(A[globalRow * N + globalCol]);
6
+ C[id] = floor(A[id]);
8
7
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void log_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void log_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0);
6
5
 
7
- C[globalRow * N + globalCol] = log(A[globalRow * N + globalCol]);
6
+ C[id] = log(A[id]);
8
7
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void log1p_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void log1p_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0); // flat element index shared by A and C
6
5
 
7
- C[globalRow * N + globalCol] = log1p(A[globalRow * N + globalCol]);
6
+ C[id] = log1p(A[id]);
8
7
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void negate_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void negate_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0); // flat element index shared by A and C
6
5
 
7
- C[globalRow * N + globalCol] = -A[globalRow * N + globalCol];
6
+ C[id] = -A[id];
8
7
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void reciprocal_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void reciprocal_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0);
6
5
 
7
- C[globalRow * N + globalCol] = 1 / A[globalRow * N + globalCol];
6
+ C[id] = 1 / A[id];
8
7
  }
@@ -1,9 +1,8 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void sigmoid_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0);
7
6
 
8
- C[globalRow * N + globalCol] = 1.0f/(1.0f + exp(-A[globalRow * N + globalCol]));
7
+ C[id] = 1.0f/(1.0f + exp(-A[id]));
9
8
  }
@@ -1,21 +1,20 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void sign_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
- <%= c_dtype %> value = A[globalRow * N + globalCol];
5
+ const int id = get_global_id(0);
6
+ <%= c_dtype %> value = A[id];
8
7
  % if floating_point?(dtype)
9
8
  if (isnan(value) || value == 0.0f) {
10
- C[globalRow * N + globalCol] = 0.0;
9
+ C[id] = 0.0;
11
10
  } else {
12
- C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
11
+ C[id] = value < 0 ? -1.0 : 1.0;
13
12
  }
14
13
  % else
15
14
  if (value == 0) {
16
- C[globalRow * N + globalCol] = 0;
15
+ C[id] = 0;
17
16
  } else {
18
- C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
17
+ C[id] = value < 0 ? -1 : 1;
19
18
  }
20
19
  % end
21
20
  }
@@ -1,9 +1,8 @@
1
1
 
2
2
  % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void sin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0); // flat element index shared by A and C
7
6
 
8
- C[globalRow * N + globalCol] = sin(A[globalRow * N + globalCol]);
7
+ C[id] = sin(A[id]);
9
8
  }
@@ -0,0 +1,17 @@
1
+ % ctype = dtype_to_c_type(data_type)
2
+ % mul_str = mul.each_with_index.map { |m, index| "#{m} * index_map_#{index}" }
3
+ __kernel void split(const int N, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
4
+ // Copies one element of the source A into piece globalCol of the output C
5
+ const int globalCol = get_global_id(0); // which output piece (each piece spans N elements of C)
6
+ const int localCol = get_global_id(1); // flat index inside that piece
7
+ // Decompose the in-piece index into per-dimension coordinates; the split axis is shifted by globalCol * step
8
+ int ptr = localCol;
9
+ <% dest.each_with_index do |div, index| %>
10
+ <% if index == axis %>
11
+ int index_map_<%= index %> = ptr / <%= div %> + globalCol * <%= step %>; // exact integer division (ptr is non-negative)
12
+ <% else %>
13
+ int index_map_<%= index %> = ptr / <%= div %>; // exact integer division (ptr is non-negative)
14
+ <% end %>
15
+ <% if index < dest.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
16
+ C[N*globalCol + localCol] = A[<%= mul_str.join(" + ") %>];
17
+ }