tensor_stream-opencl 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +11 -4
  3. data/benchmark/benchmark.rb +91 -0
  4. data/benchmark_intel.txt +36 -0
  5. data/lib/tensor_stream/opencl/array_ops.rb +395 -0
  6. data/lib/tensor_stream/opencl/images_ops.rb +62 -0
  7. data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
  8. data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
  9. data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
  10. data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
  11. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
  12. data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
  13. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
  14. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
  15. data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
  16. data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
  17. data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
  18. data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
  19. data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
  20. data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
  21. data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
  22. data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
  23. data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
  24. data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
  25. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
  26. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
  27. data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
  28. data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
  29. data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
  30. data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
  31. data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
  32. data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
  33. data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
  34. data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
  35. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
  36. data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
  37. data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
  38. data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
  39. data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
  40. data/lib/tensor_stream/opencl/version.rb +1 -1
  41. data/samples/iris.data +150 -0
  42. data/samples/iris.rb +110 -0
  43. data/samples/mnist_data.rb +65 -0
  44. data/samples/multigpu.rb +73 -0
  45. data/samples/nearest_neighbor.rb +56 -0
  46. data/samples/rnn.rb +108 -0
  47. data/tensor_stream-opencl.gemspec +4 -1
  48. metadata +62 -3
@@ -0,0 +1,62 @@
1
+ # require 'oily_png'
2
+ module TensorStream
3
+ module OpenCLHelpers
4
+ module ImagesOps
5
+ def ImagesOps.included(klass)
6
+ klass.class_eval do
7
+ register_op :decode_png do |context, tensor, inputs|
8
+ content = _run(inputs[0], context)
9
+ channels = tensor.options[:channels]
10
+ channels = 4 if channels.zero?
11
+
12
+ image = ChunkyPNG::Image.from_blob(content.buffer.to_a.pack('C*'))
13
+ output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
14
+
15
+ image.grayscale! if channels == 1
16
+ image.pixels.each_with_index do |pixel, index|
17
+ start_index = index * channels
18
+ if channels == 4
19
+ output_buffer.buffer[start_index] = ChunkyPNG::Color.r(pixel)
20
+ output_buffer.buffer[start_index + 1] = ChunkyPNG::Color.g(pixel)
21
+ output_buffer.buffer[start_index + 2] = ChunkyPNG::Color.b(pixel)
22
+ output_buffer.buffer[start_index + 3] = ChunkyPNG::Color.a(pixel)
23
+ elsif channels == 3
24
+ output_buffer.buffer[start_index] = ChunkyPNG::Color.r(pixel)
25
+ output_buffer.buffer[start_index + 1] = ChunkyPNG::Color.g(pixel)
26
+ output_buffer.buffer[start_index + 2] = ChunkyPNG::Color.b(pixel)
27
+ elsif channels == 1
28
+ output_buffer.buffer[start_index] = ChunkyPNG::Color.r(pixel)
29
+ else
30
+ raise "Invalid channel value #{channels}"
31
+ end
32
+ end
33
+
34
+ write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
35
+ output_buffer.op = write_op
36
+ output_buffer
37
+ end
38
+
39
+ register_op :encode_png do |_context, tensor, inputs|
40
+ image_data = inputs[0]
41
+ height, width, channels = image_data.shape
42
+ image_buffer = image_data.buffer.reshape(*image_data.shape.reverse).to_a
43
+ \
44
+ png = ChunkyPNG::Image.new(width, height)
45
+ image_buffer.each_with_index do |rows, h_index|
46
+ rows.each_with_index do |p_data, w_index|
47
+ if channels == 4
48
+ png[w_index, h_index] = ChunkyPNG::Color.rgba(p_data[0], p_data[1], p_data[2], p_data[3])
49
+ elsif channels == 3
50
+ png[w_index, h_index] = ChunkyPNG::Color.rgb(p_data[0], p_data[1], p_data[2])
51
+ elsif channels == 1
52
+ png[w_index, h_index] = ChunkyPNG::Color.rgb(p_data[0], p_data[0], p_data[0])
53
+ end
54
+ end
55
+ end
56
+ convert_to_opencl(png.to_s, [], data_type: :string, name: tensor.name)
57
+ end
58
+ end
59
+ end
60
+ end
61
+ end
62
+ end
@@ -1,20 +1,18 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  % if TensorStream::Ops::FLOATING_POINT_TYPES.include?(dtype)
3
- __kernel void abs_<%= dtype%>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void abs_<%= dtype%>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0); // Row ID of C (0..M)
7
6
 
8
- C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
7
+ C[id] = fabs(A[id]);
9
8
  }
10
9
  % else
11
10
  % %w[int int32].each do |dt|
12
- __kernel void abs_<%= dt %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
11
+ __kernel void abs_<%= dt %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
13
12
  // Get the index of the current element to be processed
14
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
13
+ const int id = get_global_id(0); // Row ID of C (0..M)
16
14
 
17
- C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
15
+ C[id] = fabs((float)A[id]);
18
16
  }
19
17
  % end
20
18
  %end
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void acos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void acos_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0);
6
5
 
7
- C[globalRow * N + globalCol] = acos(A[globalRow * N + globalCol]);
6
+ C[id] = acos(A[id]);
8
7
  }
@@ -1,6 +1,6 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  // same dimension add floating point op
3
- __kernel void apply_adadelta_<%= dtype %>(const int M, const int N,
3
+ __kernel void apply_adadelta_<%= dtype %>(
4
4
  __global const <%= c_dtype %> *lr,
5
5
  __global const <%= c_dtype %> *rho,
6
6
  __global const <%= c_dtype %> *epsilon,
@@ -10,9 +10,7 @@
10
10
  __global <%= c_dtype %> *acc_update
11
11
  ) {
12
12
  // Get the index of the current element to be processed
13
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
14
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
15
- const int index = globalRow * N + globalCol;
13
+ const int index = get_global_id(0);
16
14
 
17
15
  acc[index] = acc[index] * rho[0] + (grad[index] * grad[index]) * ((<%= c_dtype %>)1 - rho[0]);
18
16
  const <%= c_dtype %> update = sqrt(acc_update[index] + epsilon[0]) * rsqrt(acc[index] + epsilon[0]) * grad[index];
@@ -0,0 +1,12 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ // same dimension add floating point op
3
+ __kernel void apply_adagrad_<%= dtype %>(
4
+ __global const <%= c_dtype %> *lr,
5
+ __global const <%= c_dtype %> *grad,
6
+ __global <%= c_dtype %> *output,
7
+ __global <%= c_dtype %> *acc
8
+ ) {
9
+ // Get the index of the current element to be processed
10
+ const int index = get_global_id(0);
11
+ output[index] -= grad[index] * lr[0] * rsqrt(acc[index]);
12
+ }
@@ -1,6 +1,6 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  // same dimension add floating point op
3
- __kernel void apply_adam_<%= dtype %>(const int M, const int N,
3
+ __kernel void apply_adam_<%= dtype %>(
4
4
  __global const <%= c_dtype %> *grad,
5
5
  __global const <%= c_dtype %> *learning_rate,
6
6
  __global const <%= c_dtype %> *beta1_power,
@@ -11,10 +11,7 @@
11
11
  __global <%= c_dtype %> *momentum,
12
12
  __global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
13
13
  // Get the index of the current element to be processed
14
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
- const int index = globalRow * N + globalCol;
17
-
14
+ const int index = get_global_id(0);
18
15
  <%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
19
16
 
20
17
  momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);
@@ -0,0 +1,19 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ // same dimension add floating point op
3
+ __kernel void apply_centered_rms_prop_<%= dtype %>(__global const <%= c_dtype %> *lr,
4
+ __global const <%= c_dtype %> *rho,
5
+ __global const <%= c_dtype %> *momentum,
6
+ __global const <%= c_dtype %> *epsilon,
7
+ __global const <%= c_dtype %> *grad,
8
+ __global <%= c_dtype %> *output,
9
+ __global <%= c_dtype %> *ms,
10
+ __global <%= c_dtype %> *mg,
11
+ __global <%= c_dtype %> *mom) {
12
+ // Get the index of the current element to be processed
13
+ const int id = get_global_id(0);
14
+ ms[id] += (grad[id] * grad[id] - ms[id]) * (1.0 - rho[0]);
15
+ <%= c_dtype %> denom = ms[id] - mg[id] * mg[id] + epsilon[0];
16
+ mg[id] = (grad[id] - mg[id]) * (1.0 - rho[0]);
17
+ mom[id] = mom[id] * momentum[0] + (grad[id] * lr[0]) / sqrt(denom);
18
+ output[id] -= mom[id];
19
+ }
@@ -1,9 +1,8 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  // same dimension add floating point op
3
- __kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
3
+ __kernel void apply_gradient_<%= dtype %>(__global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0);
7
6
 
8
- C[globalRow * N + globalCol] -= A[globalRow * N + globalCol] * B[0];
7
+ C[id] -= A[id] * B[0];
9
8
  }
@@ -1,11 +1,9 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  // same dimension add floating point op
3
- __kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
3
+ __kernel void apply_momentum_<%= dtype %>(__global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
4
4
  __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
5
5
  // Get the index of the current element to be processed
6
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
7
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
8
- const int index = globalRow * N + globalCol;
6
+ const int index = get_global_id(0);
9
7
  <%= c_dtype %> acc_m = acc[index];
10
8
  acc[index] = acc_m * momentum[0] + grad[index];
11
9
  <% if nesterov %>
@@ -0,0 +1,16 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ // same dimension add floating point op
3
+ __kernel void apply_rms_prop_<%= dtype %>(__global const <%= c_dtype %> *lr,
4
+ __global const <%= c_dtype %> *rho,
5
+ __global const <%= c_dtype %> *momentum,
6
+ __global const <%= c_dtype %> *epsilon,
7
+ __global const <%= c_dtype %> *grad,
8
+ __global <%= c_dtype %> *output,
9
+ __global <%= c_dtype %> *ms,
10
+ __global <%= c_dtype %> *mom) {
11
+ // Get the index of the current element to be processed
12
+ const int id = get_global_id(0);
13
+ ms[id] += (grad[id] * grad[id] - ms[id]) * (1.0 - rho[0]);
14
+ mom[id] = mom[id] * momentum[0] + (grad[id] * lr[0]) / sqrt(ms[id] + epsilon[0]);
15
+ output[id] -= mom[id];
16
+ }
@@ -1,9 +1,8 @@
1
1
 
2
2
  % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void asin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0);
7
6
 
8
- C[globalRow * N + globalCol] = asin(A[globalRow * N + globalCol]);
7
+ C[id] = asin(A[id]);
9
8
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void ceil_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void ceil_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0);
6
5
 
7
- C[globalRow * N + globalCol] = ceil(A[globalRow * N + globalCol]);
6
+ C[id] = ceil(A[id]);
8
7
  }
@@ -0,0 +1,21 @@
1
+ % ctype = dtype_to_c_type(data_type)
2
+
3
+ __kernel void concat(const int N, const int index, const int step, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
4
+ // Get the index of the current element to be processed
5
+ const int globalCol = get_global_id(0); // Col ID of C (0..N)
6
+ int ptr = globalCol;
7
+
8
+ // compute effective coordinates
9
+ <% divisors.each_with_index do |div, index| %>
10
+ <% if axis == index %>
11
+ int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>) + step;
12
+ <% else %>
13
+ int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>);
14
+ <% end %>
15
+ <% if index < divisors.size - 1%>
16
+ ptr = ptr % <%= div %>;
17
+ <% end %>
18
+ <% end %>
19
+
20
+ C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map_#{idx}" }.join(' + ') %>] = A[globalCol];
21
+ }
@@ -1,8 +1,6 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void cos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void cos_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = cos(A[globalRow * N + globalCol]);
4
+ const int id = get_global_id(0);
5
+ C[id] = cos(A[id]);
8
6
  }
@@ -1,8 +1,6 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void exp_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void exp_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = exp(A[globalRow * N + globalCol]);
4
+ const int id = get_global_id(0);
5
+ C[id] = exp(A[id]);
8
6
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void floor_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void floor_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0);
6
5
 
7
- C[globalRow * N + globalCol] = floor(A[globalRow * N + globalCol]);
6
+ C[id] = floor(A[id]);
8
7
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void log_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void log_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0);
6
5
 
7
- C[globalRow * N + globalCol] = log(A[globalRow * N + globalCol]);
6
+ C[id] = log(A[id]);
8
7
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void log1p_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void log1p_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0); // Row ID of C (0..M)
6
5
 
7
- C[globalRow * N + globalCol] = log1p(A[globalRow * N + globalCol]);
6
+ C[id] = log1p(A[id]);
8
7
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void negate_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void negate_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0); // Row ID of C (0..M)
6
5
 
7
- C[globalRow * N + globalCol] = -A[globalRow * N + globalCol];
6
+ C[id] = -A[id];
8
7
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void reciprocal_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void reciprocal_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0);
6
5
 
7
- C[globalRow * N + globalCol] = 1 / A[globalRow * N + globalCol];
6
+ C[id] = 1 / A[id];
8
7
  }
@@ -1,9 +1,8 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void sigmoid_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0);
7
6
 
8
- C[globalRow * N + globalCol] = 1.0f/(1.0f + exp(-A[globalRow * N + globalCol]));
7
+ C[id] = 1.0f/(1.0f + exp(-A[id]));
9
8
  }
@@ -1,21 +1,20 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void sign_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
- <%= c_dtype %> value = A[globalRow * N + globalCol];
5
+ const int id = get_global_id(0);
6
+ <%= c_dtype %> value = A[id];
8
7
  % if floating_point?(dtype)
9
8
  if (isnan(value) || value == 0.0f) {
10
- C[globalRow * N + globalCol] = 0.0;
9
+ C[id] = 0.0;
11
10
  } else {
12
- C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
11
+ C[id] = value < 0 ? -1.0 : 1.0;
13
12
  }
14
13
  % else
15
14
  if (value == 0) {
16
- C[globalRow * N + globalCol] = 0;
15
+ C[id] = 0;
17
16
  } else {
18
- C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
17
+ C[id] = value < 0 ? -1 : 1;
19
18
  }
20
19
  % end
21
20
  }
@@ -1,9 +1,8 @@
1
1
 
2
2
  % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void sin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0); // Row ID of C (0..M)
7
6
 
8
- C[globalRow * N + globalCol] = sin(A[globalRow * N + globalCol]);
7
+ C[id] = sin(A[id]);
9
8
  }
@@ -0,0 +1,17 @@
1
+ % ctype = dtype_to_c_type(data_type)
2
+ % mul_str = mul.each_with_index.collect { |mul, index| "#{mul} * index_map_#{index}" }
3
+ __kernel void split(const int N, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
4
+ // Get the index of the current element to be processed
5
+ const int globalCol = get_global_id(0); // Col ID of C (0..N)
6
+ const int localCol = get_global_id(1);
7
+ // compute effective coordinates
8
+ int ptr = localCol;
9
+ <% dest.each_with_index do |div, index| %>
10
+ <% if index == axis %>
11
+ int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>) + globalCol * <%= step %>;
12
+ <% else %>
13
+ int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>);
14
+ <% end %>
15
+ <% if index < dest.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
16
+ C[N*globalCol + localCol] = A[<%= mul_str.join(" + ") %>];
17
+ }