tensor_stream-opencl 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72)
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +5 -0
  5. data/CODE_OF_CONDUCT.md +74 -0
  6. data/Gemfile +6 -0
  7. data/Gemfile.lock +51 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +58 -0
  10. data/Rakefile +6 -0
  11. data/bin/console +14 -0
  12. data/bin/setup +8 -0
  13. data/lib/tensor_stream/opencl.rb +7 -0
  14. data/lib/tensor_stream/opencl/kernels/_bool_operand.cl +45 -0
  15. data/lib/tensor_stream/opencl/kernels/_operand.cl +45 -0
  16. data/lib/tensor_stream/opencl/kernels/abs.cl +20 -0
  17. data/lib/tensor_stream/opencl/kernels/acos.cl +8 -0
  18. data/lib/tensor_stream/opencl/kernels/add.cl +3 -0
  19. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +23 -0
  20. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +9 -0
  21. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +16 -0
  22. data/lib/tensor_stream/opencl/kernels/argmax.cl +8 -0
  23. data/lib/tensor_stream/opencl/kernels/argmin.cl +8 -0
  24. data/lib/tensor_stream/opencl/kernels/asin.cl +9 -0
  25. data/lib/tensor_stream/opencl/kernels/cast.cl +10 -0
  26. data/lib/tensor_stream/opencl/kernels/ceil.cl +8 -0
  27. data/lib/tensor_stream/opencl/kernels/cond.cl.erb +6 -0
  28. data/lib/tensor_stream/opencl/kernels/cos.cl +8 -0
  29. data/lib/tensor_stream/opencl/kernels/div.cl.erb +3 -0
  30. data/lib/tensor_stream/opencl/kernels/exp.cl +8 -0
  31. data/lib/tensor_stream/opencl/kernels/floor.cl +8 -0
  32. data/lib/tensor_stream/opencl/kernels/floor_div.cl +48 -0
  33. data/lib/tensor_stream/opencl/kernels/floor_mod.cl +3 -0
  34. data/lib/tensor_stream/opencl/kernels/gemm.cl +32 -0
  35. data/lib/tensor_stream/opencl/kernels/log.cl +8 -0
  36. data/lib/tensor_stream/opencl/kernels/log1p.cl +8 -0
  37. data/lib/tensor_stream/opencl/kernels/log_softmax.cl +26 -0
  38. data/lib/tensor_stream/opencl/kernels/max.cl +46 -0
  39. data/lib/tensor_stream/opencl/kernels/min.cl +46 -0
  40. data/lib/tensor_stream/opencl/kernels/mod.cl +3 -0
  41. data/lib/tensor_stream/opencl/kernels/mul.cl +3 -0
  42. data/lib/tensor_stream/opencl/kernels/negate.cl +8 -0
  43. data/lib/tensor_stream/opencl/kernels/pack.cl +24 -0
  44. data/lib/tensor_stream/opencl/kernels/pow.cl +46 -0
  45. data/lib/tensor_stream/opencl/kernels/real_div.cl +3 -0
  46. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +8 -0
  47. data/lib/tensor_stream/opencl/kernels/round.cl +8 -0
  48. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +9 -0
  49. data/lib/tensor_stream/opencl/kernels/sigmoid_grad.cl +55 -0
  50. data/lib/tensor_stream/opencl/kernels/sign.cl +21 -0
  51. data/lib/tensor_stream/opencl/kernels/sin.cl +9 -0
  52. data/lib/tensor_stream/opencl/kernels/softmax.cl +26 -0
  53. data/lib/tensor_stream/opencl/kernels/softmax_cross.cl +32 -0
  54. data/lib/tensor_stream/opencl/kernels/softmax_cross_grad.cl +28 -0
  55. data/lib/tensor_stream/opencl/kernels/softmax_grad.cl +46 -0
  56. data/lib/tensor_stream/opencl/kernels/sqrt.cl +9 -0
  57. data/lib/tensor_stream/opencl/kernels/square.cl +9 -0
  58. data/lib/tensor_stream/opencl/kernels/squared_difference.cl +53 -0
  59. data/lib/tensor_stream/opencl/kernels/sub.cl +3 -0
  60. data/lib/tensor_stream/opencl/kernels/tan.cl +8 -0
  61. data/lib/tensor_stream/opencl/kernels/tanh.cl +8 -0
  62. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +7 -0
  63. data/lib/tensor_stream/opencl/kernels/where.cl +8 -0
  64. data/lib/tensor_stream/opencl/math_ops.rb +133 -0
  65. data/lib/tensor_stream/opencl/nn_ops.rb +191 -0
  66. data/lib/tensor_stream/opencl/opencl_buffer.rb +35 -0
  67. data/lib/tensor_stream/opencl/opencl_device.rb +5 -0
  68. data/lib/tensor_stream/opencl/opencl_evaluator.rb +933 -0
  69. data/lib/tensor_stream/opencl/opencl_template_helper.rb +99 -0
  70. data/lib/tensor_stream/opencl/version.rb +5 -0
  71. data/tensor_stream-opencl.gemspec +40 -0
  72. metadata +185 -0
@@ -0,0 +1,9 @@
% c_dtype = dtype_to_c_type(dtype)
// In-place scaled subtraction: C[i] -= A[i] * B[0].
//   M, N : extents supplied by the host; only N is used here, as the row stride
//   A    : per-element values that are scaled and subtracted (read-only)
//   B    : buffer whose first element is the scale factor
//          (presumably the learning rate - confirm against the host code)
//   C    : buffer updated in place
__kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    // Each work-item handles the element at (get_global_id(0), get_global_id(1)).
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] -= A[offset] * B[0];
}
@@ -0,0 +1,16 @@
% c_dtype = dtype_to_c_type(dtype)
// Momentum update for the element at (get_global_id(0), get_global_id(1)) of buffers
// indexed with row stride N:
//   acc[i]    = acc[i] * momentum[0] + grad[i]
//   output[i] is then decreased by a step that depends on the `nesterov` template flag.
// learning_rate and momentum are buffers whose first element holds the scalar value.
// NOTE(review): both branches step with the accumulator value read BEFORE it is
// overwritten; confirm that using the pre-update value is intended.
__kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
                                          __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    // Keep the old accumulator around: it is needed after acc[offset] is rewritten.
    const <%= c_dtype %> previous = acc[offset];
    acc[offset] = previous * momentum[0] + grad[offset];
<% if nesterov %>
    output[offset] -= grad[offset] * learning_rate[0] + previous * momentum[0] * learning_rate[0];
<% else %>
    output[offset] -= previous * learning_rate[0];
<% end %>
}
@@ -0,0 +1,8 @@
% c_dtype = dtype_to_c_type(dtype)
// NOTE(review): despite its name this kernel writes the elementwise sum A[i] + B[i]
// into C and never reads switch_op. It looks like a placeholder rather than an
// arg-max reduction - confirm whether the host code ever dispatches it.
__kernel void argmax_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    // Element handled by this work-item, addressed with row stride N.
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] = A[offset] + B[offset];
}
@@ -0,0 +1,8 @@
% c_dtype = dtype_to_c_type(dtype)
// NOTE(review): despite its name this kernel writes the elementwise sum A[i] + B[i]
// into C and never reads switch_op. It looks like a placeholder rather than an
// arg-min reduction - confirm whether the host code ever dispatches it.
__kernel void argmin_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    // Element handled by this work-item, addressed with row stride N.
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] = A[offset] + B[offset];
}
@@ -0,0 +1,9 @@

% c_dtype = dtype_to_c_type(dtype)
// Elementwise arc sine: C[i] = asin(A[i]).
// M and N are supplied by the host; only N is used, as the row stride.
__kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] = asin(A[offset]);
}
@@ -0,0 +1,10 @@
% source_ctype = dtype_to_c_type(source_dt)
% target_ctype = dtype_to_c_type(target_dt)

// Elementwise type conversion: every element of A is converted to the target type
// by plain assignment and stored at the same position in C.
// M and N are supplied by the host; only N is used, as the row stride.
__kernel void cast(const int M, const int N, __global const <%= source_ctype %> *A, __global <%= target_ctype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    // The assignment performs the implicit source -> target conversion.
    const <%= target_ctype %> converted = A[offset];
    C[offset] = converted;
}
@@ -0,0 +1,8 @@
% c_dtype = dtype_to_c_type(dtype)
// Elementwise ceiling: C[i] = ceil(A[i]).
// M and N are supplied by the host; only N is used, as the row stride.
__kernel void ceil_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] = ceil(A[offset]);
}
@@ -0,0 +1,6 @@
% # Renders the shared boolean-operand template once for each comparison / logical
% # operator in the list. `a` and `b` are read from the surrounding template scope.
% # The first element of each pair is not used inside the loop, hence `_dtype`.
% ["#{dtype}"].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |_dtype, fname|
%   a_dtype = dtype_to_c_type(a)
%   b_dtype = dtype_to_c_type(b)
%   op = operator_to_c(fname)
<%= render 'bool_operand.cl', a_dtype: a_dtype, b_dtype: b_dtype, op: op, fname: fname, dtype: "#{a}_#{b}", result_t: 'short' %>
% end
@@ -0,0 +1,8 @@
% c_dtype = dtype_to_c_type(dtype)
// Elementwise cosine: C[i] = cos(A[i]).
// M and N are supplied by the host; only N is used, as the row stride.
__kernel void cos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] = cos(A[offset]);
}
@@ -0,0 +1,3 @@
% # Division kernels are produced by the shared binary-operand template; the result
% # type is the same C type as the operands.
% c_dtype = dtype_to_c_type(dtype)
<%= render 'operand.cl', c_dtype: c_dtype, op: operator_to_c('div'), fname: 'div', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -0,0 +1,8 @@
% c_dtype = dtype_to_c_type(dtype)
// Elementwise exponential: C[i] = exp(A[i]).
// M and N are supplied by the host; only N is used, as the row stride.
__kernel void exp_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] = exp(A[offset]);
}
@@ -0,0 +1,8 @@
% c_dtype = dtype_to_c_type(dtype)
// Elementwise floor: C[i] = floor(A[i]).
// M and N are supplied by the host; only N is used, as the row stride.
__kernel void floor_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] = floor(A[offset]);
}
@@ -0,0 +1,48 @@
% c_dtype = dtype_to_c_type(dtype)
% fname = 'floor_div'
% result_t = c_dtype
// Floor division kernels: C = floor(lhs / rhs).
//
// The quotient is first truncated toward zero with an (int) cast, exactly as before,
// and then lowered by one when the remainder is non-zero and its sign differs from
// the divisor's sign. That correction turns truncation into a true floor for
// operands of opposite sign (e.g. -7 / 2 gives -4 instead of -3) and works for both
// integer and floating point element types without calling floor().
// As before, a quotient outside the range of int is not representable by the cast.

// Same-shape variant: A, B and C share the same layout with row stride N.
__kernel void <%= fname %>_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= result_t %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    const <%= c_dtype %> lhs = A[offset];
    const <%= c_dtype %> rhs = B[offset];

    <%= result_t %> quotient = (int)(lhs / rhs);
    const <%= c_dtype %> remainder = lhs - quotient * rhs;
    if (remainder != 0 && ((remainder < 0) != (rhs < 0))) {
        quotient -= 1;
    }
    C[offset] = quotient;
}

// Scalar variant: B[0] is combined with every element of A.
// switch_op == 0 computes A / B[0]; any other value computes B[0] / A.
__kernel void <%= fname %>_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= result_t %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    <%= c_dtype %> lhs = A[offset];
    <%= c_dtype %> rhs = B[0];
    if (switch_op != 0) {
        lhs = B[0];
        rhs = A[offset];
    }

    <%= result_t %> quotient = (int)(lhs / rhs);
    const <%= c_dtype %> remainder = lhs - quotient * rhs;
    if (remainder != 0 && ((remainder < 0) != (rhs < 0))) {
        quotient -= 1;
    }
    C[offset] = quotient;
}

// Broadcast variant: B has extents M2 x N2 and its indices wrap around (modulo) when
// the work-item's row or column reaches those extents.
// switch_op == 0 computes A / B; any other value computes B / A.
__kernel void <%= fname %>_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= result_t %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    const int b_row = (row >= M2) ? (row % M2) : row;
    const int b_col = (col >= N2) ? (col % N2) : col;
    const int b_offset = b_row * N2 + b_col;

    <%= c_dtype %> lhs = A[offset];
    <%= c_dtype %> rhs = B[b_offset];
    if (switch_op != 0) {
        lhs = B[b_offset];
        rhs = A[offset];
    }

    <%= result_t %> quotient = (int)(lhs / rhs);
    const <%= c_dtype %> remainder = lhs - quotient * rhs;
    if (remainder != 0 && ((remainder < 0) != (rhs < 0))) {
        quotient -= 1;
    }
    C[offset] = quotient;
}
@@ -0,0 +1,3 @@
% # floor_mod kernels are produced by the shared binary-operand template.
% # NOTE(review): the operator is looked up under 'mod', the same key the plain mod
% # template uses, so both generate the same arithmetic - confirm this is intended.
% c_dtype = dtype_to_c_type(dtype)
<%= render 'operand.cl', c_dtype: c_dtype, op: operator_to_c('mod'), fname: 'floor_mod', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -0,0 +1,32 @@
// Naive matrix multiply: every work-item produces one element of C as a
// K-term dot product.
//   M, N, K     : C is addressed as M x N with row stride N; K is the shared dimension
//   A_transpose : non-zero -> A is read as K x M (row stride M), otherwise M x K
//   B_transpose : non-zero -> B is read as N x K (row stride K), otherwise K x N
% c_dtype = dtype_to_c_type(dtype)
__kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
                                const int A_transpose,
                                const int B_transpose,
                                const __global <%= c_dtype %>* A,
                                const __global <%= c_dtype %>* B,
                                __global <%= c_dtype %>* C) {

    // Output element owned by this work-item.
    const int row = get_global_id(0);
    const int col = get_global_id(1);

    <%= c_dtype %> sum = 0.0f;
    for (int k = 0; k < K; k++) {
        // Pick the addressing that matches each operand's storage orientation.
        const int a_index = A_transpose ? (M * k + row) : (row * K + k);
        const int b_index = B_transpose ? (col * K + k) : (k * N + col);

        sum += A[a_index] * B[b_index];
    }

    C[row * N + col] = sum;
}
@@ -0,0 +1,8 @@
% c_dtype = dtype_to_c_type(dtype)
// Elementwise natural logarithm: C[i] = log(A[i]).
// M and N are supplied by the host; only N is used, as the row stride.
__kernel void log_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] = log(A[offset]);
}
@@ -0,0 +1,8 @@
% c_dtype = dtype_to_c_type(dtype)
// Elementwise log(1 + x): C[i] = log1p(A[i]).
// M and N are supplied by the host; only N is used, as the row stride.
__kernel void log1p_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] = log1p(A[offset]);
}
@@ -0,0 +1,26 @@
// Row-wise log-softmax: each work-item processes one complete row of N values.
// For a row x the result is (x[k] - max(x)) - log(sum_j exp(x[j] - max(x))).
% c_dtype = dtype_to_c_type(dtype)
__kernel void log_softmax_<%= dtype %>(const int N,
                                       const __global <%= c_dtype %>* A,
                                       __global <%= c_dtype %>* C) {

    // Row owned by this work-item and the offset of its first element.
    const int row = get_global_id(0);
    const int base = row * N;

    // Pass 1: largest value in the row, starting from the type's minimum.
    <%= c_dtype %> row_max = <%= min_value_for(dtype) %>;
    for (int k = 0; k < N; k++) {
        const <%= c_dtype %> value = A[base + k];
        if (value > row_max) {
            row_max = value;
        }
    }

    // Pass 2: sum of exponentials of the values shifted by the row maximum.
    <%= c_dtype %> exp_sum = 0.0f;
    for (int k = 0; k < N; k++) {
        exp_sum += exp(A[base + k] - row_max);
    }

    // Pass 3: write the shifted values minus the log of that sum.
    const <%= c_dtype %> log_sum = log(exp_sum);
    for (int k = 0; k < N; k++) {
        C[base + k] = (A[base + k] - row_max) - log_sum;
    }
}
@@ -0,0 +1,46 @@
// Elementwise maximum kernels. Ties resolve to the first operand of the comparison.
% c_dtype = dtype_to_c_type(dtype)

// Same-shape variant: C[i] = A[i] >= B[i] ? A[i] : B[i]. switch_op is not used.
__kernel void max_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    const <%= c_dtype %> lhs = A[offset];
    const <%= c_dtype %> rhs = B[offset];
    C[offset] = (lhs >= rhs) ? lhs : rhs;
}

// Scalar variant: every element of A is compared against B[0].
// switch_op selects which operand appears on the left of the comparison.
__kernel void max_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    const <%= c_dtype %> value = A[offset];
    const <%= c_dtype %> scalar = B[0];
    if (switch_op == 0) {
        C[offset] = (value >= scalar) ? value : scalar;
    } else {
        C[offset] = (scalar >= value) ? scalar : value;
    }
}

// Broadcast variant: B has extents M2 x N2 and its indices wrap around (modulo) when
// the work-item's row or column reaches those extents.
__kernel void max_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    const int b_row = (row >= M2) ? (row % M2) : row;
    const int b_col = (col >= N2) ? (col % N2) : col;

    const <%= c_dtype %> value = A[offset];
    const <%= c_dtype %> other = B[b_row * N2 + b_col];
    if (switch_op == 0) {
        C[offset] = (value >= other) ? value : other;
    } else {
        C[offset] = (other >= value) ? other : value;
    }
}
@@ -0,0 +1,46 @@
// Elementwise minimum kernels. Ties resolve to the first operand of the comparison.
% c_dtype = dtype_to_c_type(dtype)

// Same-shape variant: C[i] = A[i] <= B[i] ? A[i] : B[i]. switch_op is not used.
__kernel void min_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    const <%= c_dtype %> lhs = A[offset];
    const <%= c_dtype %> rhs = B[offset];
    C[offset] = (lhs <= rhs) ? lhs : rhs;
}

// Scalar variant: every element of A is compared against B[0].
// switch_op selects which operand appears on the left of the comparison.
__kernel void min_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    const <%= c_dtype %> value = A[offset];
    const <%= c_dtype %> scalar = B[0];
    if (switch_op == 0) {
        C[offset] = (value <= scalar) ? value : scalar;
    } else {
        C[offset] = (scalar <= value) ? scalar : value;
    }
}

// Broadcast variant: B has extents M2 x N2 and its indices wrap around (modulo) when
// the work-item's row or column reaches those extents.
__kernel void min_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    const int b_row = (row >= M2) ? (row % M2) : row;
    const int b_col = (col >= N2) ? (col % N2) : col;

    const <%= c_dtype %> value = A[offset];
    const <%= c_dtype %> other = B[b_row * N2 + b_col];
    if (switch_op == 0) {
        C[offset] = (value <= other) ? value : other;
    } else {
        C[offset] = (other <= value) ? other : value;
    }
}
@@ -0,0 +1,3 @@
% # Modulo kernels are produced by the shared binary-operand template; the result
% # type is the same C type as the operands.
% c_dtype = dtype_to_c_type(dtype)
<%= render 'operand.cl', c_dtype: c_dtype, op: operator_to_c('mod'), fname: 'mod', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -0,0 +1,3 @@
% # Multiplication kernels are produced by the shared binary-operand template; the
% # result type is the same C type as the operands.
% c_dtype = dtype_to_c_type(dtype)
<%= render 'operand.cl', c_dtype: c_dtype, op: operator_to_c('mul'), fname: 'mul', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -0,0 +1,8 @@
% c_dtype = dtype_to_c_type(dtype)
// Elementwise negation: C[i] = -A[i].
// M and N are supplied by the host; only N is used, as the row stride.
__kernel void negate_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] = -A[offset];
}
@@ -0,0 +1,24 @@
% ctype = dtype_to_c_type(data_type)

// Copies element get_global_id(0) of A into its packed position in C.
//
// The linear position index * divisors[0] + get_global_id(0) is split into one
// coordinate per entry of `divisors`. When `axis` is greater than zero the leading
// coordinate is rotated to position `axis`. The coordinates are then recombined
// with `multipliers` to form the destination offset in C.
//   N     : supplied by the host; not read by this kernel
//   index : slot of the tensor being packed (assumed non-negative - confirm with caller)
//
// The coordinate split uses integer division. The previous floor(ptr / (float)div)
// form went through single precision, which cannot represent every int exactly,
// so large offsets could be assigned the wrong coordinate.
__kernel void pack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
    const int globalCol = get_global_id(0);

    int start = index * <%= divisors[0] %>;
    int ptr = start + globalCol;
    int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };

    // Compute effective coordinates; the running remainder feeds the next divisor.
<% divisors.each_with_index do |div, i| %>
    index_map[<%= i %>] = ptr / <%= div %>;<% if i < divisors.size - 1 %>
    ptr = ptr % <%= div %>;<% end %>
<% end %>

    // Apply axis translation if needed: shift coordinates 1..axis down by one and
    // place the original leading coordinate at position `axis`.
<% if axis > 0 %>
    int first = index_map[0];
<% axis.times do |i| %>
    index_map[<%= i %>] = index_map[<%= i + 1 %>];
<% end %>
    index_map[<%= axis %>] = first;
<% end %>

    C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
}
@@ -0,0 +1,46 @@
// Elementwise power kernels. Both operands are converted to float before pow() is
// called, and the result is stored back in the element type.
% c_dtype = dtype_to_c_type(dtype)

// Same-shape variant: C[i] = pow(A[i], B[i]). switch_op is not used.
__kernel void pow_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    const float base = (float)A[offset];
    const float exponent = (float)B[offset];
    C[offset] = pow(base, exponent);
}

// Scalar variant: B[0] is combined with every element of A.
// switch_op == 0 computes pow(A, B[0]); any other value computes pow(B[0], A).
__kernel void pow_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    const float value = (float)A[offset];
    const float scalar = (float)B[0];
    if (switch_op == 0) {
        C[offset] = pow(value, scalar);
    } else {
        C[offset] = pow(scalar, value);
    }
}

// Broadcast variant: B has extents M2 x N2 and its indices wrap around (modulo) when
// the work-item's row or column reaches those extents.
// switch_op == 0 computes pow(A, B); any other value computes pow(B, A).
__kernel void pow_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    const int b_row = (row >= M2) ? (row % M2) : row;
    const int b_col = (col >= N2) ? (col % N2) : col;

    const float value = (float)A[offset];
    const float other = (float)B[b_row * N2 + b_col];
    if (switch_op == 0) {
        C[offset] = pow(value, other);
    } else {
        C[offset] = pow(other, value);
    }
}