tensor_stream-opencl 0.1.0

Files changed (72)
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +5 -0
  5. data/CODE_OF_CONDUCT.md +74 -0
  6. data/Gemfile +6 -0
  7. data/Gemfile.lock +51 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +58 -0
  10. data/Rakefile +6 -0
  11. data/bin/console +14 -0
  12. data/bin/setup +8 -0
  13. data/lib/tensor_stream/opencl.rb +7 -0
  14. data/lib/tensor_stream/opencl/kernels/_bool_operand.cl +45 -0
  15. data/lib/tensor_stream/opencl/kernels/_operand.cl +45 -0
  16. data/lib/tensor_stream/opencl/kernels/abs.cl +20 -0
  17. data/lib/tensor_stream/opencl/kernels/acos.cl +8 -0
  18. data/lib/tensor_stream/opencl/kernels/add.cl +3 -0
  19. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +23 -0
  20. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +9 -0
  21. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +16 -0
  22. data/lib/tensor_stream/opencl/kernels/argmax.cl +8 -0
  23. data/lib/tensor_stream/opencl/kernels/argmin.cl +8 -0
  24. data/lib/tensor_stream/opencl/kernels/asin.cl +9 -0
  25. data/lib/tensor_stream/opencl/kernels/cast.cl +10 -0
  26. data/lib/tensor_stream/opencl/kernels/ceil.cl +8 -0
  27. data/lib/tensor_stream/opencl/kernels/cond.cl.erb +6 -0
  28. data/lib/tensor_stream/opencl/kernels/cos.cl +8 -0
  29. data/lib/tensor_stream/opencl/kernels/div.cl.erb +3 -0
  30. data/lib/tensor_stream/opencl/kernels/exp.cl +8 -0
  31. data/lib/tensor_stream/opencl/kernels/floor.cl +8 -0
  32. data/lib/tensor_stream/opencl/kernels/floor_div.cl +48 -0
  33. data/lib/tensor_stream/opencl/kernels/floor_mod.cl +3 -0
  34. data/lib/tensor_stream/opencl/kernels/gemm.cl +32 -0
  35. data/lib/tensor_stream/opencl/kernels/log.cl +8 -0
  36. data/lib/tensor_stream/opencl/kernels/log1p.cl +8 -0
  37. data/lib/tensor_stream/opencl/kernels/log_softmax.cl +26 -0
  38. data/lib/tensor_stream/opencl/kernels/max.cl +46 -0
  39. data/lib/tensor_stream/opencl/kernels/min.cl +46 -0
  40. data/lib/tensor_stream/opencl/kernels/mod.cl +3 -0
  41. data/lib/tensor_stream/opencl/kernels/mul.cl +3 -0
  42. data/lib/tensor_stream/opencl/kernels/negate.cl +8 -0
  43. data/lib/tensor_stream/opencl/kernels/pack.cl +24 -0
  44. data/lib/tensor_stream/opencl/kernels/pow.cl +46 -0
  45. data/lib/tensor_stream/opencl/kernels/real_div.cl +3 -0
  46. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +8 -0
  47. data/lib/tensor_stream/opencl/kernels/round.cl +8 -0
  48. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +9 -0
  49. data/lib/tensor_stream/opencl/kernels/sigmoid_grad.cl +55 -0
  50. data/lib/tensor_stream/opencl/kernels/sign.cl +21 -0
  51. data/lib/tensor_stream/opencl/kernels/sin.cl +9 -0
  52. data/lib/tensor_stream/opencl/kernels/softmax.cl +26 -0
  53. data/lib/tensor_stream/opencl/kernels/softmax_cross.cl +32 -0
  54. data/lib/tensor_stream/opencl/kernels/softmax_cross_grad.cl +28 -0
  55. data/lib/tensor_stream/opencl/kernels/softmax_grad.cl +46 -0
  56. data/lib/tensor_stream/opencl/kernels/sqrt.cl +9 -0
  57. data/lib/tensor_stream/opencl/kernels/square.cl +9 -0
  58. data/lib/tensor_stream/opencl/kernels/squared_difference.cl +53 -0
  59. data/lib/tensor_stream/opencl/kernels/sub.cl +3 -0
  60. data/lib/tensor_stream/opencl/kernels/tan.cl +8 -0
  61. data/lib/tensor_stream/opencl/kernels/tanh.cl +8 -0
  62. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +7 -0
  63. data/lib/tensor_stream/opencl/kernels/where.cl +8 -0
  64. data/lib/tensor_stream/opencl/math_ops.rb +133 -0
  65. data/lib/tensor_stream/opencl/nn_ops.rb +191 -0
  66. data/lib/tensor_stream/opencl/opencl_buffer.rb +35 -0
  67. data/lib/tensor_stream/opencl/opencl_device.rb +5 -0
  68. data/lib/tensor_stream/opencl/opencl_evaluator.rb +933 -0
  69. data/lib/tensor_stream/opencl/opencl_template_helper.rb +99 -0
  70. data/lib/tensor_stream/opencl/version.rb +5 -0
  71. data/tensor_stream-opencl.gemspec +40 -0
  72. metadata +185 -0
data/lib/tensor_stream/opencl/kernels/real_div.cl
@@ -0,0 +1,3 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % op = operator_to_c('div')
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'real_div', dtype: "#{a}_#{b}", result_t: c_dtype %>
data/lib/tensor_stream/opencl/kernels/reciprocal.cl
@@ -0,0 +1,8 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void reciprocal_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = 1 / A[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/opencl/kernels/round.cl
@@ -0,0 +1,8 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = round(A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/opencl/kernels/sigmoid.cl
@@ -0,0 +1,9 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = 1.0f/(1.0f + exp(-A[globalRow * N + globalCol]));
+ }
data/lib/tensor_stream/opencl/kernels/sigmoid_grad.cl
@@ -0,0 +1,55 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ float sigmoid(<%= c_dtype %> x) {
+ return 1.0f/(1.0f + exp(-x));
+ }
+
+ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
+ return g * sigmoid(x) * ( 1.0f - sigmoid(x));
+ }
+
+ // same dimension add floating point op
+ __kernel void sigmoid_grad_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[globalRow * N + globalCol]);
+ }
+
+ // 1D + Scalar floating point add op
+ __kernel void sigmoid_grad_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ if (switch_op == 0) {
+ C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[0]);
+ } else {
+ C[globalRow * N + globalCol] = sigmoid_grad(B[0], A[globalRow * N + globalCol]);
+ }
+ }
+
+ // 1D + Scalar floating point add op broadcast
+ __kernel void sigmoid_grad_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ int b_m_index = globalRow;
+ int b_n_index = globalCol;
+
+ if ( b_m_index >= M2) {
+ b_m_index = b_m_index % M2;
+ };
+
+ if (b_n_index >= N2) {
+ b_n_index = b_n_index % N2;
+ }
+
+ if (switch_op == 0) {
+ C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[b_m_index * N2 + b_n_index]);
+ } else {
+ C[globalRow * N + globalCol] = sigmoid_grad(B[b_m_index * N2 + b_n_index], A[globalRow * N + globalCol]);
+ }
+ }
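Note: the sigmoid_grad helper above applies the incoming gradient g to the standard derivative of the logistic function; as a quick check against the kernel body (standard calculus, not part of the diff):

\sigma(x) = \frac{1}{1 + e^{-x}}, \qquad \sigma'(x) = \sigma(x)\,(1 - \sigma(x)), \qquad \text{sigmoid\_grad}(x, g) = g\,\sigma(x)\,(1 - \sigma(x))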
data/lib/tensor_stream/opencl/kernels/sign.cl
@@ -0,0 +1,21 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+ <%= c_dtype %> value = A[globalRow * N + globalCol];
+ % if floating_point?(dtype)
+ if (isnan(value) || value == 0.0f) {
+ C[globalRow * N + globalCol] = 0.0;
+ } else {
+ C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
+ }
+ % else
+ if (value == 0) {
+ C[globalRow * N + globalCol] = 0;
+ } else {
+ C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
+ }
+ % end
+ }
data/lib/tensor_stream/opencl/kernels/sin.cl
@@ -0,0 +1,9 @@
+
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = sin(A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/opencl/kernels/softmax.cl
@@ -0,0 +1,26 @@
+ // First naive implementation
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ <%= c_dtype %> acc = 0.0f;
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ C[globalRow*N + k] = exp(A[globalRow*N + k] - max) / acc;
+ }
+ }
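Note: softmax_<dtype> first scans each row for its maximum and subtracts it before exponentiating. This is the usual max-shift trick for numerical stability; it does not change the result, since for any constant m (standard identity, not part of the diff):

\operatorname{softmax}(a)_i = \frac{e^{a_i - m}}{\sum_k e^{a_k - m}} = \frac{e^{a_i}}{\sum_k e^{a_k}}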
data/lib/tensor_stream/opencl/kernels/softmax_cross.cl
@@ -0,0 +1,32 @@
+
+ // First naive implementation
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_cross_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ const __global <%= c_dtype %>* L,
+ __global <%= c_dtype %>* C, __global <%= c_dtype %>* P) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ <%= c_dtype %> acc = 0.0f;
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
+ }
+
+ for (int k=0; k < N; k++) {
+ P[globalRow*N + k] = (exp(A[globalRow*N + k] - max) / acc) - L[globalRow*N + k];
+ }
+ }
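Note: with m denoting the row maximum subtracted for stability, the value written to C is L_i \cdot (\log \sum_k e^{a_k - m} - (a_i - m)) = -L_i \log \operatorname{softmax}(a)_i, i.e. the per-element cross-entropy term, while P receives \operatorname{softmax}(a) - L. For labels that sum to 1 per row, the latter is the standard gradient of the loss with respect to the logits (standard result, not part of the diff):

\frac{\partial}{\partial a_i}\left(-\sum_k L_k \log \operatorname{softmax}(a)_k\right) = \operatorname{softmax}(a)_i - L_i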
data/lib/tensor_stream/opencl/kernels/softmax_cross_grad.cl
@@ -0,0 +1,28 @@
+ // First naive implementation
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_cross_grad_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ const __global <%= c_dtype %>* L,
+ const __global <%= c_dtype %>* G,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ <%= c_dtype %> acc = 0.0f;
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ C[globalRow*N + k] = ((exp(A[globalRow*N + k] - max)/acc) * G[globalRow*N + k] - L[globalRow*N + k]);
+ }
+ }
data/lib/tensor_stream/opencl/kernels/softmax_grad.cl
@@ -0,0 +1,46 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_grad_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ const __global <%= c_dtype %>* G,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ float acc = 0.0f;
+ float max = FLT_MIN;
+ float row[<%= size %>];
+ float grads[<%= size %>][<%= size %>];
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ row[k] = exp(A[globalRow*N + k] - max) / acc;
+ }
+
+ for (int a=0; a < N; a++) {
+ for(int b=0; b < N; b++) {
+ if (a != b) {
+ grads[a][b] = -row[a] * row[b];
+ } else {
+ grads[a][b] = row[a] * (1.0f - row[a]);
+ }
+ }
+ }
+
+ for (int k=0; k < N; k++) {
+ float total_grad = 0.0f;
+ for (int a = 0; a < N; a++) {
+ total_grad += grads[a][k] * G[globalRow*N + a];
+ }
+ C[globalRow*N + k] = total_grad;
+ }
+ }
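Note: softmax_grad materializes the full N x N softmax Jacobian per row in the private arrays row[] and grads[][] and then contracts it with the incoming gradient G. The entries follow the standard Jacobian of softmax (s denotes the softmax of the row; standard result, not part of the diff):

\frac{\partial s_a}{\partial x_b} = s_a(\delta_{ab} - s_b) = \begin{cases} s_a(1 - s_a) & a = b \\ -s_a s_b & a \neq b \end{cases}, \qquad C_k = \sum_a \frac{\partial s_a}{\partial x_k}\, G_a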
data/lib/tensor_stream/opencl/kernels/sqrt.cl
@@ -0,0 +1,9 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = sqrt(A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/opencl/kernels/square.cl
@@ -0,0 +1,9 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/opencl/kernels/squared_difference.cl
@@ -0,0 +1,53 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ // same dimension add floating point op
+ __kernel void squared_difference_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+ <%= c_dtype %> x = A[globalRow * N + globalCol];
+ <%= c_dtype %> y = B[globalRow * N + globalCol];
+ C[globalRow * N + globalCol] = (x - y) * (x - y);
+ }
+
+ // 1D + Scalar floating point add op
+ __kernel void squared_difference_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ <%= c_dtype %> x = A[globalRow * N + globalCol];
+ <%= c_dtype %> y = B[0];
+
+ if (switch_op == 0) {
+ C[globalRow * N + globalCol] = (x - y) * (x - y);
+ } else {
+ C[globalRow * N + globalCol] = (y - x) * (y - x);
+ }
+ }
+
+ // 1D + Scalar floating point add op broadcast
+ __kernel void squared_difference_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ int b_m_index = globalRow;
+ int b_n_index = globalCol;
+
+ if ( b_m_index >= M2) {
+ b_m_index = b_m_index % M2;
+ };
+
+ if (b_n_index >= N2) {
+ b_n_index = b_n_index % N2;
+ }
+
+ <%= c_dtype %> x = A[globalRow * N + globalCol];
+ <%= c_dtype %> y = B[b_m_index * N2 + b_n_index];
+
+ if (switch_op == 0) {
+ C[globalRow * N + globalCol] = (x - y) * (x - y);
+ } else {
+ C[globalRow * N + globalCol] = (y - x) * (y - x);
+ }
+ }
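Note: the *_b broadcast variants (here and in sigmoid_grad.cl) reuse the smaller operand B of shape M2 x N2 by wrapping out-of-range indices with a modulo. A minimal Ruby sketch of that index mapping, purely for illustration (the helper name broadcast_index is hypothetical and not part of the gem):

# Hypothetical CPU-side illustration of the modulo broadcast used by the *_b kernels.
def broadcast_index(row, col, m2, n2)
  [row >= m2 ? row % m2 : row, col >= n2 ? col % n2 : col]
end

# With A of shape [2, 3] and B of shape [1, 3], every row of A reads B's single row:
# broadcast_index(0, 2, 1, 3) #=> [0, 2]
# broadcast_index(1, 2, 1, 3) #=> [0, 2]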
data/lib/tensor_stream/opencl/kernels/sub.cl
@@ -0,0 +1,3 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % op = operator_to_c('sub')
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: "#{a}_#{b}", result_t: c_dtype %>
data/lib/tensor_stream/opencl/kernels/tan.cl
@@ -0,0 +1,8 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = tan(A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/opencl/kernels/tanh.cl
@@ -0,0 +1,8 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = tanh(A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/opencl/kernels/tanh_grad.cl
@@ -0,0 +1,7 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+ C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
+ }
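Note: tanh_grad emits the local derivative of tanh only; multiplying by the upstream gradient is left to the caller. The identity relied on (standard calculus, not part of the diff):

\frac{d}{dx}\tanh(x) = 1 - \tanh^2(x)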
data/lib/tensor_stream/opencl/kernels/where.cl
@@ -0,0 +1,8 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void where_<%= dtype %>(const int M, const int N, __global const short *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/opencl/math_ops.rb
@@ -0,0 +1,133 @@
+ module TensorStream
+ module OpenCLHelpers
+ # Collection of math functions for interfacing with OpenCL kernels
+ module MathOps
+ def MathOps.included(klass)
+ klass.class_eval do
+ %i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
+ register_op op, noop: true do |context, tensor, inputs|
+ execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
+ end
+ end
+
+ register_op :add_n do |_context, tensor, inputs|
+ if inputs.size == 1
+ inputs[0]
+ else
+ m, n = inputs[0].shape
+ work_group = [m || 1, n || 1]
+ cl_m = OpenCL::Int1.new(m || 1)
+ cl_n = OpenCL::Int1.new(n || 1)
+ cl_switch = OpenCL::Int1.new(0)
+ dtype = tensor.data_type
+
+ output_buffer = _create_result_buffer(tensor.data_type, inputs[0].shape, "out_#{tensor.name}")
+ inputs_queue = inputs.dup
+ a = inputs_queue.pop
+ until inputs_queue.empty?
+ b = inputs_queue.pop
+ event_wait_list = build_event_wait_list([a, b])
+ method_call = :"add_#{a.data_type}_#{b.data_type}"
+ event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ a = output_buffer
+ a.op = event
+ end
+
+ output_buffer.op = a.op
+ output_buffer
+ end
+ end
+
+ register_op :floor_div, noop: true do |context, tensor, inputs|
+ if fp_type?(tensor.data_type)
+ execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
+ else
+ execute_2_operand_func('div', tensor, inputs[0], inputs[1], context)
+ end
+ end
+
+ register_op :mat_mul do |_context, tensor, inputs|
+ a, b = inputs
+
+ m = a.shape[0]
+ n = b.shape[1]
+ v = b.shape[0]
+ k = a.shape[1]
+
+ if tensor.options[:transpose_a]
+ m = a.shape[1]
+ k = a.shape[0]
+ end
+
+ if tensor.options[:transpose_b]
+ n = b.shape[0]
+ v = b.shape[1]
+ end
+
+ result_shape = [m, n]
+
+ raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
+ raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
+ raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
+
+ dtype = tensor.data_type
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
+ output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
+
+ cl_m = OpenCL::Int1.new(m)
+ cl_n = OpenCL::Int1.new(n)
+ cl_k = OpenCL::Int1.new(k)
+
+ transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
+ transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
+ event_wait_list = build_event_wait_list(inputs)
+ output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer
+ end
+
+ %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil log].each do |op|
+ register_op op, noop: true do |context, tensor, inputs|
+ execute_func(op.to_s, tensor, inputs[0], context)
+ end
+ end
+
+ %i[sum mean].each do |op|
+ register_op op, noop: true do |context, tensor, inputs|
+ reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
+ end
+ end
+
+ register_op :prod, noop: true do |context, tensor, inputs|
+ input_a = complete_eval(inputs[0], context)
+
+ if input_a.buffer.empty?
+ convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
+ else
+ reduction(context, tensor, inputs[0], inputs[1], :prod)
+ end
+ end
+
+ register_op :argmin, buffer: true do |_context, tensor, inputs|
+ axis = tensor.options[:axis] || 0
+ rank = inputs[0].shape.size
+ raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+ arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+ op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
+ convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+ end
+
+ register_op :argmax, buffer: true do |_context, tensor, inputs|
+ axis = tensor.options[:axis] || 0
+ rank = inputs[0].shape.size
+ raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+ arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+ op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
+ convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+ end
+ end
+ end
+ end
+ end
+ end
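Note: math_ops.rb wires these kernels into the OpenCL evaluator via register_op, so they are reached through the normal TensorStream graph API rather than invoked directly. A minimal sketch of how that path might be exercised, assuming the OpenCL evaluator is picked up automatically once tensor_stream/opencl is required (the session setup and evaluator selection shown here are assumptions, not taken from this changeset):

require 'tensor_stream'
require 'tensor_stream/opencl'

ts = TensorStream

a = ts.constant([[1.0, 2.0], [3.0, 4.0]])
b = ts.constant([[5.0, 6.0], [7.0, 8.0]])

c = ts.matmul(a, b) # dispatched to the gemm kernel through register_op :mat_mul
d = ts.add(a, b)    # element-wise add kernel via execute_2_operand_func

sess = ts.session   # assumed to default to the OpenCL evaluator when the gem is loaded
puts sess.run(c).inspect
puts sess.run(d).inspect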