tensor_stream-opencl 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +51 -0
- data/LICENSE.txt +21 -0
- data/README.md +58 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/tensor_stream/opencl.rb +7 -0
- data/lib/tensor_stream/opencl/kernels/_bool_operand.cl +45 -0
- data/lib/tensor_stream/opencl/kernels/_operand.cl +45 -0
- data/lib/tensor_stream/opencl/kernels/abs.cl +20 -0
- data/lib/tensor_stream/opencl/kernels/acos.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/add.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/apply_adam.cl +23 -0
- data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +16 -0
- data/lib/tensor_stream/opencl/kernels/argmax.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/argmin.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/asin.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/cast.cl +10 -0
- data/lib/tensor_stream/opencl/kernels/ceil.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/cond.cl.erb +6 -0
- data/lib/tensor_stream/opencl/kernels/cos.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/div.cl.erb +3 -0
- data/lib/tensor_stream/opencl/kernels/exp.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/floor.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/floor_div.cl +48 -0
- data/lib/tensor_stream/opencl/kernels/floor_mod.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/gemm.cl +32 -0
- data/lib/tensor_stream/opencl/kernels/log.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/log1p.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/log_softmax.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/max.cl +46 -0
- data/lib/tensor_stream/opencl/kernels/min.cl +46 -0
- data/lib/tensor_stream/opencl/kernels/mod.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/mul.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/negate.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/pack.cl +24 -0
- data/lib/tensor_stream/opencl/kernels/pow.cl +46 -0
- data/lib/tensor_stream/opencl/kernels/real_div.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/reciprocal.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/round.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/sigmoid.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/sigmoid_grad.cl +55 -0
- data/lib/tensor_stream/opencl/kernels/sign.cl +21 -0
- data/lib/tensor_stream/opencl/kernels/sin.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/softmax.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/softmax_cross.cl +32 -0
- data/lib/tensor_stream/opencl/kernels/softmax_cross_grad.cl +28 -0
- data/lib/tensor_stream/opencl/kernels/softmax_grad.cl +46 -0
- data/lib/tensor_stream/opencl/kernels/sqrt.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/square.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/squared_difference.cl +53 -0
- data/lib/tensor_stream/opencl/kernels/sub.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/tan.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/tanh.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +7 -0
- data/lib/tensor_stream/opencl/kernels/where.cl +8 -0
- data/lib/tensor_stream/opencl/math_ops.rb +133 -0
- data/lib/tensor_stream/opencl/nn_ops.rb +191 -0
- data/lib/tensor_stream/opencl/opencl_buffer.rb +35 -0
- data/lib/tensor_stream/opencl/opencl_device.rb +5 -0
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +933 -0
- data/lib/tensor_stream/opencl/opencl_template_helper.rb +99 -0
- data/lib/tensor_stream/opencl/version.rb +5 -0
- data/tensor_stream-opencl.gemspec +40 -0
- metadata +185 -0
@@ -0,0 +1,8 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void reciprocal_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = 1 / A[globalRow * N + globalCol];
+}
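Each of the elementwise kernels in this release follows the same layout: one OpenCL work item per element of an M x N tensor, with the flat buffer index computed as row * N + col (row-major). A minimal Ruby sketch of that indexing, purely illustrative (flat_index and the sample array are not part of the gem):

    # Illustrative only: how the elementwise kernels map (row, col) onto the
    # flat, row-major buffer they read and write.
    def flat_index(row, col, num_cols)
      row * num_cols + col
    end

    a = [[1.0, 2.0, 4.0], [0.5, 0.25, 0.125]]   # M = 2, N = 3
    flat = a.flatten                            # row-major, like the OpenCL buffer
    m, n = a.size, a.first.size
    c = Array.new(m * n)
    m.times do |row|
      n.times do |col|
        c[flat_index(row, col, n)] = 1.0 / flat[flat_index(row, col, n)]
      end
    end
    # c == [1.0, 0.5, 0.25, 2.0, 4.0, 8.0]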
@@ -0,0 +1,8 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = round(A[globalRow * N + globalCol]);
+}
@@ -0,0 +1,9 @@
+% c_dtype = dtype_to_c_type(dtype)
+
+__kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = 1.0f/(1.0f + exp(-A[globalRow * N + globalCol]));
+}
@@ -0,0 +1,55 @@
+% c_dtype = dtype_to_c_type(dtype)
+
+float sigmoid(<%= c_dtype %> x) {
+    return 1.0f/(1.0f + exp(-x));
+}
+
+float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
+    return g * sigmoid(x) * ( 1.0f - sigmoid(x));
+}
+
+// same dimension add floating point op
+__kernel void sigmoid_grad_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[globalRow * N + globalCol]);
+}
+
+// 1D + Scalar floating point add op
+__kernel void sigmoid_grad_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[0]);
+    } else {
+      C[globalRow * N + globalCol] = sigmoid_grad(B[0], A[globalRow * N + globalCol]);
+    }
+}
+
+// 1D + Scalar floating point add op broadcast
+__kernel void sigmoid_grad_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    int b_m_index = globalRow;
+    int b_n_index = globalCol;
+
+    if ( b_m_index >= M2) {
+      b_m_index = b_m_index % M2;
+    };
+
+    if (b_n_index >= N2) {
+      b_n_index = b_n_index % N2;
+    }
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[b_m_index * N2 + b_n_index]);
+    } else {
+      C[globalRow * N + globalCol] = sigmoid_grad(B[b_m_index * N2 + b_n_index], A[globalRow * N + globalCol]);
+    }
+}
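The `_b` broadcast variants above (and the analogous kernels for the other binary ops) pair element (row, col) of the full-size operand A with an element of the smaller operand B of shape M2 x N2 by wrapping the indices. A hedged Ruby sketch of that index wrapping, using hypothetical names not present in the gem:

    # Illustrative only: which element of the smaller operand B (M2 x N2)
    # the *_b broadcast kernels read for element (row, col) of A.
    def broadcast_index(row, col, m2, n2)
      b_row = row >= m2 ? row % m2 : row
      b_col = col >= n2 ? col % n2 : col
      [b_row, b_col]
    end

    broadcast_index(3, 5, 1, 6)  # => [0, 5]  row vector broadcast down the rows
    broadcast_index(3, 5, 4, 1)  # => [3, 0]  column vector broadcast across the columns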
@@ -0,0 +1,21 @@
+% c_dtype = dtype_to_c_type(dtype)
+
+__kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    <%= c_dtype %> value = A[globalRow * N + globalCol];
+% if floating_point?(dtype)
+    if (isnan(value) || value == 0.0f) {
+      C[globalRow * N + globalCol] = 0.0;
+    } else {
+      C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
+    }
+% else
+    if (value == 0) {
+      C[globalRow * N + globalCol] = 0;
+    } else {
+      C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
+    }
+% end
+}
@@ -0,0 +1,9 @@
+
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = sin(A[globalRow * N + globalCol]);
+}
@@ -0,0 +1,26 @@
+// First naive implementation
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void softmax_<%= dtype %>(const int N,
+                      const __global <%= c_dtype %>* A,
+                      __global <%= c_dtype %>* C) {
+
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+    // Compute a single element (loop over K)
+    <%= c_dtype %> acc = 0.0f;
+    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+    for (int k=0; k<N; k++) {
+      max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+    }
+
+    for (int k=0; k<N; k++) {
+      acc += exp(A[globalRow*N + k] - max);
+    }
+
+    // Store the result
+    for (int k=0; k < N; k++) {
+      C[globalRow*N + k] = exp(A[globalRow*N + k] - max) / acc;
+    }
+}
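The softmax kernel subtracts the per-row maximum before exponentiating; the factor exp(-max) cancels in the ratio, so the result is unchanged while exp() never sees a large argument. A small Ruby sketch of the same trick (illustrative only, not code from the gem):

    # Illustrative only: the max-subtraction trick used by the softmax kernel
    # to avoid overflow in exp().
    def softmax(row)
      max = row.max
      exps = row.map { |a| Math.exp(a - max) }
      sum = exps.sum
      exps.map { |e| e / sum }
    end

    softmax([1000.0, 1001.0, 1002.0])
    # => ~[0.090, 0.245, 0.665]; a naive exp(1000.0) would overflow to Infinity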
@@ -0,0 +1,32 @@
+
+// First naive implementation
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void softmax_cross_<%= dtype %>(const int N,
+                      const __global <%= c_dtype %>* A,
+                      const __global <%= c_dtype %>* L,
+                      __global <%= c_dtype %>* C, __global <%= c_dtype %>* P) {
+
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+    // Compute a single element (loop over K)
+    <%= c_dtype %> acc = 0.0f;
+    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+    for (int k=0; k<N; k++) {
+      max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+    }
+
+    for (int k=0; k<N; k++) {
+      acc += exp(A[globalRow*N + k] - max);
+    }
+
+    // Store the result
+    for (int k=0; k < N; k++) {
+      C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
+    }
+
+    for (int k=0; k < N; k++) {
+      P[globalRow*N + k] = (exp(A[globalRow*N + k] - max) / acc) - L[globalRow*N + k];
+    }
+}
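This kernel produces two outputs per row: C holds the per-class cross-entropy terms, since log(acc) - (a - max) equals -log(softmax(a)) in log-sum-exp form, and P holds softmax(logits) - labels, the gradient with respect to the logits. A Ruby sketch of the same arithmetic on one row (illustrative only; softmax_cross and its arguments are hypothetical names):

    # Illustrative only: what the two outputs of softmax_cross amount to.
    def softmax_cross(logits, labels)
      max = logits.max
      acc = logits.sum { |a| Math.exp(a - max) }
      c = logits.zip(labels).map { |a, l| (Math.log(acc) - (a - max)) * l }
      p = logits.zip(labels).map { |a, l| Math.exp(a - max) / acc - l }
      [c, p]
    end

    c, p = softmax_cross([2.0, 1.0, 0.1], [0.0, 1.0, 0.0])
    c.sum  # per-example cross-entropy loss, ~1.417 here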
@@ -0,0 +1,28 @@
+// First naive implementation
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void softmax_cross_grad_<%= dtype %>(const int N,
+                      const __global <%= c_dtype %>* A,
+                      const __global <%= c_dtype %>* L,
+                      const __global <%= c_dtype %>* G,
+                      __global <%= c_dtype %>* C) {
+
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+    // Compute a single element (loop over K)
+    <%= c_dtype %> acc = 0.0f;
+    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+    for (int k=0; k<N; k++) {
+      max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+    }
+
+    for (int k=0; k<N; k++) {
+      acc += exp(A[globalRow*N + k] - max);
+    }
+
+    // Store the result
+    for (int k=0; k < N; k++) {
+      C[globalRow*N + k] = ((exp(A[globalRow*N + k] - max)/acc) * G[globalRow*N + k] - L[globalRow*N + k]);
+    }
+}
@@ -0,0 +1,46 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void softmax_grad_<%= dtype %>(const int N,
+                      const __global <%= c_dtype %>* A,
+                      const __global <%= c_dtype %>* G,
+                      __global <%= c_dtype %>* C) {
+
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+    // Compute a single element (loop over K)
+    float acc = 0.0f;
+    float max = FLT_MIN;
+    float row[<%= size %>];
+    float grads[<%= size %>][<%= size %>];
+
+    for (int k=0; k<N; k++) {
+      max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+    }
+
+    for (int k=0; k<N; k++) {
+      acc += exp(A[globalRow*N + k] - max);
+    }
+
+    // Store the result
+    for (int k=0; k < N; k++) {
+      row[k] = exp(A[globalRow*N + k] - max) / acc;
+    }
+
+    for (int a=0; a < N; a++) {
+      for(int b=0; b < N; b++) {
+        if (a != b) {
+          grads[a][b] = -row[a] * row[b];
+        } else {
+          grads[a][b] = row[a] * (1.0f - row[a]);
+        }
+      }
+    }
+
+    for (int k=0; k < N; k++) {
+      float total_grad = 0.0f;
+      for (int a = 0; a < N; a++) {
+        total_grad += grads[a][k] * G[globalRow*N + a];
+      }
+      C[globalRow*N + k] = total_grad;
+    }
+}
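The softmax gradient kernel builds the full Jacobian of the softmax for each row: the diagonal entries are s_a * (1 - s_a), the off-diagonal entries are -s_a * s_b, and the output is that Jacobian applied to the incoming gradient G. A compact Ruby sketch of the same backward pass (illustrative only; these names are not part of the gem):

    # Illustrative only: the Jacobian-based backward pass of softmax_grad.
    # softmax_row is the already-computed softmax output for one row,
    # incoming is the upstream gradient G for that row.
    def softmax_grad(softmax_row, incoming)
      n = softmax_row.size
      (0...n).map do |k|
        (0...n).sum do |a|
          jac = a == k ? softmax_row[a] * (1.0 - softmax_row[a]) : -softmax_row[a] * softmax_row[k]
          jac * incoming[a]
        end
      end
    end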
@@ -0,0 +1,9 @@
+% c_dtype = dtype_to_c_type(dtype)
+
+__kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = sqrt(A[globalRow * N + globalCol]);
+}
@@ -0,0 +1,9 @@
+% c_dtype = dtype_to_c_type(dtype)
+
+__kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
+}
@@ -0,0 +1,53 @@
+% c_dtype = dtype_to_c_type(dtype)
+// same dimension add floating point op
+__kernel void squared_difference_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    <%= c_dtype %> x = A[globalRow * N + globalCol];
+    <%= c_dtype %> y = B[globalRow * N + globalCol];
+    C[globalRow * N + globalCol] = (x - y) * (x - y);
+}
+
+// 1D + Scalar floating point add op
+__kernel void squared_difference_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    <%= c_dtype %> x = A[globalRow * N + globalCol];
+    <%= c_dtype %> y = B[0];
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = (x - y) * (x - y);
+    } else {
+      C[globalRow * N + globalCol] = (y - x) * (y - x);
+    }
+}
+
+// 1D + Scalar floating point add op broadcast
+__kernel void squared_difference_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    int b_m_index = globalRow;
+    int b_n_index = globalCol;
+
+    if ( b_m_index >= M2) {
+      b_m_index = b_m_index % M2;
+    };
+
+    if (b_n_index >= N2) {
+      b_n_index = b_n_index % N2;
+    }
+
+    <%= c_dtype %> x = A[globalRow * N + globalCol];
+    <%= c_dtype %> y = B[b_m_index * N2 + b_n_index];
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = (x - y) * (x - y);
+    } else {
+      C[globalRow * N + globalCol] = (y - x) * (y - x);
+    }
+}
@@ -0,0 +1,8 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = tan(A[globalRow * N + globalCol]);
+}
@@ -0,0 +1,8 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = tanh(A[globalRow * N + globalCol]);
+}
@@ -0,0 +1,7 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
+}
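The tanh_grad kernel evaluates the identity d/dx tanh(x) = 1 - tanh(x)^2. A quick Ruby finite-difference check of that identity (illustrative only):

    x = 0.7
    analytic = 1 - Math.tanh(x)**2
    numeric  = (Math.tanh(x + 1e-6) - Math.tanh(x - 1e-6)) / 2e-6
    (analytic - numeric).abs < 1e-6  # => true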
@@ -0,0 +1,8 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void where_<%= dtype %>(const int M, const int N, __global const short *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+}
@@ -0,0 +1,133 @@
+module TensorStream
+  module OpenCLHelpers
+    # Collection of math functions for interfacing with OpenCL kernels
+    module MathOps
+      def MathOps.included(klass)
+        klass.class_eval do
+          %i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
+            register_op op, noop: true do |context, tensor, inputs|
+              execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
+            end
+          end
+
+          register_op :add_n do |_context, tensor, inputs|
+            if inputs.size == 1
+              inputs[0]
+            else
+              m, n = inputs[0].shape
+              work_group = [m || 1, n || 1]
+              cl_m = OpenCL::Int1.new(m || 1)
+              cl_n = OpenCL::Int1.new(n || 1)
+              cl_switch = OpenCL::Int1.new(0)
+              dtype = tensor.data_type
+
+              output_buffer = _create_result_buffer(tensor.data_type, inputs[0].shape, "out_#{tensor.name}")
+              inputs_queue = inputs.dup
+              a = inputs_queue.pop
+              until inputs_queue.empty?
+                b = inputs_queue.pop
+                event_wait_list = build_event_wait_list([a, b])
+                method_call = :"add_#{a.data_type}_#{b.data_type}"
+                event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+                a = output_buffer
+                a.op = event
+              end
+
+              output_buffer.op = a.op
+              output_buffer
+            end
+          end
+
+          register_op :floor_div, noop: true do |context, tensor, inputs|
+            if fp_type?(tensor.data_type)
+              execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
+            else
+              execute_2_operand_func('div', tensor, inputs[0], inputs[1], context)
+            end
+          end
+
+          register_op :mat_mul do |_context, tensor, inputs|
+            a, b = inputs
+
+            m = a.shape[0]
+            n = b.shape[1]
+            v = b.shape[0]
+            k = a.shape[1]
+
+            if tensor.options[:transpose_a]
+              m = a.shape[1]
+              k = a.shape[0]
+            end
+
+            if tensor.options[:transpose_b]
+              n = b.shape[0]
+              v = b.shape[1]
+            end
+
+            result_shape = [m, n]
+
+            raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
+            raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
+            raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
+
+            dtype = tensor.data_type
+            a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
+            output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
+
+            cl_m = OpenCL::Int1.new(m)
+            cl_n = OpenCL::Int1.new(n)
+            cl_k = OpenCL::Int1.new(k)
+
+            transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
+            transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
+            event_wait_list = build_event_wait_list(inputs)
+            output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+            output_buffer
+          end
+
+          %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil log].each do |op|
+            register_op op, noop: true do |context, tensor, inputs|
+              execute_func(op.to_s, tensor, inputs[0], context)
+            end
+          end
+
+          %i[sum mean].each do |op|
+            register_op op, noop: true do |context, tensor, inputs|
+              reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
+            end
+          end
+
+          register_op :prod, noop: true do |context, tensor, inputs|
+            input_a = complete_eval(inputs[0], context)
+
+            if input_a.buffer.empty?
+              convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
+            else
+              reduction(context, tensor, inputs[0], inputs[1], :prod)
+            end
+          end
+
+          register_op :argmin, buffer: true do |_context, tensor, inputs|
+            axis = tensor.options[:axis] || 0
+            rank = inputs[0].shape.size
+            raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+            arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+            op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
+            convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+          end
+
+          register_op :argmax, buffer: true do |_context, tensor, inputs|
+            axis = tensor.options[:axis] || 0
+            rank = inputs[0].shape.size
+            raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+            arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+            op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
+            convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+          end
+        end
+      end
+    end
+  end
+end
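Before dispatching the gemm kernel, the :mat_mul op above resolves the effective output shape from the transpose flags and checks that the inner dimensions agree (k == v). A standalone Ruby sketch of that bookkeeping, under the assumption of plain [rows, cols] shape arrays (matmul_shape is a hypothetical helper, not part of the gem):

    # Illustrative only: shape resolution performed by the :mat_mul op.
    def matmul_shape(a_shape, b_shape, transpose_a: false, transpose_b: false)
      m, k = transpose_a ? a_shape.reverse : a_shape
      v, n = transpose_b ? b_shape.reverse : b_shape
      raise "incompatible shape sizes for matrix multiplication (#{k} != #{v})" if k != v
      [m, n]
    end

    matmul_shape([2, 3], [3, 4])                      # => [2, 4]
    matmul_shape([3, 2], [3, 4], transpose_a: true)   # => [2, 4]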