tensor_stream-opencl 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +51 -0
- data/LICENSE.txt +21 -0
- data/README.md +58 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/tensor_stream/opencl.rb +7 -0
- data/lib/tensor_stream/opencl/kernels/_bool_operand.cl +45 -0
- data/lib/tensor_stream/opencl/kernels/_operand.cl +45 -0
- data/lib/tensor_stream/opencl/kernels/abs.cl +20 -0
- data/lib/tensor_stream/opencl/kernels/acos.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/add.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/apply_adam.cl +23 -0
- data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +16 -0
- data/lib/tensor_stream/opencl/kernels/argmax.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/argmin.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/asin.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/cast.cl +10 -0
- data/lib/tensor_stream/opencl/kernels/ceil.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/cond.cl.erb +6 -0
- data/lib/tensor_stream/opencl/kernels/cos.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/div.cl.erb +3 -0
- data/lib/tensor_stream/opencl/kernels/exp.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/floor.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/floor_div.cl +48 -0
- data/lib/tensor_stream/opencl/kernels/floor_mod.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/gemm.cl +32 -0
- data/lib/tensor_stream/opencl/kernels/log.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/log1p.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/log_softmax.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/max.cl +46 -0
- data/lib/tensor_stream/opencl/kernels/min.cl +46 -0
- data/lib/tensor_stream/opencl/kernels/mod.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/mul.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/negate.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/pack.cl +24 -0
- data/lib/tensor_stream/opencl/kernels/pow.cl +46 -0
- data/lib/tensor_stream/opencl/kernels/real_div.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/reciprocal.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/round.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/sigmoid.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/sigmoid_grad.cl +55 -0
- data/lib/tensor_stream/opencl/kernels/sign.cl +21 -0
- data/lib/tensor_stream/opencl/kernels/sin.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/softmax.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/softmax_cross.cl +32 -0
- data/lib/tensor_stream/opencl/kernels/softmax_cross_grad.cl +28 -0
- data/lib/tensor_stream/opencl/kernels/softmax_grad.cl +46 -0
- data/lib/tensor_stream/opencl/kernels/sqrt.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/square.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/squared_difference.cl +53 -0
- data/lib/tensor_stream/opencl/kernels/sub.cl +3 -0
- data/lib/tensor_stream/opencl/kernels/tan.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/tanh.cl +8 -0
- data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +7 -0
- data/lib/tensor_stream/opencl/kernels/where.cl +8 -0
- data/lib/tensor_stream/opencl/math_ops.rb +133 -0
- data/lib/tensor_stream/opencl/nn_ops.rb +191 -0
- data/lib/tensor_stream/opencl/opencl_buffer.rb +35 -0
- data/lib/tensor_stream/opencl/opencl_device.rb +5 -0
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +933 -0
- data/lib/tensor_stream/opencl/opencl_template_helper.rb +99 -0
- data/lib/tensor_stream/opencl/version.rb +5 -0
- data/tensor_stream-opencl.gemspec +40 -0
- metadata +185 -0
@@ -0,0 +1,9 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// In-place scaled subtraction: C[i] -= A[i] * B[0].
// Buffers are addressed row-major with a row stride of N; M is accepted but not read.
//   A - per-element values to subtract (presumably the gradient, going by the kernel name)
//   B - only B[0] is read; it scales every element of A (presumably the learning rate - confirm with the caller)
//   C - buffer updated in place
__kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    // One work-item per element: dimension 0 picks the row, dimension 1 the column.
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col;

    C[offset] -= A[offset] * B[0];
}
|
@@ -0,0 +1,16 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Momentum optimizer step, one element per work-item (row-major, row stride N; M is not read):
//   acc     = acc * momentum + grad
//   output -= acc * learning_rate                                     (nesterov off)
//   output -= grad * learning_rate + acc * momentum * learning_rate   (nesterov on)
// Both variants use the freshly updated accumulator, matching TensorFlow's ApplyMomentum.
//   grad          - per-element gradient
//   learning_rate - only element 0 is read
//   momentum      - only element 0 is read
//   output        - variable buffer, updated in place
//   acc           - accumulator buffer, updated in place
__kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
                                          __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0);
    const int globalCol = get_global_id(1);
    const int index = globalRow * N + globalCol;

    // Refresh the accumulator first: the variable update below must see the new value,
    // otherwise the step lags one iteration behind and the very first step is a no-op.
    const <%= c_dtype %> acc_m = acc[index] * momentum[0] + grad[index];
    acc[index] = acc_m;
<% if nesterov %>
    output[index] -= grad[index] * learning_rate[0] + acc_m * momentum[0] * learning_rate[0];
<% else %>
    output[index] -= acc_m * learning_rate[0];
<% end %>
}
|
@@ -0,0 +1,8 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// NOTE(review): despite its name, this kernel performs no arg-max reduction. It stores the
// element-wise sum A[i] + B[i] into C[i] (row-major, row stride N) and never reads M or
// switch_op. Confirm whether anything still dispatches to it before relying on it.
__kernel void argmax_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int pos = row * N + col;

    C[pos] = A[pos] + B[pos];
}
|
@@ -0,0 +1,8 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// NOTE(review): despite its name, this kernel performs no arg-min reduction. It stores the
// element-wise sum A[i] + B[i] into C[i] (row-major, row stride N) and never reads M or
// switch_op. Confirm whether anything still dispatches to it before relying on it.
__kernel void argmin_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int pos = row * N + col;

    C[pos] = A[pos] + B[pos];
}
|
@@ -0,0 +1,9 @@
|
|
1
|
+
|
2
|
+
% c_dtype = dtype_to_c_type(dtype)
// Element-wise arcsine: C[i] = asin(A[i]).
// Elements are addressed row-major with a row stride of N; M is accepted but not read.
__kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    C[idx] = asin(A[idx]);
}
|
@@ -0,0 +1,10 @@
|
|
1
|
+
% source_ctype = dtype_to_c_type(source_dt)
% target_ctype = dtype_to_c_type(target_dt)

// Element-wise type conversion: every A[i] is copied into C[i], converted from the source
// element type to the target element type by ordinary assignment conversion.
// Elements are addressed row-major with a row stride of N; M is accepted but not read.
__kernel void cast(const int M, const int N, __global const <%= source_ctype %> *A, __global <%= target_ctype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    const <%= source_ctype %> value = A[idx];
    C[idx] = value; // implicit conversion to the target type happens on this store
}
|
@@ -0,0 +1,8 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Element-wise ceiling: C[i] = ceil(A[i]).
// Elements are addressed row-major with a row stride of N; M is accepted but not read.
__kernel void ceil_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    C[idx] = ceil(A[idx]);
}
|
@@ -0,0 +1,6 @@
|
|
1
|
+
% # Renders the shared 'bool_operand.cl' partial once per comparison operator for the
% # operand types `a` and `b`; each rendered kernel set reports its result as 'short'.
% comparison_ops = %w[less less_equal equal not_equal greater greater_equal logical_and]
% a_dtype = dtype_to_c_type(a)
% b_dtype = dtype_to_c_type(b)
% comparison_ops.each do |fname|
%   op = operator_to_c(fname)
<%= render 'bool_operand.cl', a_dtype: a_dtype, b_dtype: b_dtype, op: op, fname: fname, dtype: "#{a}_#{b}", result_t: 'short' %>
% end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Element-wise cosine: C[i] = cos(A[i]).
// Elements are addressed row-major with a row stride of N; M is accepted but not read.
__kernel void cos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    C[idx] = cos(A[idx]);
}
|
@@ -0,0 +1,8 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Element-wise exponential: C[i] = exp(A[i]).
// Elements are addressed row-major with a row stride of N; M is accepted but not read.
__kernel void exp_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    C[idx] = exp(A[idx]);
}
|
@@ -0,0 +1,8 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Element-wise floor: C[i] = floor(A[i]).
// Elements are addressed row-major with a row stride of N; M is accepted but not read.
__kernel void floor_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    C[idx] = floor(A[idx]);
}
|
@@ -0,0 +1,48 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
% fname = 'floor_div'
% result_t = c_dtype
// Floored quotient x / y, i.e. rounded toward negative infinity.
// A plain (int) cast (and integer division) truncates toward zero instead, which is wrong
// whenever the operands have opposite signs and the division is inexact (-7 / 2 must give
// -4, not -3). The helper is type-agnostic so the same template serves integer and
// floating element types. Division by zero is not guarded, exactly as before.
<%= result_t %> <%= fname %>_scalar_<%= dtype %>(const <%= c_dtype %> x, const <%= c_dtype %> y) {
    const <%= c_dtype %> quotient = x / y;                  // exact for floats, truncated for ints
    <%= result_t %> floored = (<%= result_t %>)((int)quotient); // truncation toward zero

    if (floored > quotient) {
        // Only reachable for floating types: truncation moved a negative quotient upward.
        floored -= 1;
    } else if ((<%= c_dtype %>)0.5f == (<%= c_dtype %>)0 && floored * y != x && ((x < 0) != (y < 0))) {
        // Integer types only ((T)0.5f collapses to 0): the division was inexact and the
        // operands have opposite signs, so the truncated quotient is one too high.
        floored -= 1;
    }

    return floored;
}

// Element-wise floor division of two equally sized buffers: C[i] = floor(A[i] / B[i]).
// Elements are addressed row-major with a row stride of N; M and switch_op are not read.
__kernel void <%= fname%>_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= result_t %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0);
    const int globalCol = get_global_id(1);
    const int index = globalRow * N + globalCol;

    C[index] = <%= fname %>_scalar_<%= dtype %>(A[index], B[index]);
}

// Floor division of a buffer by the single value B[0].
// switch_op == 0 computes floor(A[i] / B[0]); any other value computes floor(B[0] / A[i]).
__kernel void <%=fname%>_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= result_t %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0);
    const int globalCol = get_global_id(1);
    const int index = globalRow * N + globalCol;

    if (switch_op == 0) {
        C[index] = <%= fname %>_scalar_<%= dtype %>(A[index], B[0]);
    } else {
        C[index] = <%= fname %>_scalar_<%= dtype %>(B[0], A[index]);
    }
}

// Floor division with B broadcast over A: B is addressed with row stride N2 and its row and
// column indices wrap modulo M2 and N2.
// switch_op == 0 computes floor(A / B); any other value computes floor(B / A).
__kernel void <%= fname%>_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= result_t %> *C) {
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0);
    const int globalCol = get_global_id(1);
    const int index = globalRow * N + globalCol;

    int b_m_index = globalRow;
    int b_n_index = globalCol;

    // Wrap the coordinates into B's extent.
    if (b_m_index >= M2) {
        b_m_index = b_m_index % M2;
    }

    if (b_n_index >= N2) {
        b_n_index = b_n_index % N2;
    }

    const int b_index = b_m_index * N2 + b_n_index;

    if (switch_op == 0) {
        C[index] = <%= fname %>_scalar_<%= dtype %>(A[index], B[b_index]);
    } else {
        C[index] = <%= fname %>_scalar_<%= dtype %>(B[b_index], A[index]);
    }
}
|
@@ -0,0 +1,32 @@
|
|
1
|
+
// Naive matrix multiply: each work-item produces one element of C by summing K products.
//   A_transpose - when non-zero, element (row, k) of A is read from A[M * k + row]
//                 instead of A[row * K + k]
//   B_transpose - when non-zero, element (k, col) of B is read from B[col * K + k]
//                 instead of B[k * N + col]
// C is written row-major with a row stride of N.
% c_dtype = dtype_to_c_type(dtype)
__kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
                                const int A_transpose,
                                const int B_transpose,
                                const __global <%= c_dtype %>* A,
                                const __global <%= c_dtype %>* B,
                                __global <%= c_dtype %>* C) {
    const int row = get_global_id(0); // dimension 0 selects the row of C
    const int col = get_global_id(1); // dimension 1 selects the column of C

    // The transpose flags do not change inside the loop, so express each operand as
    // base + k * step once instead of re-testing the flags on every iteration.
    const int a_base = A_transpose ? row : row * K;
    const int a_step = A_transpose ? M : 1;
    const int b_base = B_transpose ? col * K : col;
    const int b_step = B_transpose ? 1 : N;

    <%= c_dtype %> sum = 0.0f;
    for (int k = 0; k < K; k++) {
        sum += A[a_base + k * a_step] * B[b_base + k * b_step];
    }

    C[row * N + col] = sum;
}
|
@@ -0,0 +1,8 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Element-wise natural logarithm: C[i] = log(A[i]).
// Elements are addressed row-major with a row stride of N; M is accepted but not read.
__kernel void log_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    C[idx] = log(A[idx]);
}
|
@@ -0,0 +1,8 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Element-wise log(1 + x): C[i] = log1p(A[i]).
// Elements are addressed row-major with a row stride of N; M is accepted but not read.
__kernel void log1p_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    C[idx] = log1p(A[idx]);
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
// Row-wise log-softmax, naive version: one work-item handles one whole row of N values.
// For each row: C[k] = (A[k] - row_max) - log(sum_j exp(A[j] - row_max)).
// Subtracting the row maximum before exponentiating keeps exp() from overflowing.
% c_dtype = dtype_to_c_type(dtype)
__kernel void log_softmax_<%= dtype %>(const int N,
                                       const __global <%= c_dtype %>* A,
                                       __global <%= c_dtype %>* C) {
    const int row = get_global_id(0);
    const int base = row * N; // offset of the first element of this row

    // Pass 1: largest value in the row, seeded with the smallest value the type offers.
    <%= c_dtype %> row_max = <%= min_value_for(dtype) %>;
    for (int k = 0; k < N; k++) {
        const <%= c_dtype %> value = A[base + k];
        if (value > row_max) {
            row_max = value;
        }
    }

    // Pass 2: sum of the shifted exponentials.
    <%= c_dtype %> exp_sum = 0.0f;
    for (int k = 0; k < N; k++) {
        exp_sum += exp(A[base + k] - row_max);
    }

    // Pass 3: write the results; the log of the sum is the same for every element.
    const <%= c_dtype %> log_sum = log(exp_sum);
    for (int k = 0; k < N; k++) {
        C[base + k] = (A[base + k] - row_max) - log_sum;
    }
}
|
@@ -0,0 +1,46 @@
|
|
1
|
+
// Element-wise maximum of two equally sized buffers: C[i] = A[i] >= B[i] ? A[i] : B[i].
// Elements are addressed row-major with a row stride of N; M and switch_op are not read.
% c_dtype = dtype_to_c_type(dtype)
__kernel void max_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    const <%= c_dtype %> lhs = A[idx];
    const <%= c_dtype %> rhs = B[idx];

    C[idx] = lhs >= rhs ? lhs : rhs;
}
|
10
|
+
|
11
|
+
// Maximum of every element of A against the single value B[0].
// switch_op selects the operand order of the comparison: 0 tests A[i] >= B[0], anything
// else tests B[0] >= A[i]. When the test fails the second operand is stored.
// Elements are addressed row-major with a row stride of N; M is not read.
__kernel void max_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    const <%= c_dtype %> element = A[idx];
    const <%= c_dtype %> scalar = B[0];

    const <%= c_dtype %> first = switch_op == 0 ? element : scalar;
    const <%= c_dtype %> second = switch_op == 0 ? scalar : element;

    C[idx] = first >= second ? first : second;
}
|
23
|
+
|
24
|
+
// Maximum with B broadcast over A: B is addressed with row stride N2 and its row and column
// indices wrap modulo M2 and N2.
// switch_op selects the operand order of the comparison: 0 tests A >= B, anything else
// tests B >= A. When the test fails the second operand is stored.
// A and C are addressed row-major with a row stride of N; M is not read.
__kernel void max_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    // Wrap the coordinates into B's extent.
    const int b_row = row >= M2 ? row % M2 : row;
    const int b_col = col >= N2 ? col % N2 : col;

    const <%= c_dtype %> element = A[idx];
    const <%= c_dtype %> other = B[b_row * N2 + b_col];

    const <%= c_dtype %> first = switch_op == 0 ? element : other;
    const <%= c_dtype %> second = switch_op == 0 ? other : element;

    C[idx] = first >= second ? first : second;
}
|
@@ -0,0 +1,46 @@
|
|
1
|
+
// Element-wise minimum of two equally sized buffers: C[i] = A[i] <= B[i] ? A[i] : B[i].
// Elements are addressed row-major with a row stride of N; M and switch_op are not read.
% c_dtype = dtype_to_c_type(dtype)
__kernel void min_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    const <%= c_dtype %> lhs = A[idx];
    const <%= c_dtype %> rhs = B[idx];

    C[idx] = lhs <= rhs ? lhs : rhs;
}
|
10
|
+
|
11
|
+
// Minimum of every element of A against the single value B[0].
// switch_op selects the operand order of the comparison: 0 tests A[i] <= B[0], anything
// else tests B[0] <= A[i]. When the test fails the second operand is stored.
// Elements are addressed row-major with a row stride of N; M is not read.
__kernel void min_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    const <%= c_dtype %> element = A[idx];
    const <%= c_dtype %> scalar = B[0];

    const <%= c_dtype %> first = switch_op == 0 ? element : scalar;
    const <%= c_dtype %> second = switch_op == 0 ? scalar : element;

    C[idx] = first <= second ? first : second;
}
|
23
|
+
|
24
|
+
// Minimum with B broadcast over A: B is addressed with row stride N2 and its row and column
// indices wrap modulo M2 and N2.
// switch_op selects the operand order of the comparison: 0 tests A <= B, anything else
// tests B <= A. When the test fails the second operand is stored.
// A and C are addressed row-major with a row stride of N; M is not read.
__kernel void min_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    // Wrap the coordinates into B's extent.
    const int b_row = row >= M2 ? row % M2 : row;
    const int b_col = col >= N2 ? col % N2 : col;

    const <%= c_dtype %> element = A[idx];
    const <%= c_dtype %> other = B[b_row * N2 + b_col];

    const <%= c_dtype %> first = switch_op == 0 ? element : other;
    const <%= c_dtype %> second = switch_op == 0 ? other : element;

    C[idx] = first <= second ? first : second;
}
|
@@ -0,0 +1,8 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Element-wise negation: C[i] = -A[i].
// Elements are addressed row-major with a row stride of N; M is accepted but not read.
__kernel void negate_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    C[idx] = -A[idx];
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
% ctype = dtype_to_c_type(data_type)

// Copies element globalCol of A into C at a position computed from the template values
// `divisors`, `multipliers` and `axis`:
//   1. form a flat position from `index` and the work-item id,
//   2. split it into one coordinate per entry of `divisors`,
//   3. when axis > 0, move the leading coordinate to slot `axis`,
//   4. recombine the coordinates with `multipliers` to get the destination offset.
// N is accepted but not read.
__kernel void pack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
    const int globalCol = get_global_id(0);

    // Flat position of this element before it is split into coordinates.
    int ptr = index * <%= divisors[0] %> + globalCol;
    int index_map[<%= divisors.size %>] = { <%= (['0'] * divisors.size).join(', ') %> };

    // Compute the effective coordinates; the remainder carries over to the next divisor.
<% last_dim = divisors.size - 1 %>
<% divisors.each_with_index do |divisor, dim| %>
    index_map[<%= dim %>] = (int)floor(ptr / (float)<%= divisor %>);
<% if dim < last_dim %>
    ptr = ptr % <%= divisor %>;
<% end %>
<% end %>

<% if axis > 0 %>
    // Axis translation: shift coordinates 1..axis down one slot and put the old leading
    // coordinate into slot `axis`.
    const int leading = index_map[0];
<% axis.times do |slot| %>
    index_map[<%= slot %>] = index_map[<%= slot + 1 %>];
<% end %>
    index_map[<%= axis %>] = leading;
<% end %>

    C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
}
|
@@ -0,0 +1,46 @@
|
|
1
|
+
// Element-wise power of two equally sized buffers: C[i] = pow(A[i], B[i]).
// Both operands are converted to float before the call.
// Elements are addressed row-major with a row stride of N; M and switch_op are not read.
% c_dtype = dtype_to_c_type(dtype)
__kernel void pow_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    const float base = (float)A[idx];
    const float exponent = (float)B[idx];

    C[idx] = pow(base, exponent);
}
|
10
|
+
|
11
|
+
// Power of every element of A combined with the single value B[0].
// switch_op == 0 computes pow(A[i], B[0]); any other value computes pow(B[0], A[i]).
// Both operands are converted to float before the call.
// Elements are addressed row-major with a row stride of N; M is not read.
__kernel void pow_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    const float element = (float)A[idx];
    const float scalar = (float)B[0];

    const float base = switch_op == 0 ? element : scalar;
    const float exponent = switch_op == 0 ? scalar : element;

    C[idx] = pow(base, exponent);
}
|
23
|
+
|
24
|
+
// Power with B broadcast over A: B is addressed with row stride N2 and its row and column
// indices wrap modulo M2 and N2.
// switch_op == 0 computes pow(A, B); any other value computes pow(B, A).
// Both operands are converted to float before the call.
// A and C are addressed row-major with a row stride of N; M is not read.
__kernel void pow_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
    const int row = get_global_id(0); // dimension 0 selects the row
    const int col = get_global_id(1); // dimension 1 selects the column
    const int idx = row * N + col;

    // Wrap the coordinates into B's extent.
    const int b_row = row >= M2 ? row % M2 : row;
    const int b_col = col >= N2 ? col % N2 : col;

    const float element = (float)A[idx];
    const float other = (float)B[b_row * N2 + b_col];

    const float base = switch_op == 0 ? element : other;
    const float exponent = switch_op == 0 ? other : element;

    C[idx] = pow(base, exponent);
}
|