tensor_stream 0.8.1 → 0.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +8 -0
- data/README.md +12 -6
- data/lib/tensor_stream.rb +1 -0
- data/lib/tensor_stream/evaluator/base_evaluator.rb +1 -1
- data/lib/tensor_stream/evaluator/ruby/array_ops.rb +282 -0
- data/lib/tensor_stream/evaluator/ruby/images_ops.rb +61 -0
- data/lib/tensor_stream/evaluator/ruby/math_ops.rb +111 -0
- data/lib/tensor_stream/evaluator/ruby/nn_ops.rb +48 -9
- data/lib/tensor_stream/evaluator/ruby/random_ops.rb +51 -0
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -433
- data/lib/tensor_stream/images.rb +16 -0
- data/lib/tensor_stream/ops.rb +5 -1
- data/lib/tensor_stream/session.rb +15 -15
- data/lib/tensor_stream/tensor.rb +1 -1
- data/lib/tensor_stream/train/adadelta_optimizer.rb +52 -0 (new Adadelta optimizer; see the usage sketch after this list)
- data/lib/tensor_stream/train/adam_optimizer.rb +17 -2
- data/lib/tensor_stream/train/gradient_descent_optimizer.rb +7 -1
- data/lib/tensor_stream/trainer.rb +1 -0
- data/lib/tensor_stream/types.rb +4 -0
- data/lib/tensor_stream/utils.rb +4 -0
- data/lib/tensor_stream/variable_scope.rb +1 -0
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/linear_regression.rb +4 -1
- data/samples/mnist_data.rb +64 -0
- data/samples/nearest_neighbor.rb +1 -2
- data/samples/raw_neural_net_sample.rb +1 -1
- data/tensor_stream.gemspec +1 -0
- metadata +23 -57
- data/lib/tensor_stream/evaluator/opencl/kernels/_bool_operand.cl +0 -45
- data/lib/tensor_stream/evaluator/opencl/kernels/_operand.cl +0 -45
- data/lib/tensor_stream/evaluator/opencl/kernels/abs.cl +0 -20
- data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/add.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_adam.cl +0 -23
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +0 -16
- data/lib/tensor_stream/evaluator/opencl/kernels/argmax.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/argmin.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/cast.cl +0 -10
- data/lib/tensor_stream/evaluator/opencl/kernels/ceil.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +0 -6
- data/lib/tensor_stream/evaluator/opencl/kernels/cos.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/div.cl.erb +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/exp.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/floor.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/floor_div.cl +0 -48
- data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/gemm.cl +0 -32
- data/lib/tensor_stream/evaluator/opencl/kernels/log.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/log1p.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +0 -26
- data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/mod.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/mul.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/negate.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +0 -24
- data/lib/tensor_stream/evaluator/opencl/kernels/pow.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/reciprocal.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/round.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid_grad.cl +0 -55
- data/lib/tensor_stream/evaluator/opencl/kernels/sign.cl +0 -21
- data/lib/tensor_stream/evaluator/opencl/kernels/sin.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax.cl +0 -26
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +0 -32
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +0 -28
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_grad.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/sqrt.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/square.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/squared_difference.cl +0 -53
- data/lib/tensor_stream/evaluator/opencl/kernels/sub.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/tan.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/tanh.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/tanh_grad.cl +0 -7
- data/lib/tensor_stream/evaluator/opencl/kernels/where.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +0 -35
- data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +0 -5
- data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +0 -1230
- data/lib/tensor_stream/evaluator/opencl/opencl_template_helper.rb +0 -95
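Highlights of this range: a new Adadelta optimizer (`data/lib/tensor_stream/train/adadelta_optimizer.rb`), a small image API (`data/lib/tensor_stream/images.rb` plus `evaluator/ruby/images_ops.rb`), and the Ruby evaluator split into per-domain modules (`array_ops`, `math_ops`, `nn_ops`, `random_ops`), with the OpenCL kernels below removed. A minimal usage sketch of the new optimizer, assuming it follows the same `minimize` interface as the gem's existing `GradientDescentOptimizer` and a TensorFlow-style constructor; only the class name is confirmed by the file list above, the rest is not verified against the 0.8.5 source:

```ruby
require 'tensor_stream'

tf = TensorStream

# Toy problem: drive w toward 3 by minimizing (w - 3)^2.
w = tf.variable(0.0, name: 'w')
cost = tf.square(w - 3.0)

# Assumed constructor: AdadeltaOptimizer.new(learning_rate), mirroring
# TensorFlow's AdadeltaOptimizer; check the 0.8.5 source for exact arguments.
train = TensorStream::Train::AdadeltaOptimizer.new(1.0).minimize(cost)

tf.session do |sess|
  sess.run(tf.global_variables_initializer)
  200.times { sess.run(train) }
  puts sess.run(w) # should approach 3.0
end
```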

data/lib/tensor_stream/evaluator/opencl/kernels/_bool_operand.cl
@@ -1,45 +0,0 @@
-// same dimension add floating point op
-__kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol] ? 1 : 0;
-}
-
-// 1D + Scalar floating point add op
-__kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0] ? 1 : 0;
-    } else {
-      C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-      b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-      b_n_index = b_n_index % N2;
-    }
-
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index] ? 1 : 0;
-    } else {
-      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
-    }
-}

data/lib/tensor_stream/evaluator/opencl/kernels/_operand.cl
@@ -1,45 +0,0 @@
-// same dimension add floating point op
-__kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol];
-}
-
-// 1D + Scalar floating point add op
-__kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0];
-    } else {
-      C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol];
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-      b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-      b_n_index = b_n_index % N2;
-    }
-
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index];
-    } else {
-      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol];
-    }
-}
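Both operand templates above share the same limited broadcast scheme: when `B` is smaller (`M2 x N2`), its row and column indices are wrapped with a modulo so a scalar, row, or column operand is reused across the full `M x N` output. The index arithmetic in plain Ruby, for illustration only:

```ruby
# Broadcast index lookup as done in the kernels: element (row, col) of the
# M x N output reads B at (row % m2, col % n2) when B is only m2 x n2.
def broadcast_index(row, col, m2, n2)
  b_m = row >= m2 ? row % m2 : row
  b_n = col >= n2 ? col % n2 : col
  b_m * n2 + b_n # flat row-major index into B
end

# Adding a 1 x 3 row vector across the rows of a 2 x 3 matrix:
broadcast_index(1, 2, 1, 3) # => 2 (the row index wraps to 0)
```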

data/lib/tensor_stream/evaluator/opencl/kernels/abs.cl
@@ -1,20 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-% if TensorStream::Ops::FLOATING_POINT_TYPES.include?(dtype)
-__kernel void abs_<%= dtype%>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
-}
-% else
-% %w[int int32].each do |dt|
-__kernel void abs_<%= dt %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
-}
-% end
-%end

data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void acos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = acos(A[globalRow * N + globalCol]);
-}

data/lib/tensor_stream/evaluator/opencl/kernels/apply_adam.cl
@@ -1,23 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-// same dimension add floating point op
-__kernel void apply_adam_<%= dtype %>(const int M, const int N,
-                                      __global const <%= c_dtype %> *grad,
-                                      __global const <%= c_dtype %> *learning_rate,
-                                      __global const <%= c_dtype %> *beta1_power,
-                                      __global const <%= c_dtype %> *beta2_power,
-                                      __global const <%= c_dtype %> *beta1,
-                                      __global const <%= c_dtype %> *beta2,
-                                      __global const <%= c_dtype %> *epsilon,
-                                      __global <%= c_dtype %> *momentum,
-                                      __global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    const int index = globalRow * N + globalCol;
-
-    <%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
-
-    momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);
-    v[index] += (grad[index] * grad[index] - v[index]) * (1.0 - beta2[0]);
-    output[index] -= (momentum[index] * alpha) / ( sqrt(v[index]) + epsilon[0] );
-}
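`apply_adam` is a fused, element-wise Adam step: it updates the first-moment (`momentum`) and second-moment (`v`) running averages, then applies the bias-corrected update in one pass. The same arithmetic for a single element, transcribed into Ruby for illustration (names follow the kernel's buffers):

```ruby
# One element of the kernel's Adam update. beta1_power / beta2_power are
# beta1**t and beta2**t at step t, used for bias correction.
def adam_step(param, grad, m, v, lr, beta1, beta2, epsilon, beta1_power, beta2_power)
  alpha = lr * Math.sqrt(1.0 - beta2_power) / (1.0 - beta1_power)
  m += (grad - m) * (1.0 - beta1)        # momentum[index] update
  v += (grad * grad - v) * (1.0 - beta2) # v[index] update
  param -= m * alpha / (Math.sqrt(v) + epsilon)
  [param, m, v]
end
```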

data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl
@@ -1,9 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-// same dimension add floating point op
-__kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] -= A[globalRow * N + globalCol] * B[0];
-}

data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl
@@ -1,16 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-// same dimension add floating point op
-__kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
-                                          __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    const int index = globalRow * N + globalCol;
-    <%= c_dtype %> acc_m = acc[index];
-    acc[index] = acc_m * momentum[0] + grad[index];
-    <% if nesterov %>
-      output[index] -= grad[index] * learning_rate[0] + acc_m * momentum[0] * learning_rate[0];
-    <% else %>
-      output[index] -= acc_m * learning_rate[0];
-    <% end %>
-}
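The ERB flag `nesterov` selects between the two update rules at template-render time: plain momentum steps along the previous velocity, while the Nesterov branch also steps along the current gradient. Both branches read `acc_m`, the accumulator value from before this step, exactly as the kernel does. A scalar Ruby transcription, for illustration:

```ruby
# Scalar form of apply_momentum's two branches. acc is the running velocity.
def momentum_step(param, grad, acc, lr, momentum, nesterov: false)
  acc_m = acc                   # velocity before this step
  acc = acc_m * momentum + grad # updated velocity
  param -= if nesterov
    grad * lr + acc_m * momentum * lr
  else
    acc_m * lr # note: the pre-update accumulator, as in the kernel
  end
  [param, acc]
end
```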

data/lib/tensor_stream/evaluator/opencl/kernels/argmax.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void argmax_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
-}

data/lib/tensor_stream/evaluator/opencl/kernels/argmin.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void argmin_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
-}

data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl
@@ -1,9 +0,0 @@
-
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = asin(A[globalRow * N + globalCol]);
-}

data/lib/tensor_stream/evaluator/opencl/kernels/cast.cl
@@ -1,10 +0,0 @@
-% source_ctype = dtype_to_c_type(source_dt)
-% target_ctype = dtype_to_c_type(target_dt)
-
-__kernel void cast(const int M, const int N, __global const <%= source_ctype %> *A, __global <%= target_ctype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
-}

data/lib/tensor_stream/evaluator/opencl/kernels/ceil.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void ceil_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = ceil(A[globalRow * N + globalCol]);
-}

data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb
@@ -1,6 +0,0 @@
-% ["#{dtype}"].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
-% a_dtype = dtype_to_c_type(a)
-% b_dtype = dtype_to_c_type(b)
-% op = operator_to_c(fname)
-<%= render 'bool_operand.cl', a_dtype: a_dtype, b_dtype: b_dtype, op: op, fname: fname, dtype: "#{a}_#{b}", result_t: 'short' %>
-% end

data/lib/tensor_stream/evaluator/opencl/kernels/cos.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void cos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = cos(A[globalRow * N + globalCol]);
-}

data/lib/tensor_stream/evaluator/opencl/kernels/exp.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void exp_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = exp(A[globalRow * N + globalCol]);
-}

data/lib/tensor_stream/evaluator/opencl/kernels/floor.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void floor_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = floor(A[globalRow * N + globalCol]);
-}

data/lib/tensor_stream/evaluator/opencl/kernels/floor_div.cl
@@ -1,48 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-% fname = 'floor_div'
-% result_t = c_dtype
-// same dimension add floating point op
-__kernel void <%= fname%>_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[globalRow * N + globalCol]);
-}
-
-// 1D + Scalar floating point add op
-__kernel void <%=fname%>_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[0]);
-    } else {
-      C[globalRow * N + globalCol] = (int)(B[0] / A[globalRow * N + globalCol]);
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void <%= fname%>_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-      b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-      b_n_index = b_n_index % N2;
-    }
-
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[b_m_index * N2 + b_n_index]);
-    } else {
-      C[globalRow * N + globalCol] = (int)(B[b_m_index * N2 + b_n_index] / A[globalRow * N + globalCol]);
-    }
-}

data/lib/tensor_stream/evaluator/opencl/kernels/gemm.cl
@@ -1,32 +0,0 @@
-// First naive implementation
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
-                                const int A_transpose,
-                                const int B_transpose,
-                                const __global <%= c_dtype %>* A,
-                                const __global <%= c_dtype %>* B,
-                                __global <%= c_dtype %>* C) {
-
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    // Compute a single element (loop over K)
-    <%= c_dtype %> acc = 0.0f;
-    for (int k=0; k<K; k++) {
-        int a_index = globalRow*K + k;
-        int b_index = k*N + globalCol;
-
-        if (A_transpose) {
-            a_index = M*k + globalRow;
-        }
-
-        if (B_transpose) {
-            b_index = globalCol*K + k;
-        }
-        acc += A[a_index] * B[b_index];
-    }
-
-    // Store the result
-    C[globalRow*N + globalCol] = acc;
-}
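As its header comment says, this is the textbook one-work-item-per-output-element GEMM: each `(globalRow, globalCol)` pair accumulates over `K`, switching the index arithmetic when an operand is stored transposed. The same loop over flat row-major arrays in Ruby, for reference only:

```ruby
# Naive GEMM matching the kernel's indexing, over flat row-major arrays.
def gemm(a, b, m, n, k, a_transpose: false, b_transpose: false)
  c = Array.new(m * n, 0.0)
  m.times do |row|
    n.times do |col|
      acc = 0.0
      k.times do |i|
        a_index = a_transpose ? m * i + row : row * k + i
        b_index = b_transpose ? col * k + i : i * n + col
        acc += a[a_index] * b[b_index]
      end
      c[row * n + col] = acc
    end
  end
  c
end

# 2x2 identity times [[1, 2], [3, 4]]:
gemm([1, 0, 0, 1], [1, 2, 3, 4], 2, 2, 2) # => [1.0, 2.0, 3.0, 4.0]
```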

data/lib/tensor_stream/evaluator/opencl/kernels/log.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void log_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = log(A[globalRow * N + globalCol]);
-}

data/lib/tensor_stream/evaluator/opencl/kernels/log1p.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void log1p_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = log1p(A[globalRow * N + globalCol]);
-}

data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl
@@ -1,26 +0,0 @@
-// First naive implementation
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void log_softmax_<%= dtype %>(const int N,
-                                       const __global <%= c_dtype %>* A,
-                                       __global <%= c_dtype %>* C) {
-
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-
-    // Compute a single element (loop over K)
-    <%= c_dtype %> acc = 0.0f;
-    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
-
-    for (int k=0; k<N; k++) {
-        max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
-    }
-
-    for (int k=0; k<N; k++) {
-        acc += exp(A[globalRow*N + k] - max);
-    }
-
-    // Store the result
-    for (int k=0; k < N; k++) {
-        C[globalRow*N + k] = (A[globalRow*N + k] - max) - log(acc);
-    }
-}
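The kernel computes log-softmax per row with the standard max-subtraction trick: since `log_softmax(x_i) = (x_i - m) - log(sum_k exp(x_k - m))` holds for any constant `m`, choosing `m` as the row maximum keeps every `exp` argument non-positive and avoids overflow. Row-wise in Ruby, for illustration:

```ruby
# Numerically stable log-softmax for one row, mirroring the kernel's
# three passes: row max, shifted exp-sum, then the final subtraction.
def log_softmax(row)
  max = row.max
  acc = row.sum { |x| Math.exp(x - max) }
  row.map { |x| (x - max) - Math.log(acc) }
end

log_softmax([1.0, 2.0, 3.0]) # => [-2.407..., -1.407..., -0.407...]
```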

data/lib/tensor_stream/evaluator/opencl/kernels/max.cl
@@ -1,46 +0,0 @@
-// same dimension add floating point op
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void max_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
-}
-
-// 1D + Scalar floating point add op
-__kernel void max_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
-    } else {
-      C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void max_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-      b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-      b_n_index = b_n_index % N2;
-    }
-
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
-    } else {
-      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
-    }
-}