tensor_stream 0.8.1 → 0.8.5
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +8 -0
- data/README.md +12 -6
- data/lib/tensor_stream.rb +1 -0
- data/lib/tensor_stream/evaluator/base_evaluator.rb +1 -1
- data/lib/tensor_stream/evaluator/ruby/array_ops.rb +282 -0
- data/lib/tensor_stream/evaluator/ruby/images_ops.rb +61 -0
- data/lib/tensor_stream/evaluator/ruby/math_ops.rb +111 -0
- data/lib/tensor_stream/evaluator/ruby/nn_ops.rb +48 -9
- data/lib/tensor_stream/evaluator/ruby/random_ops.rb +51 -0
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -433
- data/lib/tensor_stream/images.rb +16 -0
- data/lib/tensor_stream/ops.rb +5 -1
- data/lib/tensor_stream/session.rb +15 -15
- data/lib/tensor_stream/tensor.rb +1 -1
- data/lib/tensor_stream/train/adadelta_optimizer.rb +52 -0
- data/lib/tensor_stream/train/adam_optimizer.rb +17 -2
- data/lib/tensor_stream/train/gradient_descent_optimizer.rb +7 -1
- data/lib/tensor_stream/trainer.rb +1 -0
- data/lib/tensor_stream/types.rb +4 -0
- data/lib/tensor_stream/utils.rb +4 -0
- data/lib/tensor_stream/variable_scope.rb +1 -0
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/linear_regression.rb +4 -1
- data/samples/mnist_data.rb +64 -0
- data/samples/nearest_neighbor.rb +1 -2
- data/samples/raw_neural_net_sample.rb +1 -1
- data/tensor_stream.gemspec +1 -0
- metadata +23 -57
- data/lib/tensor_stream/evaluator/opencl/kernels/_bool_operand.cl +0 -45
- data/lib/tensor_stream/evaluator/opencl/kernels/_operand.cl +0 -45
- data/lib/tensor_stream/evaluator/opencl/kernels/abs.cl +0 -20
- data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/add.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_adam.cl +0 -23
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +0 -16
- data/lib/tensor_stream/evaluator/opencl/kernels/argmax.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/argmin.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/cast.cl +0 -10
- data/lib/tensor_stream/evaluator/opencl/kernels/ceil.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +0 -6
- data/lib/tensor_stream/evaluator/opencl/kernels/cos.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/div.cl.erb +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/exp.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/floor.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/floor_div.cl +0 -48
- data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/gemm.cl +0 -32
- data/lib/tensor_stream/evaluator/opencl/kernels/log.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/log1p.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +0 -26
- data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/mod.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/mul.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/negate.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +0 -24
- data/lib/tensor_stream/evaluator/opencl/kernels/pow.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/reciprocal.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/round.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid_grad.cl +0 -55
- data/lib/tensor_stream/evaluator/opencl/kernels/sign.cl +0 -21
- data/lib/tensor_stream/evaluator/opencl/kernels/sin.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax.cl +0 -26
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +0 -32
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +0 -28
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_grad.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/sqrt.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/square.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/squared_difference.cl +0 -53
- data/lib/tensor_stream/evaluator/opencl/kernels/sub.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/tan.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/tanh.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/tanh_grad.cl +0 -7
- data/lib/tensor_stream/evaluator/opencl/kernels/where.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +0 -35
- data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +0 -5
- data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +0 -1230
- data/lib/tensor_stream/evaluator/opencl/opencl_template_helper.rb +0 -95
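Among the new files above, data/lib/tensor_stream/train/adadelta_optimizer.rb adds an Adadelta optimizer. A minimal usage sketch, assuming the class follows the existing GradientDescentOptimizer interface (constructor takes a learning rate, #minimize returns a training op); the actual constructor arguments are not shown in this diff:

```ruby
require "tensor_stream"

ts = TensorStream

# Toy least-squares fit of y = 2x, used only to exercise the optimizer.
x = ts.placeholder(:float32)
y = ts.placeholder(:float32)
w = ts.variable(0.0, dtype: :float32, name: "w")

loss = ts.reduce_mean(ts.square(w * x - y))

# Assumed interface, mirroring GradientDescentOptimizer.new(lr).minimize(loss).
train_op = TensorStream::Train::AdadeltaOptimizer.new(1.0).minimize(loss)

sess = ts.session
sess.run(ts.global_variables_initializer)
200.times { sess.run(train_op, feed_dict: { x => [1.0, 2.0, 3.0], y => [2.0, 4.0, 6.0] }) }
puts sess.run(w)
```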
data/lib/tensor_stream/evaluator/opencl/kernels/_bool_operand.cl
@@ -1,45 +0,0 @@
-// same dimension add floating point op
-__kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol] ? 1 : 0;
-}
-
-// 1D + Scalar floating point add op
-__kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0] ? 1 : 0;
-    } else {
-        C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-        b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-        b_n_index = b_n_index % N2;
-    }
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index] ? 1 : 0;
-    } else {
-        C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
-    }
-}
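The template above generates three kernel variants per comparison op: same-shape, tensor-vs-scalar, and a broadcast form that tiles the smaller operand by wrapping its indices. A Ruby sketch of that broadcast indexing (hypothetical helper name):

```ruby
# Hypothetical helper mirroring the kernel's broadcast math: output element
# (row, col) reads the smaller operand B at (row % m2, col % n2), where
# m2 x n2 is B's shape, so B is tiled across the full output.
def broadcast_read(b_flat, row, col, m2, n2)
  b_flat[(row % m2) * n2 + (col % n2)]
end

b = [1, 2]                      # a 1x2 tensor, flattened
broadcast_read(b, 3, 5, 1, 2)   # => 2 (row 3 wraps to row 0, col 5 to col 1)
```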
data/lib/tensor_stream/evaluator/opencl/kernels/_operand.cl
@@ -1,45 +0,0 @@
-// same dimension add floating point op
-__kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol];
-}
-
-// 1D + Scalar floating point add op
-__kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0];
-    } else {
-        C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol];
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-        b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-        b_n_index = b_n_index % N2;
-    }
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index];
-    } else {
-        C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol];
-    }
-}
data/lib/tensor_stream/evaluator/opencl/kernels/abs.cl
@@ -1,20 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-% if TensorStream::Ops::FLOATING_POINT_TYPES.include?(dtype)
-__kernel void abs_<%= dtype%>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
-}
-% else
-% %w[int int32].each do |dt|
-__kernel void abs_<%= dt %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
-}
-% end
-%end
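In these templates, lines beginning with % are ERB control lines evaluated in Ruby, and dtype_to_c_type comes from the removed opencl_template_helper.rb. Its mapping isn't shown in this diff; a plausible sketch of its shape:

```ruby
# Assumed shape only -- the real mapping lives in the removed
# opencl_template_helper.rb and is not part of this diff.
def dtype_to_c_type(dtype)
  case dtype.to_sym
  when :float64         then "double"
  when :float32, :float then "float"
  when :int32, :int     then "int"
  when :int16           then "short"
  when :boolean         then "short"
  else raise ArgumentError, "unknown dtype: #{dtype}"
  end
end
```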
data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void acos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = acos(A[globalRow * N + globalCol]);
-}
data/lib/tensor_stream/evaluator/opencl/kernels/apply_adam.cl
@@ -1,23 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-// same dimension add floating point op
-__kernel void apply_adam_<%= dtype %>(const int M, const int N,
-                         __global const <%= c_dtype %> *grad,
-                         __global const <%= c_dtype %> *learning_rate,
-                         __global const <%= c_dtype %> *beta1_power,
-                         __global const <%= c_dtype %> *beta2_power,
-                         __global const <%= c_dtype %> *beta1,
-                         __global const <%= c_dtype %> *beta2,
-                         __global const <%= c_dtype %> *epsilon,
-                         __global <%= c_dtype %> *momentum,
-                         __global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    const int index = globalRow * N + globalCol;
-
-    <%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
-
-    momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);
-    v[index] += (grad[index] * grad[index] - v[index]) * (1.0 - beta2[0]);
-    output[index] -= (momentum[index] * alpha) / ( sqrt(v[index]) + epsilon[0] );
-}
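The apply_adam kernel above fuses one Adam step per element. The same update written out in Ruby for readability (names follow the kernel's buffers; a sketch, not gem API):

```ruby
# One fused Adam step over flat arrays, mirroring the kernel line by line.
# m and v are the first/second moment accumulators ("momentum" and "v").
def adam_step!(output, m, v, grad, lr, beta1, beta2, beta1_power, beta2_power, epsilon)
  alpha = lr * Math.sqrt(1.0 - beta2_power) / (1.0 - beta1_power)
  grad.each_index do |i|
    m[i] += (grad[i] - m[i]) * (1.0 - beta1)       # m = beta1*m + (1-beta1)*g
    v[i] += (grad[i]**2 - v[i]) * (1.0 - beta2)    # v = beta2*v + (1-beta2)*g^2
    output[i] -= m[i] * alpha / (Math.sqrt(v[i]) + epsilon)
  end
end
```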
data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl
@@ -1,9 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-// same dimension add floating point op
-__kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] -= A[globalRow * N + globalCol] * B[0];
-}
data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl
@@ -1,16 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-// same dimension add floating point op
-__kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
-                                          __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    const int index = globalRow * N + globalCol;
-    <%= c_dtype %> acc_m = acc[index];
-    acc[index] = acc_m * momentum[0] + grad[index];
-    <% if nesterov %>
-    output[index] -= grad[index] * learning_rate[0] + acc_m * momentum[0] * learning_rate[0];
-    <% else %>
-    output[index] -= acc_m * learning_rate[0];
-    <% end %>
-}
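apply_momentum bakes the Nesterov choice in at template-render time via the nesterov flag. Both branches read the pre-update accumulator value (acc_m); in Ruby terms:

```ruby
# Sketch of the kernel's update over flat arrays. Note both branches use
# acc_m, the accumulator value from *before* this step's update.
def momentum_step!(output, acc, grad, lr, momentum, nesterov: false)
  grad.each_index do |i|
    acc_m  = acc[i]
    acc[i] = acc_m * momentum + grad[i]
    output[i] -= if nesterov
                   grad[i] * lr + acc_m * momentum * lr
                 else
                   acc_m * lr
                 end
  end
end
```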
data/lib/tensor_stream/evaluator/opencl/kernels/argmax.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void argmax_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
-}
data/lib/tensor_stream/evaluator/opencl/kernels/argmin.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void argmin_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
-}
data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl
@@ -1,9 +0,0 @@
-
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = asin(A[globalRow * N + globalCol]);
-}
data/lib/tensor_stream/evaluator/opencl/kernels/cast.cl
@@ -1,10 +0,0 @@
-% source_ctype = dtype_to_c_type(source_dt)
-% target_ctype = dtype_to_c_type(target_dt)
-
-__kernel void cast(const int M, const int N, __global const <%= source_ctype %> *A, __global <%= target_ctype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
-}
data/lib/tensor_stream/evaluator/opencl/kernels/ceil.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void ceil_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = ceil(A[globalRow * N + globalCol]);
-}
data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb
@@ -1,6 +0,0 @@
-% ["#{dtype}"].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
-% a_dtype = dtype_to_c_type(a)
-% b_dtype = dtype_to_c_type(b)
-% op = operator_to_c(fname)
-<%= render 'bool_operand.cl', a_dtype: a_dtype, b_dtype: b_dtype, op: op, fname: fname, dtype: "#{a}_#{b}", result_t: 'short' %>
-% end
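This template stamps out one comparison kernel per operator by rendering the shared bool_operand.cl partial. operator_to_c is defined in the removed opencl_template_helper.rb; a plausible sketch of what it maps:

```ruby
# Assumed mapping only -- the real implementation is in the removed
# opencl_template_helper.rb.
def operator_to_c(fname)
  { "less"          => "<",
    "less_equal"    => "<=",
    "equal"         => "==",
    "not_equal"     => "!=",
    "greater"       => ">",
    "greater_equal" => ">=",
    "logical_and"   => "&&" }.fetch(fname)
end
```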
data/lib/tensor_stream/evaluator/opencl/kernels/cos.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void cos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = cos(A[globalRow * N + globalCol]);
-}
data/lib/tensor_stream/evaluator/opencl/kernels/exp.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void exp_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = exp(A[globalRow * N + globalCol]);
-}
data/lib/tensor_stream/evaluator/opencl/kernels/floor.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void floor_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = floor(A[globalRow * N + globalCol]);
-}
data/lib/tensor_stream/evaluator/opencl/kernels/floor_div.cl
@@ -1,48 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-% fname = 'floor_div'
-% result_t = c_dtype
-// same dimension add floating point op
-__kernel void <%= fname%>_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[globalRow * N + globalCol]);
-}
-
-// 1D + Scalar floating point add op
-__kernel void <%=fname%>_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[0]);
-    } else {
-        C[globalRow * N + globalCol] = (int)(B[0] / A[globalRow * N + globalCol]);
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void <%= fname%>_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-        b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-        b_n_index = b_n_index % N2;
-    }
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[b_m_index * N2 + b_n_index]);
-    } else {
-        C[globalRow * N + globalCol] = (int)(B[b_m_index * N2 + b_n_index] / A[globalRow * N + globalCol]);
-    }
-}
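Note that this floor_div is implemented as a cast to int, which truncates toward zero rather than flooring; for negative quotients that differs from a true floor. In Ruby terms:

```ruby
# What the kernel computes vs. a true floor division.
kernel_style = (-7.0 / 2.0).to_i    # => -3 (truncates toward zero)
true_floor   = (-7.0 / 2.0).floor   # => -4 (rounds toward -infinity)
```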
data/lib/tensor_stream/evaluator/opencl/kernels/gemm.cl
@@ -1,32 +0,0 @@
-// First naive implementation
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
-                                const int A_transpose,
-                                const int B_transpose,
-                                const __global <%= c_dtype %>* A,
-                                const __global <%= c_dtype %>* B,
-                                __global <%= c_dtype %>* C) {
-
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    // Compute a single element (loop over K)
-    <%= c_dtype %> acc = 0.0f;
-    for (int k=0; k<K; k++) {
-        int a_index = globalRow*K + k;
-        int b_index = k*N + globalCol;
-
-        if (A_transpose) {
-            a_index = M*k + globalRow;
-        }
-
-        if (B_transpose) {
-            b_index = globalCol*K + k;
-        }
-        acc += A[a_index] * B[b_index];
-    }
-
-    // Store the result
-    C[globalRow*N + globalCol] = acc;
-}
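gemm is the naive one-work-item-per-output-element matrix multiply, with the transpose flags handled purely through index arithmetic. The same indexing in Ruby (a sketch over flat row-major arrays):

```ruby
# Naive GEMM over flat row-major arrays, mirroring the kernel's indexing:
# C[row, col] = sum over k of A[row, k] * B[k, col], with the transpose
# flags swapping each operand's stride pattern.
def gemm(a, b, m, n, k_dim, a_transpose: false, b_transpose: false)
  c = Array.new(m * n, 0.0)
  m.times do |row|
    n.times do |col|
      acc = 0.0
      k_dim.times do |k|
        a_index = a_transpose ? m * k + row : row * k_dim + k
        b_index = b_transpose ? col * k_dim + k : k * n + col
        acc += a[a_index] * b[b_index]
      end
      c[row * n + col] = acc
    end
  end
  c
end
```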
data/lib/tensor_stream/evaluator/opencl/kernels/log.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void log_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = log(A[globalRow * N + globalCol]);
-}
data/lib/tensor_stream/evaluator/opencl/kernels/log1p.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void log1p_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = log1p(A[globalRow * N + globalCol]);
-}
data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl
@@ -1,26 +0,0 @@
-// First naive implementation
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void log_softmax_<%= dtype %>(const int N,
-                                       const __global <%= c_dtype %>* A,
-                                       __global <%= c_dtype %>* C) {
-
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-
-    // Compute a single element (loop over K)
-    <%= c_dtype %> acc = 0.0f;
-    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
-
-    for (int k=0; k<N; k++) {
-        max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
-    }
-
-    for (int k=0; k<N; k++) {
-        acc += exp(A[globalRow*N + k] - max);
-    }
-
-    // Store the result
-    for (int k=0; k < N; k++) {
-        C[globalRow*N + k] = (A[globalRow*N + k] - max) - log(acc);
-    }
-}
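log_softmax uses the standard max-subtraction trick: shifting each row by its maximum keeps exp from overflowing without changing the result. Row-wise in Ruby:

```ruby
# Numerically stable row-wise log-softmax, matching the kernel:
# (x - max) - log(sum(exp(x - max))).
def log_softmax(row)
  max = row.max
  log_sum = Math.log(row.sum { |v| Math.exp(v - max) })
  row.map { |v| (v - max) - log_sum }
end

log_softmax([1.0, 2.0, 3.0])  # => [-2.4076..., -1.4076..., -0.4076...]
```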
data/lib/tensor_stream/evaluator/opencl/kernels/max.cl
@@ -1,46 +0,0 @@
-// same dimension add floating point op
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void max_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
-}
-
-// 1D + Scalar floating point add op
-__kernel void max_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
-    } else {
-        C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void max_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-        b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-        b_n_index = b_n_index % N2;
-    }
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
-    } else {
-        C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
-    }
-}