tensor_stream 0.8.1 → 0.8.5
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +8 -0
- data/README.md +12 -6
- data/lib/tensor_stream.rb +1 -0
- data/lib/tensor_stream/evaluator/base_evaluator.rb +1 -1
- data/lib/tensor_stream/evaluator/ruby/array_ops.rb +282 -0
- data/lib/tensor_stream/evaluator/ruby/images_ops.rb +61 -0
- data/lib/tensor_stream/evaluator/ruby/math_ops.rb +111 -0
- data/lib/tensor_stream/evaluator/ruby/nn_ops.rb +48 -9
- data/lib/tensor_stream/evaluator/ruby/random_ops.rb +51 -0
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -433
- data/lib/tensor_stream/images.rb +16 -0
- data/lib/tensor_stream/ops.rb +5 -1
- data/lib/tensor_stream/session.rb +15 -15
- data/lib/tensor_stream/tensor.rb +1 -1
- data/lib/tensor_stream/train/adadelta_optimizer.rb +52 -0
- data/lib/tensor_stream/train/adam_optimizer.rb +17 -2
- data/lib/tensor_stream/train/gradient_descent_optimizer.rb +7 -1
- data/lib/tensor_stream/trainer.rb +1 -0
- data/lib/tensor_stream/types.rb +4 -0
- data/lib/tensor_stream/utils.rb +4 -0
- data/lib/tensor_stream/variable_scope.rb +1 -0
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/linear_regression.rb +4 -1
- data/samples/mnist_data.rb +64 -0
- data/samples/nearest_neighbor.rb +1 -2
- data/samples/raw_neural_net_sample.rb +1 -1
- data/tensor_stream.gemspec +1 -0
- metadata +23 -57
- data/lib/tensor_stream/evaluator/opencl/kernels/_bool_operand.cl +0 -45
- data/lib/tensor_stream/evaluator/opencl/kernels/_operand.cl +0 -45
- data/lib/tensor_stream/evaluator/opencl/kernels/abs.cl +0 -20
- data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/add.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_adam.cl +0 -23
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +0 -16
- data/lib/tensor_stream/evaluator/opencl/kernels/argmax.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/argmin.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/cast.cl +0 -10
- data/lib/tensor_stream/evaluator/opencl/kernels/ceil.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +0 -6
- data/lib/tensor_stream/evaluator/opencl/kernels/cos.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/div.cl.erb +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/exp.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/floor.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/floor_div.cl +0 -48
- data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/gemm.cl +0 -32
- data/lib/tensor_stream/evaluator/opencl/kernels/log.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/log1p.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +0 -26
- data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/mod.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/mul.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/negate.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +0 -24
- data/lib/tensor_stream/evaluator/opencl/kernels/pow.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/reciprocal.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/round.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid_grad.cl +0 -55
- data/lib/tensor_stream/evaluator/opencl/kernels/sign.cl +0 -21
- data/lib/tensor_stream/evaluator/opencl/kernels/sin.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax.cl +0 -26
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +0 -32
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +0 -28
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_grad.cl +0 -46
- data/lib/tensor_stream/evaluator/opencl/kernels/sqrt.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/square.cl +0 -9
- data/lib/tensor_stream/evaluator/opencl/kernels/squared_difference.cl +0 -53
- data/lib/tensor_stream/evaluator/opencl/kernels/sub.cl +0 -3
- data/lib/tensor_stream/evaluator/opencl/kernels/tan.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/tanh.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/kernels/tanh_grad.cl +0 -7
- data/lib/tensor_stream/evaluator/opencl/kernels/where.cl +0 -8
- data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +0 -35
- data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +0 -5
- data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +0 -1230
- data/lib/tensor_stream/evaluator/opencl/opencl_template_helper.rb +0 -95
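The hunks below are the OpenCL evaluator sources removed in this release (every `opencl/` entry in the list above shows deletions only). The `.cl` files are ERB templates, not plain OpenCL: lines beginning with `%` and `<%= %>` tags are expanded in Ruby before the kernel source is handed to the OpenCL compiler, so a single template yields one specialized kernel per dtype.

A minimal sketch of that expansion, assuming a stub `dtype_to_c_type` in place of the helper that lived in `opencl_template_helper.rb` (the real mapping is not shown in this diff):

```ruby
require "erb"

# Hypothetical stand-in for the dtype_to_c_type helper from
# opencl_template_helper.rb; the mapping shown here is an assumption.
def dtype_to_c_type(dtype)
  { fp32: "float", fp64: "double", int32: "int" }.fetch(dtype)
end

template = <<~CL
  % c_dtype = dtype_to_c_type(dtype)
  __kernel void negate_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
      C[get_global_id(0)] = -A[get_global_id(0)];
  }
CL

dtype = :fp32
# trim_mode "%" enables the leading-% Ruby lines used by these templates
puts ERB.new(template, trim_mode: "%").result(binding)
```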

data/lib/tensor_stream/evaluator/opencl/kernels/min.cl
@@ -1,46 +0,0 @@
-// same dimension add floating point op
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void min_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
-}
-
-// 1D + Scalar floating point add op
-__kernel void min_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[0] ? A[globalRow * N + globalCol] : B[0];
-    } else {
-        C[globalRow * N + globalCol] = B[0] <= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void min_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-        b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-        b_n_index = b_n_index % N2;
-    }
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
-    } else {
-        C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
-    }
-}
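The `min_b_*` variant above is the broadcast form shared by the binary kernels in this set: when `B` (`M2 x N2`) is smaller than the output (`M x N`), each output coordinate is wrapped with a modulo so `B` tiles across the result. A plain-Ruby sketch of the same index arithmetic:

```ruby
# Broadcast the smaller matrix b across a by wrapping coordinates, as the
# *_b kernels do with b_m_index % M2 and b_n_index % N2.
def broadcast_min(a, b)
  m2 = b.size
  n2 = b.first.size
  a.each_with_index.map do |row, i|
    row.each_with_index.map do |x, j|
      y = b[i % m2][j % n2]
      x <= y ? x : y
    end
  end
end

a = [[1, 5], [7, 2]]
b = [[4, 3]] # 1x2 row, reused for every row of a
p broadcast_min(a, b) # => [[1, 3], [4, 2]]
```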

data/lib/tensor_stream/evaluator/opencl/kernels/negate.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void negate_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = -A[globalRow * N + globalCol];
-}

data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl
@@ -1,24 +0,0 @@
-% ctype = dtype_to_c_type(data_type)
-
-__kernel void pack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalCol = get_global_id(0); // Col ID of C (0..N)
-
-    int start = index * <%= divisors[0] %>;
-    int ptr = start + globalCol;
-    int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
-
-    // compute effective coordinates
-<% divisors.each_with_index do |div, index| %>
-    index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
-
-    // Apply axis translation if needed
-<% if axis > 0 %>
-    int first = index_map[0];
-<% axis.times do |i| %>
-    index_map[<%= i %>] = index_map[<%= (i + 1) %>];<% end %>
-    index_map[<%= axis %>] = first;
-<% end%>
-
-    C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
-}
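`pack.cl` bakes two constant tables into the kernel at template-render time: `divisors` decompose a flat input offset into per-axis coordinates, and `multipliers` (the output strides) recombine those coordinates into a destination offset, with an optional coordinate rotation when `axis > 0`. A sketch of the two conversions, assuming row-major strides for a concrete shape:

```ruby
# Flat offset -> per-axis coordinates, dividing by each axis divisor in turn
# (the kernel skips the final modulo, which is a no-op for the last axis).
def coords_from_flat(ptr, divisors)
  divisors.map do |div|
    coord = ptr / div
    ptr %= div
    coord
  end
end

# Coordinates -> flat offset via strides, the kernel's final C[...] index.
def flat_from_coords(coords, multipliers)
  coords.zip(multipliers).sum { |c, m| c * m }
end

divisors = multipliers = [4, 1]          # row-major shape (3, 4)
p coords_from_flat(6, divisors)          # => [1, 2]
p flat_from_coords([1, 2], multipliers)  # => 6
```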

data/lib/tensor_stream/evaluator/opencl/kernels/pow.cl
@@ -1,46 +0,0 @@
-// same dimension add floating point op
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void pow_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = pow((float)A[globalRow * N + globalCol], (float)B[globalRow * N + globalCol]);
-}
-
-// 1D + Scalar floating point add op
-__kernel void pow_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = pow((float)A[globalRow * N + globalCol], (float)B[0]);
-    } else {
-        C[globalRow * N + globalCol] = pow((float)B[0], (float)A[globalRow * N + globalCol]);
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void pow_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-        b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-        b_n_index = b_n_index % N2;
-    }
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = pow((float)A[globalRow * N + globalCol], (float)B[b_m_index * N2 + b_n_index]);
-    } else {
-        C[globalRow * N + globalCol] = pow((float)B[b_m_index * N2 + b_n_index], (float)A[globalRow * N + globalCol]);
-    }
-}
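The `switch_op` argument earns its keep here: the scalar variant always receives the full tensor in `A` and the scalar in `B[0]`, so for a non-commutative op like `pow` the flag evidently records which side the scalar originally sat on. A sketch of what the two branches of `pow_c` compute:

```ruby
# switch_op == 0: tensor ** scalar; switch_op == 1: scalar ** tensor,
# mirroring the two branches of pow_c above.
def scalar_pow(x, scalar, switch_op)
  switch_op.zero? ? x**scalar : scalar**x
end

p scalar_pow(2.0, 3.0, 0) # => 8.0
p scalar_pow(2.0, 3.0, 1) # => 9.0
```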

data/lib/tensor_stream/evaluator/opencl/kernels/reciprocal.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void reciprocal_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = 1 / A[globalRow * N + globalCol];
-}

data/lib/tensor_stream/evaluator/opencl/kernels/round.cl
@@ -1,8 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = round(A[globalRow * N + globalCol]);
-}

data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid.cl
@@ -1,9 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-
-__kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = 1.0f/(1.0f + exp(-A[globalRow * N + globalCol]));
-}

data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid_grad.cl
@@ -1,55 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-
-float sigmoid(<%= c_dtype %> x) {
-    return 1.0f/(1.0f + exp(-x));
-}
-
-float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
-    return g * sigmoid(x) * ( 1.0f - sigmoid(x));
-}
-
-// same dimension add floating point op
-__kernel void sigmoid_grad_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[globalRow * N + globalCol]);
-}
-
-// 1D + Scalar floating point add op
-__kernel void sigmoid_grad_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[0]);
-    } else {
-        C[globalRow * N + globalCol] = sigmoid_grad(B[0], A[globalRow * N + globalCol]);
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void sigmoid_grad_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-        b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-        b_n_index = b_n_index % N2;
-    }
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[b_m_index * N2 + b_n_index]);
-    } else {
-        C[globalRow * N + globalCol] = sigmoid_grad(B[b_m_index * N2 + b_n_index], A[globalRow * N + globalCol]);
-    }
-}
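The helper pair above encodes the chain rule for the logistic function: with s(x) = 1/(1 + e^-x), the local derivative is s(x)(1 - s(x)), scaled by the incoming gradient g. A direct Ruby transcription:

```ruby
def sigmoid(x)
  1.0 / (1.0 + Math.exp(-x))
end

# Chain rule: incoming gradient g times the local derivative s(x) * (1 - s(x)).
def sigmoid_grad(x, g)
  g * sigmoid(x) * (1.0 - sigmoid(x))
end

p sigmoid_grad(0.0, 1.0) # => 0.25, the maximum of the sigmoid derivative
```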

data/lib/tensor_stream/evaluator/opencl/kernels/sign.cl
@@ -1,21 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-
-__kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    <%= c_dtype %> value = A[globalRow * N + globalCol];
-% if floating_point?(dtype)
-    if (isnan(value) || value == 0.0f) {
-        C[globalRow * N + globalCol] = 0.0;
-    } else {
-        C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
-    }
-% else
-    if (value == 0) {
-        C[globalRow * N + globalCol] = 0;
-    } else {
-        C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
-    }
-% end
-}
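Note the dtype-dependent branch chosen at ERB time: for floating-point dtypes the kernel folds NaN into the zero case, so `sign` never propagates NaN, while integer dtypes only need the zero test. The float semantics in Ruby:

```ruby
# Float semantics of the kernel's floating-point branch: NaN and 0 map to 0.0.
def sign(value)
  return 0.0 if value.nan? || value.zero?
  value.negative? ? -1.0 : 1.0
end

p [Float::NAN, -3.5, 0.0, 2.0].map { |v| sign(v) } # => [0.0, -1.0, 0.0, 1.0]
```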

data/lib/tensor_stream/evaluator/opencl/kernels/sin.cl
@@ -1,9 +0,0 @@
-
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = sin(A[globalRow * N + globalCol]);
-}

data/lib/tensor_stream/evaluator/opencl/kernels/softmax.cl
@@ -1,26 +0,0 @@
-// First naive implementation
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void softmax_<%= dtype %>(const int N,
-                                   const __global <%= c_dtype %>* A,
-                                   __global <%= c_dtype %>* C) {
-
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-
-    // Compute a single element (loop over K)
-    <%= c_dtype %> acc = 0.0f;
-    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
-
-    for (int k=0; k<N; k++) {
-        max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
-    }
-
-    for (int k=0; k<N; k++) {
-        acc += exp(A[globalRow*N + k] - max);
-    }
-
-    // Store the result
-    for (int k=0; k < N; k++) {
-        C[globalRow*N + k] = exp(A[globalRow*N + k] - max) / acc;
-    }
-}
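This is the standard numerically stable softmax: subtracting the row max before exponentiating leaves the result unchanged (the e^-max factors cancel) but keeps `exp` from overflowing. The same row-wise computation in Ruby:

```ruby
# Row-wise softmax with the max-subtraction trick used by the kernel above.
def softmax(row)
  max = row.max
  exps = row.map { |a| Math.exp(a - max) }
  total = exps.sum
  exps.map { |e| e / total }
end

# Naive exp(1000.0) overflows to Infinity; the shifted form is exact.
p softmax([1000.0, 1001.0, 1002.0]).map { |v| v.round(4) }
# => roughly [0.09, 0.2447, 0.6652]
```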

data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl
@@ -1,32 +0,0 @@
-
-// First naive implementation
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void softmax_cross_<%= dtype %>(const int N,
-                                         const __global <%= c_dtype %>* A,
-                                         const __global <%= c_dtype %>* L,
-                                         __global <%= c_dtype %>* C, __global <%= c_dtype %>* P) {
-
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-
-    // Compute a single element (loop over K)
-    <%= c_dtype %> acc = 0.0f;
-    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
-
-    for (int k=0; k<N; k++) {
-        max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
-    }
-
-    for (int k=0; k<N; k++) {
-        acc += exp(A[globalRow*N + k] - max);
-    }
-
-    // Store the result
-    for (int k=0; k < N; k++) {
-        C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
-    }
-
-    for (int k=0; k < N; k++) {
-        P[globalRow*N + k] = (exp(A[globalRow*N + k] - max) / acc) - L[globalRow*N + k];
-    }
-}
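This kernel fuses the forward loss and the backward signal in one pass. With logits a, row max m, labels l, and p = softmax(a), the two buffers it writes per element are the per-class cross-entropy term and the classic softmax-cross-entropy gradient:

```latex
p_k = \frac{e^{a_k - m}}{\sum_j e^{a_j - m}}, \qquad
C_k = \Bigl(\log \sum_j e^{a_j - m} - (a_k - m)\Bigr)\, l_k = -\,l_k \log p_k, \qquad
P_k = p_k - l_k .
```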

data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl
@@ -1,28 +0,0 @@
-// First naive implementation
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void softmax_cross_grad_<%= dtype %>(const int N,
-                                              const __global <%= c_dtype %>* A,
-                                              const __global <%= c_dtype %>* L,
-                                              const __global <%= c_dtype %>* G,
-                                              __global <%= c_dtype %>* C) {
-
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-
-    // Compute a single element (loop over K)
-    <%= c_dtype %> acc = 0.0f;
-    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
-
-    for (int k=0; k<N; k++) {
-        max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
-    }
-
-    for (int k=0; k<N; k++) {
-        acc += exp(A[globalRow*N + k] - max);
-    }
-
-    // Store the result
-    for (int k=0; k < N; k++) {
-        C[globalRow*N + k] = ((exp(A[globalRow*N + k] - max)/acc) * G[globalRow*N + k] - L[globalRow*N + k]);
-    }
-}
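Here G is presumably the upstream gradient flowing into the loss; the kernel scales the softmax output by it before subtracting the labels:

```latex
C_k = \frac{e^{a_k - m}}{\sum_j e^{a_j - m}}\, G_k - L_k = p_k G_k - L_k ,
```

which reduces to the familiar p − l gradient when G ≡ 1.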

data/lib/tensor_stream/evaluator/opencl/kernels/softmax_grad.cl
@@ -1,46 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-__kernel void softmax_grad_<%= dtype %>(const int N,
-                                        const __global <%= c_dtype %>* A,
-                                        const __global <%= c_dtype %>* G,
-                                        __global <%= c_dtype %>* C) {
-
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-
-    // Compute a single element (loop over K)
-    float acc = 0.0f;
-    float max = FLT_MIN;
-    float row[<%= size %>];
-    float grads[<%= size %>][<%= size %>];
-
-    for (int k=0; k<N; k++) {
-        max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
-    }
-
-    for (int k=0; k<N; k++) {
-        acc += exp(A[globalRow*N + k] - max);
-    }
-
-    // Store the result
-    for (int k=0; k < N; k++) {
-        row[k] = exp(A[globalRow*N + k] - max) / acc;
-    }
-
-    for (int a=0; a < N; a++) {
-        for(int b=0; b < N; b++) {
-            if (a != b) {
-                grads[a][b] = -row[a] * row[b];
-            } else {
-                grads[a][b] = row[a] * (1.0f - row[a]);
-            }
-        }
-    }
-
-    for (int k=0; k < N; k++) {
-        float total_grad = 0.0f;
-        for (int a = 0; a < N; a++) {
-            total_grad += grads[a][k] * G[globalRow*N + a];
-        }
-        C[globalRow*N + k] = total_grad;
-    }
-}
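The `grads` table above is the full softmax Jacobian for one row, contracted against the upstream gradient G. In closed form:

```latex
\frac{\partial p_a}{\partial a_b} = p_a(\delta_{ab} - p_b), \qquad
C_k = \sum_a \frac{\partial p_a}{\partial a_k}\, G_a
    = p_k\Bigl(G_k - \sum_a p_a G_a\Bigr).
```

The right-hand identity shows the O(N²) `grads` buffer is avoidable: the contraction collapses to a single weighted sum per row.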

data/lib/tensor_stream/evaluator/opencl/kernels/sqrt.cl
@@ -1,9 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-
-__kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = sqrt(A[globalRow * N + globalCol]);
-}

data/lib/tensor_stream/evaluator/opencl/kernels/square.cl
@@ -1,9 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-
-__kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
-}

data/lib/tensor_stream/evaluator/opencl/kernels/squared_difference.cl
@@ -1,53 +0,0 @@
-% c_dtype = dtype_to_c_type(dtype)
-// same dimension add floating point op
-__kernel void squared_difference_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    <%= c_dtype %> x = A[globalRow * N + globalCol];
-    <%= c_dtype %> y = B[globalRow * N + globalCol];
-    C[globalRow * N + globalCol] = (x - y) * (x - y);
-}
-
-// 1D + Scalar floating point add op
-__kernel void squared_difference_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    <%= c_dtype %> x = A[globalRow * N + globalCol];
-    <%= c_dtype %> y = B[0];
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = (x - y) * (x - y);
-    } else {
-        C[globalRow * N + globalCol] = (y - x) * (y - x);
-    }
-}
-
-// 1D + Scalar floating point add op broadcast
-__kernel void squared_difference_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-
-    if ( b_m_index >= M2) {
-        b_m_index = b_m_index % M2;
-    };
-
-    if (b_n_index >= N2) {
-        b_n_index = b_n_index % N2;
-    }
-
-    <%= c_dtype %> x = A[globalRow * N + globalCol];
-    <%= c_dtype %> y = B[b_m_index * N2 + b_n_index];
-
-    if (switch_op == 0) {
-        C[globalRow * N + globalCol] = (x - y) * (x - y);
-    } else {
-        C[globalRow * N + globalCol] = (y - x) * (y - x);
-    }
-}