tensor_stream 0.8.1 → 0.8.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (84)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGELOG.md +8 -0
  4. data/README.md +12 -6
  5. data/lib/tensor_stream.rb +1 -0
  6. data/lib/tensor_stream/evaluator/base_evaluator.rb +1 -1
  7. data/lib/tensor_stream/evaluator/ruby/array_ops.rb +282 -0
  8. data/lib/tensor_stream/evaluator/ruby/images_ops.rb +61 -0
  9. data/lib/tensor_stream/evaluator/ruby/math_ops.rb +111 -0
  10. data/lib/tensor_stream/evaluator/ruby/nn_ops.rb +48 -9
  11. data/lib/tensor_stream/evaluator/ruby/random_ops.rb +51 -0
  12. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -433
  13. data/lib/tensor_stream/images.rb +16 -0
  14. data/lib/tensor_stream/ops.rb +5 -1
  15. data/lib/tensor_stream/session.rb +15 -15
  16. data/lib/tensor_stream/tensor.rb +1 -1
  17. data/lib/tensor_stream/train/adadelta_optimizer.rb +52 -0
  18. data/lib/tensor_stream/train/adam_optimizer.rb +17 -2
  19. data/lib/tensor_stream/train/gradient_descent_optimizer.rb +7 -1
  20. data/lib/tensor_stream/trainer.rb +1 -0
  21. data/lib/tensor_stream/types.rb +4 -0
  22. data/lib/tensor_stream/utils.rb +4 -0
  23. data/lib/tensor_stream/variable_scope.rb +1 -0
  24. data/lib/tensor_stream/version.rb +1 -1
  25. data/samples/linear_regression.rb +4 -1
  26. data/samples/mnist_data.rb +64 -0
  27. data/samples/nearest_neighbor.rb +1 -2
  28. data/samples/raw_neural_net_sample.rb +1 -1
  29. data/tensor_stream.gemspec +1 -0
  30. metadata +23 -57
  31. data/lib/tensor_stream/evaluator/opencl/kernels/_bool_operand.cl +0 -45
  32. data/lib/tensor_stream/evaluator/opencl/kernels/_operand.cl +0 -45
  33. data/lib/tensor_stream/evaluator/opencl/kernels/abs.cl +0 -20
  34. data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +0 -8
  35. data/lib/tensor_stream/evaluator/opencl/kernels/add.cl +0 -3
  36. data/lib/tensor_stream/evaluator/opencl/kernels/apply_adam.cl +0 -23
  37. data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +0 -9
  38. data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +0 -16
  39. data/lib/tensor_stream/evaluator/opencl/kernels/argmax.cl +0 -8
  40. data/lib/tensor_stream/evaluator/opencl/kernels/argmin.cl +0 -8
  41. data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +0 -9
  42. data/lib/tensor_stream/evaluator/opencl/kernels/cast.cl +0 -10
  43. data/lib/tensor_stream/evaluator/opencl/kernels/ceil.cl +0 -8
  44. data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +0 -6
  45. data/lib/tensor_stream/evaluator/opencl/kernels/cos.cl +0 -8
  46. data/lib/tensor_stream/evaluator/opencl/kernels/div.cl.erb +0 -3
  47. data/lib/tensor_stream/evaluator/opencl/kernels/exp.cl +0 -8
  48. data/lib/tensor_stream/evaluator/opencl/kernels/floor.cl +0 -8
  49. data/lib/tensor_stream/evaluator/opencl/kernels/floor_div.cl +0 -48
  50. data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +0 -3
  51. data/lib/tensor_stream/evaluator/opencl/kernels/gemm.cl +0 -32
  52. data/lib/tensor_stream/evaluator/opencl/kernels/log.cl +0 -8
  53. data/lib/tensor_stream/evaluator/opencl/kernels/log1p.cl +0 -8
  54. data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +0 -26
  55. data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +0 -46
  56. data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +0 -46
  57. data/lib/tensor_stream/evaluator/opencl/kernels/mod.cl +0 -3
  58. data/lib/tensor_stream/evaluator/opencl/kernels/mul.cl +0 -3
  59. data/lib/tensor_stream/evaluator/opencl/kernels/negate.cl +0 -8
  60. data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +0 -24
  61. data/lib/tensor_stream/evaluator/opencl/kernels/pow.cl +0 -46
  62. data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +0 -3
  63. data/lib/tensor_stream/evaluator/opencl/kernels/reciprocal.cl +0 -8
  64. data/lib/tensor_stream/evaluator/opencl/kernels/round.cl +0 -8
  65. data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid.cl +0 -9
  66. data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid_grad.cl +0 -55
  67. data/lib/tensor_stream/evaluator/opencl/kernels/sign.cl +0 -21
  68. data/lib/tensor_stream/evaluator/opencl/kernels/sin.cl +0 -9
  69. data/lib/tensor_stream/evaluator/opencl/kernels/softmax.cl +0 -26
  70. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +0 -32
  71. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +0 -28
  72. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_grad.cl +0 -46
  73. data/lib/tensor_stream/evaluator/opencl/kernels/sqrt.cl +0 -9
  74. data/lib/tensor_stream/evaluator/opencl/kernels/square.cl +0 -9
  75. data/lib/tensor_stream/evaluator/opencl/kernels/squared_difference.cl +0 -53
  76. data/lib/tensor_stream/evaluator/opencl/kernels/sub.cl +0 -3
  77. data/lib/tensor_stream/evaluator/opencl/kernels/tan.cl +0 -8
  78. data/lib/tensor_stream/evaluator/opencl/kernels/tanh.cl +0 -8
  79. data/lib/tensor_stream/evaluator/opencl/kernels/tanh_grad.cl +0 -7
  80. data/lib/tensor_stream/evaluator/opencl/kernels/where.cl +0 -8
  81. data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +0 -35
  82. data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +0 -5
  83. data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +0 -1230
  84. data/lib/tensor_stream/evaluator/opencl/opencl_template_helper.rb +0 -95
@@ -1,46 +0,0 @@
1
- // same dimension add floating point op
2
- % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void min_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
9
- }
10
-
11
- // 1D + Scalar floating point add op
12
- __kernel void min_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
- // Get the index of the current element to be processed
14
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
-
17
- if (switch_op == 0) {
18
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[0] ? A[globalRow * N + globalCol] : B[0];
19
- } else {
20
- C[globalRow * N + globalCol] = B[0] <= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
21
- }
22
- }
23
-
24
- // 1D + Scalar floating point add op broadcast
25
- __kernel void min_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
26
- // Get the index of the current element to be processed
27
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
28
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
29
-
30
- int b_m_index = globalRow;
31
- int b_n_index = globalCol;
32
-
33
- if ( b_m_index >= M2) {
34
- b_m_index = b_m_index % M2;
35
- };
36
-
37
- if (b_n_index >= N2) {
38
- b_n_index = b_n_index % N2;
39
- }
40
-
41
- if (switch_op == 0) {
42
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
43
- } else {
44
- C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
45
- }
46
- }
@@ -1,3 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- % op = operator_to_c('mod')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mod', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,3 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- % op = operator_to_c('mul')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mul', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void negate_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = -A[globalRow * N + globalCol];
8
- }
@@ -1,24 +0,0 @@
1
- % ctype = dtype_to_c_type(data_type)
2
-
3
- __kernel void pack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalCol = get_global_id(0); // Col ID of C (0..N)
6
-
7
- int start = index * <%= divisors[0] %>;
8
- int ptr = start + globalCol;
9
- int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
10
-
11
- // compute effective coordinates
12
- <% divisors.each_with_index do |div, index| %>
13
- index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
14
-
15
- // Apply axis translation if needed
16
- <% if axis > 0 %>
17
- int first = index_map[0];
18
- <% axis.times do |i| %>
19
- index_map[<%= i %>] = index_map[<%= (i + 1) %>];<% end %>
20
- index_map[<%= axis %>] = first;
21
- <% end%>
22
-
23
- C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
24
- }
@@ -1,46 +0,0 @@
1
- // same dimension add floating point op
2
- % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void pow_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = pow((float)A[globalRow * N + globalCol], (float)B[globalRow * N + globalCol]);
9
- }
10
-
11
- // 1D + Scalar floating point add op
12
- __kernel void pow_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
- // Get the index of the current element to be processed
14
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
-
17
- if (switch_op == 0) {
18
- C[globalRow * N + globalCol] = pow((float)A[globalRow * N + globalCol], (float)B[0]);
19
- } else {
20
- C[globalRow * N + globalCol] = pow((float)B[0], (float)A[globalRow * N + globalCol]);
21
- }
22
- }
23
-
24
- // 1D + Scalar floating point add op broadcast
25
- __kernel void pow_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
26
- // Get the index of the current element to be processed
27
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
28
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
29
-
30
- int b_m_index = globalRow;
31
- int b_n_index = globalCol;
32
-
33
- if ( b_m_index >= M2) {
34
- b_m_index = b_m_index % M2;
35
- };
36
-
37
- if (b_n_index >= N2) {
38
- b_n_index = b_n_index % N2;
39
- }
40
-
41
- if (switch_op == 0) {
42
- C[globalRow * N + globalCol] = pow((float)A[globalRow * N + globalCol], (float)B[b_m_index * N2 + b_n_index]);
43
- } else {
44
- C[globalRow * N + globalCol] = pow((float)B[b_m_index * N2 + b_n_index], (float)A[globalRow * N + globalCol]);
45
- }
46
- }
@@ -1,3 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- % op = operator_to_c('div')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'real_div', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void reciprocal_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = 1 / A[globalRow * N + globalCol];
8
- }
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = round(A[globalRow * N + globalCol]);
8
- }
@@ -1,9 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
-
3
- __kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = 1.0f/(1.0f + exp(-A[globalRow * N + globalCol]));
9
- }
@@ -1,55 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
-
3
- float sigmoid(<%= c_dtype %> x) {
4
- return 1.0f/(1.0f + exp(-x));
5
- }
6
-
7
- float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
8
- return g * sigmoid(x) * ( 1.0f - sigmoid(x));
9
- }
10
-
11
- // same dimension add floating point op
12
- __kernel void sigmoid_grad_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
- // Get the index of the current element to be processed
14
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
-
17
- C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[globalRow * N + globalCol]);
18
- }
19
-
20
- // 1D + Scalar floating point add op
21
- __kernel void sigmoid_grad_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
22
- // Get the index of the current element to be processed
23
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
24
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
25
-
26
- if (switch_op == 0) {
27
- C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[0]);
28
- } else {
29
- C[globalRow * N + globalCol] = sigmoid_grad(B[0], A[globalRow * N + globalCol]);
30
- }
31
- }
32
-
33
- // 1D + Scalar floating point add op broadcast
34
- __kernel void sigmoid_grad_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
35
- // Get the index of the current element to be processed
36
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
37
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
38
-
39
- int b_m_index = globalRow;
40
- int b_n_index = globalCol;
41
-
42
- if ( b_m_index >= M2) {
43
- b_m_index = b_m_index % M2;
44
- };
45
-
46
- if (b_n_index >= N2) {
47
- b_n_index = b_n_index % N2;
48
- }
49
-
50
- if (switch_op == 0) {
51
- C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[b_m_index * N2 + b_n_index]);
52
- } else {
53
- C[globalRow * N + globalCol] = sigmoid_grad(B[b_m_index * N2 + b_n_index], A[globalRow * N + globalCol]);
54
- }
55
- }
@@ -1,21 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
-
3
- __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
- <%= c_dtype %> value = A[globalRow * N + globalCol];
8
- % if floating_point?(dtype)
9
- if (isnan(value) || value == 0.0f) {
10
- C[globalRow * N + globalCol] = 0.0;
11
- } else {
12
- C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
13
- }
14
- % else
15
- if (value == 0) {
16
- C[globalRow * N + globalCol] = 0;
17
- } else {
18
- C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
19
- }
20
- % end
21
- }
@@ -1,9 +0,0 @@
1
-
2
- % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = sin(A[globalRow * N + globalCol]);
9
- }
@@ -1,26 +0,0 @@
1
- // First naive implementation
2
- % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void softmax_<%= dtype %>(const int N,
4
- const __global <%= c_dtype %>* A,
5
- __global <%= c_dtype %>* C) {
6
-
7
- // Get the index of the current element to be processed
8
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
9
-
10
- // Compute a single element (loop over K)
11
- <%= c_dtype %> acc = 0.0f;
12
- <%= c_dtype %> max = <%= min_value_for(dtype) %>;
13
-
14
- for (int k=0; k<N; k++) {
15
- max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
16
- }
17
-
18
- for (int k=0; k<N; k++) {
19
- acc += exp(A[globalRow*N + k] - max);
20
- }
21
-
22
- // Store the result
23
- for (int k=0; k < N; k++) {
24
- C[globalRow*N + k] = exp(A[globalRow*N + k] - max) / acc;
25
- }
26
- }
@@ -1,32 +0,0 @@
1
-
2
- // First naive implementation
3
- % c_dtype = dtype_to_c_type(dtype)
4
- __kernel void softmax_cross_<%= dtype %>(const int N,
5
- const __global <%= c_dtype %>* A,
6
- const __global <%= c_dtype %>* L,
7
- __global <%= c_dtype %>* C, __global <%= c_dtype %>* P) {
8
-
9
- // Get the index of the current element to be processed
10
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
11
-
12
- // Compute a single element (loop over K)
13
- <%= c_dtype %> acc = 0.0f;
14
- <%= c_dtype %> max = <%= min_value_for(dtype) %>;
15
-
16
- for (int k=0; k<N; k++) {
17
- max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
18
- }
19
-
20
- for (int k=0; k<N; k++) {
21
- acc += exp(A[globalRow*N + k] - max);
22
- }
23
-
24
- // Store the result
25
- for (int k=0; k < N; k++) {
26
- C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
27
- }
28
-
29
- for (int k=0; k < N; k++) {
30
- P[globalRow*N + k] = (exp(A[globalRow*N + k] - max) / acc) - L[globalRow*N + k];
31
- }
32
- }
@@ -1,28 +0,0 @@
1
- // First naive implementation
2
- % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void softmax_cross_grad_<%= dtype %>(const int N,
4
- const __global <%= c_dtype %>* A,
5
- const __global <%= c_dtype %>* L,
6
- const __global <%= c_dtype %>* G,
7
- __global <%= c_dtype %>* C) {
8
-
9
- // Get the index of the current element to be processed
10
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
11
-
12
- // Compute a single element (loop over K)
13
- <%= c_dtype %> acc = 0.0f;
14
- <%= c_dtype %> max = <%= min_value_for(dtype) %>;
15
-
16
- for (int k=0; k<N; k++) {
17
- max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
18
- }
19
-
20
- for (int k=0; k<N; k++) {
21
- acc += exp(A[globalRow*N + k] - max);
22
- }
23
-
24
- // Store the result
25
- for (int k=0; k < N; k++) {
26
- C[globalRow*N + k] = ((exp(A[globalRow*N + k] - max)/acc) * G[globalRow*N + k] - L[globalRow*N + k]);
27
- }
28
- }
@@ -1,46 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void softmax_grad_<%= dtype %>(const int N,
3
- const __global <%= c_dtype %>* A,
4
- const __global <%= c_dtype %>* G,
5
- __global <%= c_dtype %>* C) {
6
-
7
- // Get the index of the current element to be processed
8
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
9
-
10
- // Compute a single element (loop over K)
11
- float acc = 0.0f;
12
- float max = FLT_MIN;
13
- float row[<%= size %>];
14
- float grads[<%= size %>][<%= size %>];
15
-
16
- for (int k=0; k<N; k++) {
17
- max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
18
- }
19
-
20
- for (int k=0; k<N; k++) {
21
- acc += exp(A[globalRow*N + k] - max);
22
- }
23
-
24
- // Store the result
25
- for (int k=0; k < N; k++) {
26
- row[k] = exp(A[globalRow*N + k] - max) / acc;
27
- }
28
-
29
- for (int a=0; a < N; a++) {
30
- for(int b=0; b < N; b++) {
31
- if (a != b) {
32
- grads[a][b] = -row[a] * row[b];
33
- } else {
34
- grads[a][b] = row[a] * (1.0f - row[a]);
35
- }
36
- }
37
- }
38
-
39
- for (int k=0; k < N; k++) {
40
- float total_grad = 0.0f;
41
- for (int a = 0; a < N; a++) {
42
- total_grad += grads[a][k] * G[globalRow*N + a];
43
- }
44
- C[globalRow*N + k] = total_grad;
45
- }
46
- }
@@ -1,9 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
-
3
- __kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = sqrt(A[globalRow * N + globalCol]);
9
- }
@@ -1,9 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
-
3
- __kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
9
- }
@@ -1,53 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- // same dimension add floating point op
3
- __kernel void squared_difference_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
- <%= c_dtype %> x = A[globalRow * N + globalCol];
8
- <%= c_dtype %> y = B[globalRow * N + globalCol];
9
- C[globalRow * N + globalCol] = (x - y) * (x - y);
10
- }
11
-
12
- // 1D + Scalar floating point add op
13
- __kernel void squared_difference_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
14
- // Get the index of the current element to be processed
15
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
16
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
17
-
18
- <%= c_dtype %> x = A[globalRow * N + globalCol];
19
- <%= c_dtype %> y = B[0];
20
-
21
- if (switch_op == 0) {
22
- C[globalRow * N + globalCol] = (x - y) * (x - y);
23
- } else {
24
- C[globalRow * N + globalCol] = (y - x) * (y - x);
25
- }
26
- }
27
-
28
- // 1D + Scalar floating point add op broadcast
29
- __kernel void squared_difference_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
30
- // Get the index of the current element to be processed
31
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
32
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
33
-
34
- int b_m_index = globalRow;
35
- int b_n_index = globalCol;
36
-
37
- if ( b_m_index >= M2) {
38
- b_m_index = b_m_index % M2;
39
- };
40
-
41
- if (b_n_index >= N2) {
42
- b_n_index = b_n_index % N2;
43
- }
44
-
45
- <%= c_dtype %> x = A[globalRow * N + globalCol];
46
- <%= c_dtype %> y = B[b_m_index * N2 + b_n_index];
47
-
48
- if (switch_op == 0) {
49
- C[globalRow * N + globalCol] = (x - y) * (x - y);
50
- } else {
51
- C[globalRow * N + globalCol] = (y - x) * (y - x);
52
- }
53
- }