tensor_stream 0.8.1 → 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGELOG.md +8 -0
  4. data/README.md +12 -6
  5. data/lib/tensor_stream.rb +1 -0
  6. data/lib/tensor_stream/evaluator/base_evaluator.rb +1 -1
  7. data/lib/tensor_stream/evaluator/ruby/array_ops.rb +282 -0
  8. data/lib/tensor_stream/evaluator/ruby/images_ops.rb +61 -0
  9. data/lib/tensor_stream/evaluator/ruby/math_ops.rb +111 -0
  10. data/lib/tensor_stream/evaluator/ruby/nn_ops.rb +48 -9
  11. data/lib/tensor_stream/evaluator/ruby/random_ops.rb +51 -0
  12. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -433
  13. data/lib/tensor_stream/images.rb +16 -0
  14. data/lib/tensor_stream/ops.rb +5 -1
  15. data/lib/tensor_stream/session.rb +15 -15
  16. data/lib/tensor_stream/tensor.rb +1 -1
  17. data/lib/tensor_stream/train/adadelta_optimizer.rb +52 -0
  18. data/lib/tensor_stream/train/adam_optimizer.rb +17 -2
  19. data/lib/tensor_stream/train/gradient_descent_optimizer.rb +7 -1
  20. data/lib/tensor_stream/trainer.rb +1 -0
  21. data/lib/tensor_stream/types.rb +4 -0
  22. data/lib/tensor_stream/utils.rb +4 -0
  23. data/lib/tensor_stream/variable_scope.rb +1 -0
  24. data/lib/tensor_stream/version.rb +1 -1
  25. data/samples/linear_regression.rb +4 -1
  26. data/samples/mnist_data.rb +64 -0
  27. data/samples/nearest_neighbor.rb +1 -2
  28. data/samples/raw_neural_net_sample.rb +1 -1
  29. data/tensor_stream.gemspec +1 -0
  30. metadata +23 -57
  31. data/lib/tensor_stream/evaluator/opencl/kernels/_bool_operand.cl +0 -45
  32. data/lib/tensor_stream/evaluator/opencl/kernels/_operand.cl +0 -45
  33. data/lib/tensor_stream/evaluator/opencl/kernels/abs.cl +0 -20
  34. data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +0 -8
  35. data/lib/tensor_stream/evaluator/opencl/kernels/add.cl +0 -3
  36. data/lib/tensor_stream/evaluator/opencl/kernels/apply_adam.cl +0 -23
  37. data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +0 -9
  38. data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +0 -16
  39. data/lib/tensor_stream/evaluator/opencl/kernels/argmax.cl +0 -8
  40. data/lib/tensor_stream/evaluator/opencl/kernels/argmin.cl +0 -8
  41. data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +0 -9
  42. data/lib/tensor_stream/evaluator/opencl/kernels/cast.cl +0 -10
  43. data/lib/tensor_stream/evaluator/opencl/kernels/ceil.cl +0 -8
  44. data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +0 -6
  45. data/lib/tensor_stream/evaluator/opencl/kernels/cos.cl +0 -8
  46. data/lib/tensor_stream/evaluator/opencl/kernels/div.cl.erb +0 -3
  47. data/lib/tensor_stream/evaluator/opencl/kernels/exp.cl +0 -8
  48. data/lib/tensor_stream/evaluator/opencl/kernels/floor.cl +0 -8
  49. data/lib/tensor_stream/evaluator/opencl/kernels/floor_div.cl +0 -48
  50. data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +0 -3
  51. data/lib/tensor_stream/evaluator/opencl/kernels/gemm.cl +0 -32
  52. data/lib/tensor_stream/evaluator/opencl/kernels/log.cl +0 -8
  53. data/lib/tensor_stream/evaluator/opencl/kernels/log1p.cl +0 -8
  54. data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +0 -26
  55. data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +0 -46
  56. data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +0 -46
  57. data/lib/tensor_stream/evaluator/opencl/kernels/mod.cl +0 -3
  58. data/lib/tensor_stream/evaluator/opencl/kernels/mul.cl +0 -3
  59. data/lib/tensor_stream/evaluator/opencl/kernels/negate.cl +0 -8
  60. data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +0 -24
  61. data/lib/tensor_stream/evaluator/opencl/kernels/pow.cl +0 -46
  62. data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +0 -3
  63. data/lib/tensor_stream/evaluator/opencl/kernels/reciprocal.cl +0 -8
  64. data/lib/tensor_stream/evaluator/opencl/kernels/round.cl +0 -8
  65. data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid.cl +0 -9
  66. data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid_grad.cl +0 -55
  67. data/lib/tensor_stream/evaluator/opencl/kernels/sign.cl +0 -21
  68. data/lib/tensor_stream/evaluator/opencl/kernels/sin.cl +0 -9
  69. data/lib/tensor_stream/evaluator/opencl/kernels/softmax.cl +0 -26
  70. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +0 -32
  71. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +0 -28
  72. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_grad.cl +0 -46
  73. data/lib/tensor_stream/evaluator/opencl/kernels/sqrt.cl +0 -9
  74. data/lib/tensor_stream/evaluator/opencl/kernels/square.cl +0 -9
  75. data/lib/tensor_stream/evaluator/opencl/kernels/squared_difference.cl +0 -53
  76. data/lib/tensor_stream/evaluator/opencl/kernels/sub.cl +0 -3
  77. data/lib/tensor_stream/evaluator/opencl/kernels/tan.cl +0 -8
  78. data/lib/tensor_stream/evaluator/opencl/kernels/tanh.cl +0 -8
  79. data/lib/tensor_stream/evaluator/opencl/kernels/tanh_grad.cl +0 -7
  80. data/lib/tensor_stream/evaluator/opencl/kernels/where.cl +0 -8
  81. data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +0 -35
  82. data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +0 -5
  83. data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +0 -1230
  84. data/lib/tensor_stream/evaluator/opencl/opencl_template_helper.rb +0 -95
@@ -1,46 +0,0 @@
1
- // same dimension add floating point op
2
- % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void min_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
9
- }
10
-
11
- // 1D + Scalar floating point add op
12
- __kernel void min_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
- // Get the index of the current element to be processed
14
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
-
17
- if (switch_op == 0) {
18
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[0] ? A[globalRow * N + globalCol] : B[0];
19
- } else {
20
- C[globalRow * N + globalCol] = B[0] <= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
21
- }
22
- }
23
-
24
- // 1D + Scalar floating point add op broadcast
25
- __kernel void min_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
26
- // Get the index of the current element to be processed
27
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
28
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
29
-
30
- int b_m_index = globalRow;
31
- int b_n_index = globalCol;
32
-
33
- if ( b_m_index >= M2) {
34
- b_m_index = b_m_index % M2;
35
- };
36
-
37
- if (b_n_index >= N2) {
38
- b_n_index = b_n_index % N2;
39
- }
40
-
41
- if (switch_op == 0) {
42
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
43
- } else {
44
- C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
45
- }
46
- }
@@ -1,3 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- % op = operator_to_c('mod')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mod', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,3 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- % op = operator_to_c('mul')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mul', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void negate_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = -A[globalRow * N + globalCol];
8
- }
@@ -1,24 +0,0 @@
1
- % ctype = dtype_to_c_type(data_type)
2
-
3
- __kernel void pack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalCol = get_global_id(0); // Col ID of C (0..N)
6
-
7
- int start = index * <%= divisors[0] %>;
8
- int ptr = start + globalCol;
9
- int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
10
-
11
- // compute effective coordinates
12
- <% divisors.each_with_index do |div, index| %>
13
- index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
14
-
15
- // Apply axis translation if needed
16
- <% if axis > 0 %>
17
- int first = index_map[0];
18
- <% axis.times do |i| %>
19
- index_map[<%= i %>] = index_map[<%= (i + 1) %>];<% end %>
20
- index_map[<%= axis %>] = first;
21
- <% end%>
22
-
23
- C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
24
- }
@@ -1,46 +0,0 @@
1
- // same dimension add floating point op
2
- % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void pow_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = pow((float)A[globalRow * N + globalCol], (float)B[globalRow * N + globalCol]);
9
- }
10
-
11
- // 1D + Scalar floating point add op
12
- __kernel void pow_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
- // Get the index of the current element to be processed
14
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
-
17
- if (switch_op == 0) {
18
- C[globalRow * N + globalCol] = pow((float)A[globalRow * N + globalCol], (float)B[0]);
19
- } else {
20
- C[globalRow * N + globalCol] = pow((float)B[0], (float)A[globalRow * N + globalCol]);
21
- }
22
- }
23
-
24
- // 1D + Scalar floating point add op broadcast
25
- __kernel void pow_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
26
- // Get the index of the current element to be processed
27
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
28
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
29
-
30
- int b_m_index = globalRow;
31
- int b_n_index = globalCol;
32
-
33
- if ( b_m_index >= M2) {
34
- b_m_index = b_m_index % M2;
35
- };
36
-
37
- if (b_n_index >= N2) {
38
- b_n_index = b_n_index % N2;
39
- }
40
-
41
- if (switch_op == 0) {
42
- C[globalRow * N + globalCol] = pow((float)A[globalRow * N + globalCol], (float)B[b_m_index * N2 + b_n_index]);
43
- } else {
44
- C[globalRow * N + globalCol] = pow((float)B[b_m_index * N2 + b_n_index], (float)A[globalRow * N + globalCol]);
45
- }
46
- }
@@ -1,3 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- % op = operator_to_c('div')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'real_div', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void reciprocal_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = 1 / A[globalRow * N + globalCol];
8
- }
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = round(A[globalRow * N + globalCol]);
8
- }
@@ -1,9 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
-
3
- __kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = 1.0f/(1.0f + exp(-A[globalRow * N + globalCol]));
9
- }
@@ -1,55 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
-
3
- float sigmoid(<%= c_dtype %> x) {
4
- return 1.0f/(1.0f + exp(-x));
5
- }
6
-
7
- float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
8
- return g * sigmoid(x) * ( 1.0f - sigmoid(x));
9
- }
10
-
11
- // same dimension add floating point op
12
- __kernel void sigmoid_grad_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
- // Get the index of the current element to be processed
14
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
-
17
- C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[globalRow * N + globalCol]);
18
- }
19
-
20
- // 1D + Scalar floating point add op
21
- __kernel void sigmoid_grad_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
22
- // Get the index of the current element to be processed
23
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
24
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
25
-
26
- if (switch_op == 0) {
27
- C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[0]);
28
- } else {
29
- C[globalRow * N + globalCol] = sigmoid_grad(B[0], A[globalRow * N + globalCol]);
30
- }
31
- }
32
-
33
- // 1D + Scalar floating point add op broadcast
34
- __kernel void sigmoid_grad_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
35
- // Get the index of the current element to be processed
36
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
37
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
38
-
39
- int b_m_index = globalRow;
40
- int b_n_index = globalCol;
41
-
42
- if ( b_m_index >= M2) {
43
- b_m_index = b_m_index % M2;
44
- };
45
-
46
- if (b_n_index >= N2) {
47
- b_n_index = b_n_index % N2;
48
- }
49
-
50
- if (switch_op == 0) {
51
- C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[b_m_index * N2 + b_n_index]);
52
- } else {
53
- C[globalRow * N + globalCol] = sigmoid_grad(B[b_m_index * N2 + b_n_index], A[globalRow * N + globalCol]);
54
- }
55
- }
@@ -1,21 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
-
3
- __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
- <%= c_dtype %> value = A[globalRow * N + globalCol];
8
- % if floating_point?(dtype)
9
- if (isnan(value) || value == 0.0f) {
10
- C[globalRow * N + globalCol] = 0.0;
11
- } else {
12
- C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
13
- }
14
- % else
15
- if (value == 0) {
16
- C[globalRow * N + globalCol] = 0;
17
- } else {
18
- C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
19
- }
20
- % end
21
- }
@@ -1,9 +0,0 @@
1
-
2
- % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = sin(A[globalRow * N + globalCol]);
9
- }
@@ -1,26 +0,0 @@
1
- // First naive implementation
2
- % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void softmax_<%= dtype %>(const int N,
4
- const __global <%= c_dtype %>* A,
5
- __global <%= c_dtype %>* C) {
6
-
7
- // Get the index of the current element to be processed
8
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
9
-
10
- // Compute a single element (loop over K)
11
- <%= c_dtype %> acc = 0.0f;
12
- <%= c_dtype %> max = <%= min_value_for(dtype) %>;
13
-
14
- for (int k=0; k<N; k++) {
15
- max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
16
- }
17
-
18
- for (int k=0; k<N; k++) {
19
- acc += exp(A[globalRow*N + k] - max);
20
- }
21
-
22
- // Store the result
23
- for (int k=0; k < N; k++) {
24
- C[globalRow*N + k] = exp(A[globalRow*N + k] - max) / acc;
25
- }
26
- }
@@ -1,32 +0,0 @@
1
-
2
- // First naive implementation
3
- % c_dtype = dtype_to_c_type(dtype)
4
- __kernel void softmax_cross_<%= dtype %>(const int N,
5
- const __global <%= c_dtype %>* A,
6
- const __global <%= c_dtype %>* L,
7
- __global <%= c_dtype %>* C, __global <%= c_dtype %>* P) {
8
-
9
- // Get the index of the current element to be processed
10
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
11
-
12
- // Compute a single element (loop over K)
13
- <%= c_dtype %> acc = 0.0f;
14
- <%= c_dtype %> max = <%= min_value_for(dtype) %>;
15
-
16
- for (int k=0; k<N; k++) {
17
- max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
18
- }
19
-
20
- for (int k=0; k<N; k++) {
21
- acc += exp(A[globalRow*N + k] - max);
22
- }
23
-
24
- // Store the result
25
- for (int k=0; k < N; k++) {
26
- C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
27
- }
28
-
29
- for (int k=0; k < N; k++) {
30
- P[globalRow*N + k] = (exp(A[globalRow*N + k] - max) / acc) - L[globalRow*N + k];
31
- }
32
- }
@@ -1,28 +0,0 @@
1
- // First naive implementation
2
- % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void softmax_cross_grad_<%= dtype %>(const int N,
4
- const __global <%= c_dtype %>* A,
5
- const __global <%= c_dtype %>* L,
6
- const __global <%= c_dtype %>* G,
7
- __global <%= c_dtype %>* C) {
8
-
9
- // Get the index of the current element to be processed
10
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
11
-
12
- // Compute a single element (loop over K)
13
- <%= c_dtype %> acc = 0.0f;
14
- <%= c_dtype %> max = <%= min_value_for(dtype) %>;
15
-
16
- for (int k=0; k<N; k++) {
17
- max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
18
- }
19
-
20
- for (int k=0; k<N; k++) {
21
- acc += exp(A[globalRow*N + k] - max);
22
- }
23
-
24
- // Store the result
25
- for (int k=0; k < N; k++) {
26
- C[globalRow*N + k] = ((exp(A[globalRow*N + k] - max)/acc) * G[globalRow*N + k] - L[globalRow*N + k]);
27
- }
28
- }
@@ -1,46 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void softmax_grad_<%= dtype %>(const int N,
3
- const __global <%= c_dtype %>* A,
4
- const __global <%= c_dtype %>* G,
5
- __global <%= c_dtype %>* C) {
6
-
7
- // Get the index of the current element to be processed
8
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
9
-
10
- // Compute a single element (loop over K)
11
- float acc = 0.0f;
12
- float max = FLT_MIN;
13
- float row[<%= size %>];
14
- float grads[<%= size %>][<%= size %>];
15
-
16
- for (int k=0; k<N; k++) {
17
- max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
18
- }
19
-
20
- for (int k=0; k<N; k++) {
21
- acc += exp(A[globalRow*N + k] - max);
22
- }
23
-
24
- // Store the result
25
- for (int k=0; k < N; k++) {
26
- row[k] = exp(A[globalRow*N + k] - max) / acc;
27
- }
28
-
29
- for (int a=0; a < N; a++) {
30
- for(int b=0; b < N; b++) {
31
- if (a != b) {
32
- grads[a][b] = -row[a] * row[b];
33
- } else {
34
- grads[a][b] = row[a] * (1.0f - row[a]);
35
- }
36
- }
37
- }
38
-
39
- for (int k=0; k < N; k++) {
40
- float total_grad = 0.0f;
41
- for (int a = 0; a < N; a++) {
42
- total_grad += grads[a][k] * G[globalRow*N + a];
43
- }
44
- C[globalRow*N + k] = total_grad;
45
- }
46
- }
@@ -1,9 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
-
3
- __kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = sqrt(A[globalRow * N + globalCol]);
9
- }
@@ -1,9 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
-
3
- __kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
-
8
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
9
- }
@@ -1,53 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- // same dimension add floating point op
3
- __kernel void squared_difference_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
- // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
- <%= c_dtype %> x = A[globalRow * N + globalCol];
8
- <%= c_dtype %> y = B[globalRow * N + globalCol];
9
- C[globalRow * N + globalCol] = (x - y) * (x - y);
10
- }
11
-
12
- // 1D + Scalar floating point add op
13
- __kernel void squared_difference_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
14
- // Get the index of the current element to be processed
15
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
16
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
17
-
18
- <%= c_dtype %> x = A[globalRow * N + globalCol];
19
- <%= c_dtype %> y = B[0];
20
-
21
- if (switch_op == 0) {
22
- C[globalRow * N + globalCol] = (x - y) * (x - y);
23
- } else {
24
- C[globalRow * N + globalCol] = (y - x) * (y - x);
25
- }
26
- }
27
-
28
- // 1D + Scalar floating point add op broadcast
29
- __kernel void squared_difference_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
30
- // Get the index of the current element to be processed
31
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
32
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
33
-
34
- int b_m_index = globalRow;
35
- int b_n_index = globalCol;
36
-
37
- if ( b_m_index >= M2) {
38
- b_m_index = b_m_index % M2;
39
- };
40
-
41
- if (b_n_index >= N2) {
42
- b_n_index = b_n_index % N2;
43
- }
44
-
45
- <%= c_dtype %> x = A[globalRow * N + globalCol];
46
- <%= c_dtype %> y = B[b_m_index * N2 + b_n_index];
47
-
48
- if (switch_op == 0) {
49
- C[globalRow * N + globalCol] = (x - y) * (x - y);
50
- } else {
51
- C[globalRow * N + globalCol] = (y - x) * (y - x);
52
- }
53
- }