tensor_stream 0.8.1 → 0.8.5

Files changed (84)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGELOG.md +8 -0
  4. data/README.md +12 -6
  5. data/lib/tensor_stream.rb +1 -0
  6. data/lib/tensor_stream/evaluator/base_evaluator.rb +1 -1
  7. data/lib/tensor_stream/evaluator/ruby/array_ops.rb +282 -0
  8. data/lib/tensor_stream/evaluator/ruby/images_ops.rb +61 -0
  9. data/lib/tensor_stream/evaluator/ruby/math_ops.rb +111 -0
  10. data/lib/tensor_stream/evaluator/ruby/nn_ops.rb +48 -9
  11. data/lib/tensor_stream/evaluator/ruby/random_ops.rb +51 -0
  12. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -433
  13. data/lib/tensor_stream/images.rb +16 -0
  14. data/lib/tensor_stream/ops.rb +5 -1
  15. data/lib/tensor_stream/session.rb +15 -15
  16. data/lib/tensor_stream/tensor.rb +1 -1
  17. data/lib/tensor_stream/train/adadelta_optimizer.rb +52 -0
  18. data/lib/tensor_stream/train/adam_optimizer.rb +17 -2
  19. data/lib/tensor_stream/train/gradient_descent_optimizer.rb +7 -1
  20. data/lib/tensor_stream/trainer.rb +1 -0
  21. data/lib/tensor_stream/types.rb +4 -0
  22. data/lib/tensor_stream/utils.rb +4 -0
  23. data/lib/tensor_stream/variable_scope.rb +1 -0
  24. data/lib/tensor_stream/version.rb +1 -1
  25. data/samples/linear_regression.rb +4 -1
  26. data/samples/mnist_data.rb +64 -0
  27. data/samples/nearest_neighbor.rb +1 -2
  28. data/samples/raw_neural_net_sample.rb +1 -1
  29. data/tensor_stream.gemspec +1 -0
  30. metadata +23 -57
  31. data/lib/tensor_stream/evaluator/opencl/kernels/_bool_operand.cl +0 -45
  32. data/lib/tensor_stream/evaluator/opencl/kernels/_operand.cl +0 -45
  33. data/lib/tensor_stream/evaluator/opencl/kernels/abs.cl +0 -20
  34. data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +0 -8
  35. data/lib/tensor_stream/evaluator/opencl/kernels/add.cl +0 -3
  36. data/lib/tensor_stream/evaluator/opencl/kernels/apply_adam.cl +0 -23
  37. data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +0 -9
  38. data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +0 -16
  39. data/lib/tensor_stream/evaluator/opencl/kernels/argmax.cl +0 -8
  40. data/lib/tensor_stream/evaluator/opencl/kernels/argmin.cl +0 -8
  41. data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +0 -9
  42. data/lib/tensor_stream/evaluator/opencl/kernels/cast.cl +0 -10
  43. data/lib/tensor_stream/evaluator/opencl/kernels/ceil.cl +0 -8
  44. data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +0 -6
  45. data/lib/tensor_stream/evaluator/opencl/kernels/cos.cl +0 -8
  46. data/lib/tensor_stream/evaluator/opencl/kernels/div.cl.erb +0 -3
  47. data/lib/tensor_stream/evaluator/opencl/kernels/exp.cl +0 -8
  48. data/lib/tensor_stream/evaluator/opencl/kernels/floor.cl +0 -8
  49. data/lib/tensor_stream/evaluator/opencl/kernels/floor_div.cl +0 -48
  50. data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +0 -3
  51. data/lib/tensor_stream/evaluator/opencl/kernels/gemm.cl +0 -32
  52. data/lib/tensor_stream/evaluator/opencl/kernels/log.cl +0 -8
  53. data/lib/tensor_stream/evaluator/opencl/kernels/log1p.cl +0 -8
  54. data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +0 -26
  55. data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +0 -46
  56. data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +0 -46
  57. data/lib/tensor_stream/evaluator/opencl/kernels/mod.cl +0 -3
  58. data/lib/tensor_stream/evaluator/opencl/kernels/mul.cl +0 -3
  59. data/lib/tensor_stream/evaluator/opencl/kernels/negate.cl +0 -8
  60. data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +0 -24
  61. data/lib/tensor_stream/evaluator/opencl/kernels/pow.cl +0 -46
  62. data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +0 -3
  63. data/lib/tensor_stream/evaluator/opencl/kernels/reciprocal.cl +0 -8
  64. data/lib/tensor_stream/evaluator/opencl/kernels/round.cl +0 -8
  65. data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid.cl +0 -9
  66. data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid_grad.cl +0 -55
  67. data/lib/tensor_stream/evaluator/opencl/kernels/sign.cl +0 -21
  68. data/lib/tensor_stream/evaluator/opencl/kernels/sin.cl +0 -9
  69. data/lib/tensor_stream/evaluator/opencl/kernels/softmax.cl +0 -26
  70. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +0 -32
  71. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +0 -28
  72. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_grad.cl +0 -46
  73. data/lib/tensor_stream/evaluator/opencl/kernels/sqrt.cl +0 -9
  74. data/lib/tensor_stream/evaluator/opencl/kernels/square.cl +0 -9
  75. data/lib/tensor_stream/evaluator/opencl/kernels/squared_difference.cl +0 -53
  76. data/lib/tensor_stream/evaluator/opencl/kernels/sub.cl +0 -3
  77. data/lib/tensor_stream/evaluator/opencl/kernels/tan.cl +0 -8
  78. data/lib/tensor_stream/evaluator/opencl/kernels/tanh.cl +0 -8
  79. data/lib/tensor_stream/evaluator/opencl/kernels/tanh_grad.cl +0 -7
  80. data/lib/tensor_stream/evaluator/opencl/kernels/where.cl +0 -8
  81. data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +0 -35
  82. data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +0 -5
  83. data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +0 -1230
  84. data/lib/tensor_stream/evaluator/opencl/opencl_template_helper.rb +0 -95

data/lib/tensor_stream/evaluator/opencl/kernels/_bool_operand.cl
@@ -1,45 +0,0 @@
- // same dimension add floating point op
- __kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol] ? 1 : 0;
- }
-
- // 1D + Scalar floating point add op
- __kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     if (switch_op == 0) {
-         C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0] ? 1 : 0;
-     } else {
-         C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
-     }
- }
-
- // 1D + Scalar floating point add op broadcast
- __kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     int b_m_index = globalRow;
-     int b_n_index = globalCol;
-
-     if ( b_m_index >= M2) {
-         b_m_index = b_m_index % M2;
-     };
-
-     if (b_n_index >= N2) {
-         b_n_index = b_n_index % N2;
-     }
-
-     if (switch_op == 0) {
-         C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index] ? 1 : 0;
-     } else {
-         C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
-     }
- }

data/lib/tensor_stream/evaluator/opencl/kernels/_operand.cl
@@ -1,45 +0,0 @@
- // same dimension add floating point op
- __kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol];
- }
-
- // 1D + Scalar floating point add op
- __kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     if (switch_op == 0) {
-         C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0];
-     } else {
-         C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol];
-     }
- }
-
- // 1D + Scalar floating point add op broadcast
- __kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     int b_m_index = globalRow;
-     int b_n_index = globalCol;
-
-     if ( b_m_index >= M2) {
-         b_m_index = b_m_index % M2;
-     };
-
-     if (b_n_index >= N2) {
-         b_n_index = b_n_index % N2;
-     }
-
-     if (switch_op == 0) {
-         C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index];
-     } else {
-         C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol];
-     }
- }

data/lib/tensor_stream/evaluator/opencl/kernels/abs.cl
@@ -1,20 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- % if TensorStream::Ops::FLOATING_POINT_TYPES.include?(dtype)
- __kernel void abs_<%= dtype%>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
- }
- % else
- % %w[int int32].each do |dt|
- __kernel void abs_<%= dt %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
- }
- % end
- %end

data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl
@@ -1,8 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void acos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = acos(A[globalRow * N + globalCol]);
- }

data/lib/tensor_stream/evaluator/opencl/kernels/add.cl
@@ -1,3 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- % op = operator_to_c('add')
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'add', dtype: "#{a}_#{b}", result_t: c_dtype %>
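
`add.cl` itself is just glue: it resolves the operator and delegates to the shared `_operand.cl` partial above. As a minimal sketch of what the evaluator would compile, assuming both operand types are `float` (and assuming `dtype_to_c_type` maps `:float` to `float` and `operator_to_c('add')` to `+`), the first rendered kernel would be roughly:

    // illustrative expansion of _operand.cl for 'add' with float inputs (not part of the diff)
    __kernel void add_float_float(const int M, const int N, const int switch_op,
                                  __global const float *A, __global float *B, __global float *C) {
        const int globalRow = get_global_id(0); // Row ID of C (0..M)
        const int globalCol = get_global_id(1); // Col ID of C (0..N)

        // element-wise add of two same-shape 2D buffers
        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
    }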

data/lib/tensor_stream/evaluator/opencl/kernels/apply_adam.cl
@@ -1,23 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- // same dimension add floating point op
- __kernel void apply_adam_<%= dtype %>(const int M, const int N,
-                                       __global const <%= c_dtype %> *grad,
-                                       __global const <%= c_dtype %> *learning_rate,
-                                       __global const <%= c_dtype %> *beta1_power,
-                                       __global const <%= c_dtype %> *beta2_power,
-                                       __global const <%= c_dtype %> *beta1,
-                                       __global const <%= c_dtype %> *beta2,
-                                       __global const <%= c_dtype %> *epsilon,
-                                       __global <%= c_dtype %> *momentum,
-                                       __global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-     const int index = globalRow * N + globalCol;
-
-     <%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
-
-     momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);
-     v[index] += (grad[index] * grad[index] - v[index]) * (1.0 - beta2[0]);
-     output[index] -= (momentum[index] * alpha) / ( sqrt(v[index]) + epsilon[0] );
- }
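
Read as equations (with g the incoming gradient, m the `momentum` buffer, v the second-moment buffer, and beta1_power/beta2_power holding \beta_1^t and \beta_2^t), the kernel performs the standard Adam step with the bias correction folded into the step size:

    \alpha = \mathrm{lr} \cdot \sqrt{1 - \beta_2^t} \,/\, (1 - \beta_1^t)
    m \leftarrow \beta_1 m + (1 - \beta_1)\, g
    v \leftarrow \beta_2 v + (1 - \beta_2)\, g^2
    \theta \leftarrow \theta - \alpha\, m / (\sqrt{v} + \epsilon)

The in-place forms `m += (g - m)(1 - \beta_1)` and `v += (g^2 - v)(1 - \beta_2)` in the kernel are algebraically the same exponential moving averages.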

data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl
@@ -1,9 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- // same dimension add floating point op
- __kernel void apply_gradient_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] -= A[globalRow * N + globalCol] * B[0];
- }

data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl
@@ -1,16 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- // same dimension add floating point op
- __kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
-                                           __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-     const int index = globalRow * N + globalCol;
-     <%= c_dtype %> acc_m = acc[index];
-     acc[index] = acc_m * momentum[0] + grad[index];
-     <% if nesterov %>
-     output[index] -= grad[index] * learning_rate[0] + acc_m * momentum[0] * learning_rate[0];
-     <% else %>
-     output[index] -= acc_m * learning_rate[0];
-     <% end %>
- }
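
In equation form (with \mu = momentum[0] and acc_m the accumulator value before the update), the kernel applies:

    \mathrm{acc} \leftarrow \mu \, \mathrm{acc}_m + g
    \theta \leftarrow \theta - \mathrm{lr} \cdot \mathrm{acc}_m                           \quad (\text{plain momentum})
    \theta \leftarrow \theta - \mathrm{lr} \cdot g - \mathrm{lr} \cdot \mu \cdot \mathrm{acc}_m   \quad (\text{Nesterov})

Note that, as written, both update branches read the pre-update value acc_m rather than the refreshed accumulator.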

data/lib/tensor_stream/evaluator/opencl/kernels/argmax.cl
@@ -1,8 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void argmax_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
- }

data/lib/tensor_stream/evaluator/opencl/kernels/argmin.cl
@@ -1,8 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void argmin_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
- }

data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl
@@ -1,9 +0,0 @@
-
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void asin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = asin(A[globalRow * N + globalCol]);
- }

data/lib/tensor_stream/evaluator/opencl/kernels/cast.cl
@@ -1,10 +0,0 @@
- % source_ctype = dtype_to_c_type(source_dt)
- % target_ctype = dtype_to_c_type(target_dt)
-
- __kernel void cast(const int M, const int N, __global const <%= source_ctype %> *A, __global <%= target_ctype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
- }

data/lib/tensor_stream/evaluator/opencl/kernels/ceil.cl
@@ -1,8 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void ceil_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = ceil(A[globalRow * N + globalCol]);
- }

data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb
@@ -1,6 +0,0 @@
- % ["#{dtype}"].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
- %   a_dtype = dtype_to_c_type(a)
- %   b_dtype = dtype_to_c_type(b)
- %   op = operator_to_c(fname)
- <%= render 'bool_operand.cl', a_dtype: a_dtype, b_dtype: b_dtype, op: op, fname: fname, dtype: "#{a}_#{b}", result_t: 'short' %>
- % end
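
`cond.cl.erb` fans the `_bool_operand.cl` partial out into one kernel per comparison operator, with `short` as the boolean result type. Assuming `float` operands and that `operator_to_c('less')` yields `<` (an assumption of this sketch), the first rendered kernel would look like:

    // illustrative expansion of _bool_operand.cl for 'less' with float inputs (not part of the diff)
    __kernel void less_float_float(const int M, const int N, const int switch_op,
                                   __global const float *A, __global float *B, __global short *C) {
        const int globalRow = get_global_id(0); // Row ID of C (0..M)
        const int globalCol = get_global_id(1); // Col ID of C (0..N)

        // element-wise comparison, encoded as 1/0 in a short buffer
        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] < B[globalRow * N + globalCol] ? 1 : 0;
    }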

data/lib/tensor_stream/evaluator/opencl/kernels/cos.cl
@@ -1,8 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void cos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = cos(A[globalRow * N + globalCol]);
- }

data/lib/tensor_stream/evaluator/opencl/kernels/div.cl.erb
@@ -1,3 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- % op = operator_to_c('div')
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'div', dtype: "#{a}_#{b}", result_t: c_dtype %>

data/lib/tensor_stream/evaluator/opencl/kernels/exp.cl
@@ -1,8 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void exp_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = exp(A[globalRow * N + globalCol]);
- }

data/lib/tensor_stream/evaluator/opencl/kernels/floor.cl
@@ -1,8 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void floor_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = floor(A[globalRow * N + globalCol]);
- }

data/lib/tensor_stream/evaluator/opencl/kernels/floor_div.cl
@@ -1,48 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- % fname = 'floor_div'
- % result_t = c_dtype
- // same dimension add floating point op
- __kernel void <%= fname%>_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[globalRow * N + globalCol]);
- }
-
- // 1D + Scalar floating point add op
- __kernel void <%=fname%>_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     if (switch_op == 0) {
-         C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[0]);
-     } else {
-         C[globalRow * N + globalCol] = (int)(B[0] / A[globalRow * N + globalCol]);
-     }
- }
-
- // 1D + Scalar floating point add op broadcast
- __kernel void <%= fname%>_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     int b_m_index = globalRow;
-     int b_n_index = globalCol;
-
-     if ( b_m_index >= M2) {
-         b_m_index = b_m_index % M2;
-     };
-
-     if (b_n_index >= N2) {
-         b_n_index = b_n_index % N2;
-     }
-
-     if (switch_op == 0) {
-         C[globalRow * N + globalCol] = (int)(A[globalRow * N + globalCol] / B[b_m_index * N2 + b_n_index]);
-     } else {
-         C[globalRow * N + globalCol] = (int)(B[b_m_index * N2 + b_n_index] / A[globalRow * N + globalCol]);
-     }
- }

data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl
@@ -1,3 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- % op = operator_to_c('mod')
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'floor_mod', dtype: "#{a}_#{b}", result_t: c_dtype %>

data/lib/tensor_stream/evaluator/opencl/kernels/gemm.cl
@@ -1,32 +0,0 @@
- // First naive implementation
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
-                                 const int A_transpose,
-                                 const int B_transpose,
-                                 const __global <%= c_dtype %>* A,
-                                 const __global <%= c_dtype %>* B,
-                                 __global <%= c_dtype %>* C) {
-
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     // Compute a single element (loop over K)
-     <%= c_dtype %> acc = 0.0f;
-     for (int k=0; k<K; k++) {
-         int a_index = globalRow*K + k;
-         int b_index = k*N + globalCol;
-
-         if (A_transpose) {
-             a_index = M*k + globalRow;
-         }
-
-         if (B_transpose) {
-             b_index = globalCol*K + k;
-         }
-         acc += A[a_index] * B[b_index];
-     }
-
-     // Store the result
-     C[globalRow*N + globalCol] = acc;
- }
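
As the comment says, this is the textbook one-work-item-per-output GEMM: each (globalRow, globalCol) pair independently computes

    C_{ij} = \sum_{k=0}^{K-1} A_{ik} \, B_{kj}

with the A_transpose / B_transpose flags only swapping the index arithmetic (e.g. a_index = M*k + globalRow walks A column-wise), so the same loop reads A^T or B^T in place without materializing a transposed buffer.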

data/lib/tensor_stream/evaluator/opencl/kernels/log.cl
@@ -1,8 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void log_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = log(A[globalRow * N + globalCol]);
- }

data/lib/tensor_stream/evaluator/opencl/kernels/log1p.cl
@@ -1,8 +0,0 @@
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void log1p_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = log1p(A[globalRow * N + globalCol]);
- }

data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl
@@ -1,26 +0,0 @@
- // First naive implementation
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void log_softmax_<%= dtype %>(const int N,
-                                        const __global <%= c_dtype %>* A,
-                                        __global <%= c_dtype %>* C) {
-
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-
-     // Compute a single element (loop over K)
-     <%= c_dtype %> acc = 0.0f;
-     <%= c_dtype %> max = <%= min_value_for(dtype) %>;
-
-     for (int k=0; k<N; k++) {
-         max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
-     }
-
-     for (int k=0; k<N; k++) {
-         acc += exp(A[globalRow*N + k] - max);
-     }
-
-     // Store the result
-     for (int k=0; k < N; k++) {
-         C[globalRow*N + k] = (A[globalRow*N + k] - max) - log(acc);
-     }
- }
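
The kernel uses the standard max-shift trick for numerical stability: each row first finds m = \max_k x_k, then computes

    \mathrm{log\_softmax}(x)_i = (x_i - m) - \log \sum_k e^{x_k - m}

Subtracting m keeps every exp argument non-positive, so the sum cannot overflow, and the shift cancels out mathematically.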

data/lib/tensor_stream/evaluator/opencl/kernels/max.cl
@@ -1,46 +0,0 @@
- // same dimension add floating point op
- % c_dtype = dtype_to_c_type(dtype)
- __kernel void max_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
- }
-
- // 1D + Scalar floating point add op
- __kernel void max_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     if (switch_op == 0) {
-         C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
-     } else {
-         C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
-     }
- }
-
- // 1D + Scalar floating point add op broadcast
- __kernel void max_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-     // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-     int b_m_index = globalRow;
-     int b_n_index = globalCol;
-
-     if ( b_m_index >= M2) {
-         b_m_index = b_m_index % M2;
-     };
-
-     if (b_n_index >= N2) {
-         b_n_index = b_n_index % N2;
-     }
-
-     if (switch_op == 0) {
-         C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
-     } else {
-         C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
-     }
- }