tensor_stream-opencl 0.1.0

Files changed (72)
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +5 -0
  5. data/CODE_OF_CONDUCT.md +74 -0
  6. data/Gemfile +6 -0
  7. data/Gemfile.lock +51 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +58 -0
  10. data/Rakefile +6 -0
  11. data/bin/console +14 -0
  12. data/bin/setup +8 -0
  13. data/lib/tensor_stream/opencl.rb +7 -0
  14. data/lib/tensor_stream/opencl/kernels/_bool_operand.cl +45 -0
  15. data/lib/tensor_stream/opencl/kernels/_operand.cl +45 -0
  16. data/lib/tensor_stream/opencl/kernels/abs.cl +20 -0
  17. data/lib/tensor_stream/opencl/kernels/acos.cl +8 -0
  18. data/lib/tensor_stream/opencl/kernels/add.cl +3 -0
  19. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +23 -0
  20. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +9 -0
  21. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +16 -0
  22. data/lib/tensor_stream/opencl/kernels/argmax.cl +8 -0
  23. data/lib/tensor_stream/opencl/kernels/argmin.cl +8 -0
  24. data/lib/tensor_stream/opencl/kernels/asin.cl +9 -0
  25. data/lib/tensor_stream/opencl/kernels/cast.cl +10 -0
  26. data/lib/tensor_stream/opencl/kernels/ceil.cl +8 -0
  27. data/lib/tensor_stream/opencl/kernels/cond.cl.erb +6 -0
  28. data/lib/tensor_stream/opencl/kernels/cos.cl +8 -0
  29. data/lib/tensor_stream/opencl/kernels/div.cl.erb +3 -0
  30. data/lib/tensor_stream/opencl/kernels/exp.cl +8 -0
  31. data/lib/tensor_stream/opencl/kernels/floor.cl +8 -0
  32. data/lib/tensor_stream/opencl/kernels/floor_div.cl +48 -0
  33. data/lib/tensor_stream/opencl/kernels/floor_mod.cl +3 -0
  34. data/lib/tensor_stream/opencl/kernels/gemm.cl +32 -0
  35. data/lib/tensor_stream/opencl/kernels/log.cl +8 -0
  36. data/lib/tensor_stream/opencl/kernels/log1p.cl +8 -0
  37. data/lib/tensor_stream/opencl/kernels/log_softmax.cl +26 -0
  38. data/lib/tensor_stream/opencl/kernels/max.cl +46 -0
  39. data/lib/tensor_stream/opencl/kernels/min.cl +46 -0
  40. data/lib/tensor_stream/opencl/kernels/mod.cl +3 -0
  41. data/lib/tensor_stream/opencl/kernels/mul.cl +3 -0
  42. data/lib/tensor_stream/opencl/kernels/negate.cl +8 -0
  43. data/lib/tensor_stream/opencl/kernels/pack.cl +24 -0
  44. data/lib/tensor_stream/opencl/kernels/pow.cl +46 -0
  45. data/lib/tensor_stream/opencl/kernels/real_div.cl +3 -0
  46. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +8 -0
  47. data/lib/tensor_stream/opencl/kernels/round.cl +8 -0
  48. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +9 -0
  49. data/lib/tensor_stream/opencl/kernels/sigmoid_grad.cl +55 -0
  50. data/lib/tensor_stream/opencl/kernels/sign.cl +21 -0
  51. data/lib/tensor_stream/opencl/kernels/sin.cl +9 -0
  52. data/lib/tensor_stream/opencl/kernels/softmax.cl +26 -0
  53. data/lib/tensor_stream/opencl/kernels/softmax_cross.cl +32 -0
  54. data/lib/tensor_stream/opencl/kernels/softmax_cross_grad.cl +28 -0
  55. data/lib/tensor_stream/opencl/kernels/softmax_grad.cl +46 -0
  56. data/lib/tensor_stream/opencl/kernels/sqrt.cl +9 -0
  57. data/lib/tensor_stream/opencl/kernels/square.cl +9 -0
  58. data/lib/tensor_stream/opencl/kernels/squared_difference.cl +53 -0
  59. data/lib/tensor_stream/opencl/kernels/sub.cl +3 -0
  60. data/lib/tensor_stream/opencl/kernels/tan.cl +8 -0
  61. data/lib/tensor_stream/opencl/kernels/tanh.cl +8 -0
  62. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +7 -0
  63. data/lib/tensor_stream/opencl/kernels/where.cl +8 -0
  64. data/lib/tensor_stream/opencl/math_ops.rb +133 -0
  65. data/lib/tensor_stream/opencl/nn_ops.rb +191 -0
  66. data/lib/tensor_stream/opencl/opencl_buffer.rb +35 -0
  67. data/lib/tensor_stream/opencl/opencl_device.rb +5 -0
  68. data/lib/tensor_stream/opencl/opencl_evaluator.rb +933 -0
  69. data/lib/tensor_stream/opencl/opencl_template_helper.rb +99 -0
  70. data/lib/tensor_stream/opencl/version.rb +5 -0
  71. data/tensor_stream-opencl.gemspec +40 -0
  72. metadata +185 -0
data/lib/tensor_stream/opencl/kernels/real_div.cl
@@ -0,0 +1,3 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % op = operator_to_c('div')
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'real_div', dtype: "#{a}_#{b}", result_t: c_dtype %>
data/lib/tensor_stream/opencl/kernels/reciprocal.cl
@@ -0,0 +1,8 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void reciprocal_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = 1 / A[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/opencl/kernels/round.cl
@@ -0,0 +1,8 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = round(A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/opencl/kernels/sigmoid.cl
@@ -0,0 +1,9 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = 1.0f/(1.0f + exp(-A[globalRow * N + globalCol]));
+ }
data/lib/tensor_stream/opencl/kernels/sigmoid_grad.cl
@@ -0,0 +1,55 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ float sigmoid(<%= c_dtype %> x) {
+ return 1.0f/(1.0f + exp(-x));
+ }
+
+ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
+ return g * sigmoid(x) * ( 1.0f - sigmoid(x));
+ }
+
+ // same dimension add floating point op
+ __kernel void sigmoid_grad_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[globalRow * N + globalCol]);
+ }
+
+ // 1D + Scalar floating point add op
+ __kernel void sigmoid_grad_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ if (switch_op == 0) {
+ C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[0]);
+ } else {
+ C[globalRow * N + globalCol] = sigmoid_grad(B[0], A[globalRow * N + globalCol]);
+ }
+ }
+
+ // 1D + Scalar floating point add op broadcast
+ __kernel void sigmoid_grad_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ int b_m_index = globalRow;
+ int b_n_index = globalCol;
+
+ if ( b_m_index >= M2) {
+ b_m_index = b_m_index % M2;
+ };
+
+ if (b_n_index >= N2) {
+ b_n_index = b_n_index % N2;
+ }
+
+ if (switch_op == 0) {
+ C[globalRow * N + globalCol] = sigmoid_grad(A[globalRow * N + globalCol], B[b_m_index * N2 + b_n_index]);
+ } else {
+ C[globalRow * N + globalCol] = sigmoid_grad(B[b_m_index * N2 + b_n_index], A[globalRow * N + globalCol]);
+ }
+ }
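Note: the sigmoid_grad helper above applies the incoming gradient g to the standard derivative of the logistic function; as a quick check against the kernel body (standard calculus, not part of the diff):

\sigma(x) = \frac{1}{1 + e^{-x}}, \qquad \sigma'(x) = \sigma(x)\,(1 - \sigma(x)), \qquad \text{sigmoid\_grad}(x, g) = g\,\sigma(x)\,(1 - \sigma(x))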
data/lib/tensor_stream/opencl/kernels/sign.cl
@@ -0,0 +1,21 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+ <%= c_dtype %> value = A[globalRow * N + globalCol];
+ % if floating_point?(dtype)
+ if (isnan(value) || value == 0.0f) {
+ C[globalRow * N + globalCol] = 0.0;
+ } else {
+ C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
+ }
+ % else
+ if (value == 0) {
+ C[globalRow * N + globalCol] = 0;
+ } else {
+ C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
+ }
+ % end
+ }
data/lib/tensor_stream/opencl/kernels/sin.cl
@@ -0,0 +1,9 @@
+
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = sin(A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/opencl/kernels/softmax.cl
@@ -0,0 +1,26 @@
+ // First naive implementation
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ <%= c_dtype %> acc = 0.0f;
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ C[globalRow*N + k] = exp(A[globalRow*N + k] - max) / acc;
+ }
+ }
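Note: softmax_<dtype> first scans each row for its maximum and subtracts it before exponentiating. This is the usual max-shift trick for numerical stability; it does not change the result, since for any constant m (standard identity, not part of the diff):

\operatorname{softmax}(a)_i = \frac{e^{a_i - m}}{\sum_k e^{a_k - m}} = \frac{e^{a_i}}{\sum_k e^{a_k}}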
data/lib/tensor_stream/opencl/kernels/softmax_cross.cl
@@ -0,0 +1,32 @@
+
+ // First naive implementation
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_cross_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ const __global <%= c_dtype %>* L,
+ __global <%= c_dtype %>* C, __global <%= c_dtype %>* P) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ <%= c_dtype %> acc = 0.0f;
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
+ }
+
+ for (int k=0; k < N; k++) {
+ P[globalRow*N + k] = (exp(A[globalRow*N + k] - max) / acc) - L[globalRow*N + k];
+ }
+ }
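Note: with m denoting the row maximum subtracted for stability, the value written to C is L_i \cdot (\log \sum_k e^{a_k - m} - (a_i - m)) = -L_i \log \operatorname{softmax}(a)_i, i.e. the per-element cross-entropy term, while P receives \operatorname{softmax}(a) - L. For labels that sum to 1 per row, the latter is the standard gradient of the loss with respect to the logits (standard result, not part of the diff):

\frac{\partial}{\partial a_i}\left(-\sum_k L_k \log \operatorname{softmax}(a)_k\right) = \operatorname{softmax}(a)_i - L_i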
data/lib/tensor_stream/opencl/kernels/softmax_cross_grad.cl
@@ -0,0 +1,28 @@
+ // First naive implementation
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_cross_grad_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ const __global <%= c_dtype %>* L,
+ const __global <%= c_dtype %>* G,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ <%= c_dtype %> acc = 0.0f;
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ C[globalRow*N + k] = ((exp(A[globalRow*N + k] - max)/acc) * G[globalRow*N + k] - L[globalRow*N + k]);
+ }
+ }
data/lib/tensor_stream/opencl/kernels/softmax_grad.cl
@@ -0,0 +1,46 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void softmax_grad_<%= dtype %>(const int N,
+ const __global <%= c_dtype %>* A,
+ const __global <%= c_dtype %>* G,
+ __global <%= c_dtype %>* C) {
+
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+ // Compute a single element (loop over K)
+ float acc = 0.0f;
+ float max = FLT_MIN;
+ float row[<%= size %>];
+ float grads[<%= size %>][<%= size %>];
+
+ for (int k=0; k<N; k++) {
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+ }
+
+ for (int k=0; k<N; k++) {
+ acc += exp(A[globalRow*N + k] - max);
+ }
+
+ // Store the result
+ for (int k=0; k < N; k++) {
+ row[k] = exp(A[globalRow*N + k] - max) / acc;
+ }
+
+ for (int a=0; a < N; a++) {
+ for(int b=0; b < N; b++) {
+ if (a != b) {
+ grads[a][b] = -row[a] * row[b];
+ } else {
+ grads[a][b] = row[a] * (1.0f - row[a]);
+ }
+ }
+ }
+
+ for (int k=0; k < N; k++) {
+ float total_grad = 0.0f;
+ for (int a = 0; a < N; a++) {
+ total_grad += grads[a][k] * G[globalRow*N + a];
+ }
+ C[globalRow*N + k] = total_grad;
+ }
+ }
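Note: softmax_grad materializes the full N x N softmax Jacobian per row in the private arrays row[] and grads[][] and then contracts it with the incoming gradient G. The entries follow the standard Jacobian of softmax (s denotes the softmax of the row; standard result, not part of the diff):

\frac{\partial s_a}{\partial x_b} = s_a(\delta_{ab} - s_b) = \begin{cases} s_a(1 - s_a) & a = b \\ -s_a s_b & a \neq b \end{cases}, \qquad C_k = \sum_a \frac{\partial s_a}{\partial x_k}\, G_a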
data/lib/tensor_stream/opencl/kernels/sqrt.cl
@@ -0,0 +1,9 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = sqrt(A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/opencl/kernels/square.cl
@@ -0,0 +1,9 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/opencl/kernels/squared_difference.cl
@@ -0,0 +1,53 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ // same dimension add floating point op
+ __kernel void squared_difference_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+ <%= c_dtype %> x = A[globalRow * N + globalCol];
+ <%= c_dtype %> y = B[globalRow * N + globalCol];
+ C[globalRow * N + globalCol] = (x - y) * (x - y);
+ }
+
+ // 1D + Scalar floating point add op
+ __kernel void squared_difference_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ <%= c_dtype %> x = A[globalRow * N + globalCol];
+ <%= c_dtype %> y = B[0];
+
+ if (switch_op == 0) {
+ C[globalRow * N + globalCol] = (x - y) * (x - y);
+ } else {
+ C[globalRow * N + globalCol] = (y - x) * (y - x);
+ }
+ }
+
+ // 1D + Scalar floating point add op broadcast
+ __kernel void squared_difference_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ int b_m_index = globalRow;
+ int b_n_index = globalCol;
+
+ if ( b_m_index >= M2) {
+ b_m_index = b_m_index % M2;
+ };
+
+ if (b_n_index >= N2) {
+ b_n_index = b_n_index % N2;
+ }
+
+ <%= c_dtype %> x = A[globalRow * N + globalCol];
+ <%= c_dtype %> y = B[b_m_index * N2 + b_n_index];
+
+ if (switch_op == 0) {
+ C[globalRow * N + globalCol] = (x - y) * (x - y);
+ } else {
+ C[globalRow * N + globalCol] = (y - x) * (y - x);
+ }
+ }
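Note: the *_b broadcast variants (here and in sigmoid_grad.cl) reuse the smaller operand B of shape M2 x N2 by wrapping out-of-range indices with a modulo. A minimal Ruby sketch of that index mapping, purely for illustration (the helper name broadcast_index is hypothetical and not part of the gem):

# Hypothetical CPU-side illustration of the modulo broadcast used by the *_b kernels.
def broadcast_index(row, col, m2, n2)
  [row >= m2 ? row % m2 : row, col >= n2 ? col % n2 : col]
end

# With A of shape [2, 3] and B of shape [1, 3], every row of A reads B's single row:
# broadcast_index(0, 2, 1, 3) #=> [0, 2]
# broadcast_index(1, 2, 1, 3) #=> [0, 2]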
data/lib/tensor_stream/opencl/kernels/sub.cl
@@ -0,0 +1,3 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % op = operator_to_c('sub')
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: "#{a}_#{b}", result_t: c_dtype %>
data/lib/tensor_stream/opencl/kernels/tan.cl
@@ -0,0 +1,8 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = tan(A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/opencl/kernels/tanh.cl
@@ -0,0 +1,8 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = tanh(A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/opencl/kernels/tanh_grad.cl
@@ -0,0 +1,7 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+ C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
+ }
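Note: tanh_grad emits the local derivative of tanh only; multiplying by the upstream gradient is left to the caller. The identity relied on (standard calculus, not part of the diff):

\frac{d}{dx}\tanh(x) = 1 - \tanh^2(x)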
data/lib/tensor_stream/opencl/kernels/where.cl
@@ -0,0 +1,8 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void where_<%= dtype %>(const int M, const int N, __global const short *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+ C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/opencl/math_ops.rb
@@ -0,0 +1,133 @@
+ module TensorStream
+ module OpenCLHelpers
+ # Collection of math functions for interfacing with OpenCL kernels
+ module MathOps
+ def MathOps.included(klass)
+ klass.class_eval do
+ %i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
+ register_op op, noop: true do |context, tensor, inputs|
+ execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
+ end
+ end
+
+ register_op :add_n do |_context, tensor, inputs|
+ if inputs.size == 1
+ inputs[0]
+ else
+ m, n = inputs[0].shape
+ work_group = [m || 1, n || 1]
+ cl_m = OpenCL::Int1.new(m || 1)
+ cl_n = OpenCL::Int1.new(n || 1)
+ cl_switch = OpenCL::Int1.new(0)
+ dtype = tensor.data_type
+
+ output_buffer = _create_result_buffer(tensor.data_type, inputs[0].shape, "out_#{tensor.name}")
+ inputs_queue = inputs.dup
+ a = inputs_queue.pop
+ until inputs_queue.empty?
+ b = inputs_queue.pop
+ event_wait_list = build_event_wait_list([a, b])
+ method_call = :"add_#{a.data_type}_#{b.data_type}"
+ event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ a = output_buffer
+ a.op = event
+ end
+
+ output_buffer.op = a.op
+ output_buffer
+ end
+ end
+
+ register_op :floor_div, noop: true do |context, tensor, inputs|
+ if fp_type?(tensor.data_type)
+ execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
+ else
+ execute_2_operand_func('div', tensor, inputs[0], inputs[1], context)
+ end
+ end
+
+ register_op :mat_mul do |_context, tensor, inputs|
+ a, b = inputs
+
+ m = a.shape[0]
+ n = b.shape[1]
+ v = b.shape[0]
+ k = a.shape[1]
+
+ if tensor.options[:transpose_a]
+ m = a.shape[1]
+ k = a.shape[0]
+ end
+
+ if tensor.options[:transpose_b]
+ n = b.shape[0]
+ v = b.shape[1]
+ end
+
+ result_shape = [m, n]
+
+ raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
+ raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
+ raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
+
+ dtype = tensor.data_type
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
+ output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
+
+ cl_m = OpenCL::Int1.new(m)
+ cl_n = OpenCL::Int1.new(n)
+ cl_k = OpenCL::Int1.new(k)
+
+ transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
+ transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
+ event_wait_list = build_event_wait_list(inputs)
+ output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer
+ end
+
+ %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil log].each do |op|
+ register_op op, noop: true do |context, tensor, inputs|
+ execute_func(op.to_s, tensor, inputs[0], context)
+ end
+ end
+
+ %i[sum mean].each do |op|
+ register_op op, noop: true do |context, tensor, inputs|
+ reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
+ end
+ end
+
+ register_op :prod, noop: true do |context, tensor, inputs|
+ input_a = complete_eval(inputs[0], context)
+
+ if input_a.buffer.empty?
+ convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
+ else
+ reduction(context, tensor, inputs[0], inputs[1], :prod)
+ end
+ end
+
+ register_op :argmin, buffer: true do |_context, tensor, inputs|
+ axis = tensor.options[:axis] || 0
+ rank = inputs[0].shape.size
+ raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+ arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+ op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
+ convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+ end
+
+ register_op :argmax, buffer: true do |_context, tensor, inputs|
+ axis = tensor.options[:axis] || 0
+ rank = inputs[0].shape.size
+ raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+ arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+ op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
+ convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+ end
+ end
+ end
+ end
+ end
+ end
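Note: math_ops.rb wires these kernels into the OpenCL evaluator via register_op, so they are reached through the normal TensorStream graph API rather than invoked directly. A minimal sketch of how that path might be exercised, assuming the OpenCL evaluator is picked up automatically once tensor_stream/opencl is required (the session setup and evaluator selection shown here are assumptions, not taken from this changeset):

require 'tensor_stream'
require 'tensor_stream/opencl'

ts = TensorStream

a = ts.constant([[1.0, 2.0], [3.0, 4.0]])
b = ts.constant([[5.0, 6.0], [7.0, 8.0]])

c = ts.matmul(a, b) # dispatched to the gemm kernel through register_op :mat_mul
d = ts.add(a, b)    # element-wise add kernel via execute_2_operand_func

sess = ts.session   # assumed to default to the OpenCL evaluator when the gem is loaded
puts sess.run(c).inspect
puts sess.run(d).inspect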