RubyGems - tensor_stream - Versions diffs - 0.3.0 → 0.4.0 - Mend

tensor_stream 0.3.0 → 0.4.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (59)

checksums.yaml +4 -4
data/.circleci/config.yml +7 -7
data/CHANGELOG.md +13 -0
data/Dockerfile +25 -0
data/Rakefile +6 -0
data/benchmark/benchmark.rb +16 -57
data/benchmark_intel.txt +21 -0
data/benchmark_nvidia.txt +33 -0
data/lib/tensor_stream.rb +4 -173
data/lib/tensor_stream/debugging/debugging.rb +20 -0
data/lib/tensor_stream/evaluator/kernels/abs.cl +9 -5
data/lib/tensor_stream/evaluator/kernels/add.cl +2 -4
data/lib/tensor_stream/evaluator/kernels/argmax.cl +2 -9
data/lib/tensor_stream/evaluator/kernels/argmin.cl +2 -9
data/lib/tensor_stream/evaluator/kernels/cast.cl +3 -8
data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +1 -1
data/lib/tensor_stream/evaluator/kernels/cos.cl +2 -1
data/lib/tensor_stream/evaluator/kernels/div.cl.erb +2 -4
data/lib/tensor_stream/evaluator/kernels/exp.cl +2 -1
data/lib/tensor_stream/evaluator/kernels/gemm.cl +8 -39
data/lib/tensor_stream/evaluator/kernels/log.cl +2 -1
data/lib/tensor_stream/evaluator/kernels/log1p.cl +2 -1
data/lib/tensor_stream/evaluator/kernels/max.cl +4 -49
data/lib/tensor_stream/evaluator/kernels/mul.cl +2 -4
data/lib/tensor_stream/evaluator/kernels/negate.cl +2 -9
data/lib/tensor_stream/evaluator/kernels/pow.cl +4 -88
data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +2 -9
data/lib/tensor_stream/evaluator/kernels/round.cl +2 -1
data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +2 -1
data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +6 -5
data/lib/tensor_stream/evaluator/kernels/sign.cl +12 -14
data/lib/tensor_stream/evaluator/kernels/sin.cl +2 -1
data/lib/tensor_stream/evaluator/kernels/softmax.cl +26 -0
data/lib/tensor_stream/evaluator/kernels/softmax_grad.cl +46 -0
data/lib/tensor_stream/evaluator/kernels/sqrt.cl +2 -1
data/lib/tensor_stream/evaluator/kernels/square.cl +2 -8
data/lib/tensor_stream/evaluator/kernels/sub.cl +2 -4
data/lib/tensor_stream/evaluator/kernels/tan.cl +2 -1
data/lib/tensor_stream/evaluator/kernels/tanh.cl +2 -1
data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +2 -1
data/lib/tensor_stream/evaluator/kernels/where.cl +2 -9
data/lib/tensor_stream/evaluator/opencl_evaluator.rb +108 -58
data/lib/tensor_stream/evaluator/opencl_template_helper.rb +40 -5
data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +35 -0
data/lib/tensor_stream/evaluator/ruby_evaluator.rb +30 -9
data/lib/tensor_stream/graph_serializers/graphml.rb +1 -1
data/lib/tensor_stream/graph_serializers/pbtext.rb +4 -0
data/lib/tensor_stream/math_gradients.rb +6 -5
data/lib/tensor_stream/nn/nn_ops.rb +18 -2
data/lib/tensor_stream/ops.rb +237 -44
data/lib/tensor_stream/tensor.rb +16 -2
data/lib/tensor_stream/utils.rb +205 -0
data/lib/tensor_stream/variable.rb +2 -1
data/lib/tensor_stream/version.rb +1 -1
data/samples/error.graphml +2755 -0
data/{test_samples → samples}/iris.rb +18 -24
data/samples/logistic_regression.rb +0 -1
data/test_samples/raw_neural_net_sample.rb +80 -23
metadata +11 -3

data/lib/tensor_stream/debugging/debugging.rb ADDED Viewed

@@ -0,0 +1,20 @@
+module TensorStream
+  module Debugging
+    extend TensorStream::OpHelper
+    def add_check_numerics_ops
+      graph = TensorStream.get_default_graph
+      nodes_to_process  = graph.nodes.values.select { |node| node.is_a?(Operation) }
+      nodes_to_process.each do |node|
+        node.items = node.items.compact.collect do |item|
+          if TensorStream::Ops::FLOATING_POINT_TYPES.include?(item.data_type)
+            TensorStream.check_numerics(item, "#{node.name}/#{item.name}", name: "check/#{node.name}/#{item.name}" )
+          else
+            item
+          end
+        end
+      end
+    end
+  end
+end

data/lib/tensor_stream/evaluator/kernels/abs.cl CHANGED Viewed

@@ -1,16 +1,20 @@
-__kernel void abs_fp(const int M, const int N, __global const float *A, __global float *C) {
+% c_dtype = dtype_to_c_type(dtype)
+% if TensorStream::Ops::FLOATING_POINT_TYPES.include?(dtype)
+__kernel void abs_<%= dtype%>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
     C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
 }
-__kernel void abs_int(const int M, const int N, __global const int *A, __global int *C) {
+% else
+% %w[int int32].each do |dt|
+__kernel void abs_<%= dt %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
     C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
-}
+}
+% end
+%end

data/lib/tensor_stream/evaluator/kernels/add.cl CHANGED Viewed

@@ -1,5 +1,3 @@
-% %w[fp int].product(%w[add]).each do |dtype, fname|
 % c_dtype = dtype_to_c_type(dtype)
-% op = operator_to_c(fname)
-<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: c_dtype %>
-% end
+% op = operator_to_c('add')
+<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'add', dtype: dtype, result_t: c_dtype %>

data/lib/tensor_stream/evaluator/kernels/argmax.cl CHANGED Viewed

@@ -1,12 +1,5 @@
- __kernel void argmax_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
-}
- __kernel void argmax_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+% c_dtype = dtype_to_c_type(dtype)
+ __kernel void argmax_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)

data/lib/tensor_stream/evaluator/kernels/argmin.cl CHANGED Viewed

@@ -1,12 +1,5 @@
- __kernel void argmin_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
-}
- __kernel void argmin_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+% c_dtype = dtype_to_c_type(dtype)
+ __kernel void argmin_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)

data/lib/tensor_stream/evaluator/kernels/cast.cl CHANGED Viewed

@@ -1,12 +1,7 @@
- __kernel void cast_int_fp(const int M, const int N, __global const int *A, __global float *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
-}
+% source_ctype = dtype_to_c_type(source_dt)
+% target_ctype = dtype_to_c_type(target_dt)
- __kernel void cast_fp_int(const int M, const int N,__global const float *A, __global int *C) {
+__kernel void cast(const int M, const int N, __global const <%= source_ctype %> *A, __global <%= target_ctype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)

data/lib/tensor_stream/evaluator/kernels/cond.cl.erb CHANGED Viewed

@@ -1,4 +1,4 @@
-% %w[fp int].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
+% ["#{dtype}"].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
 % c_dtype = dtype_to_c_type(dtype)
 % op = operator_to_c(fname)
 <%= render 'bool_operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: 'int' %>

data/lib/tensor_stream/evaluator/kernels/cos.cl CHANGED Viewed

@@ -1,4 +1,5 @@
-__kernel void cos_fp(const int M, const int N, __global const float *A, __global float *C) {
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void cos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)

data/lib/tensor_stream/evaluator/kernels/div.cl.erb CHANGED Viewed

@@ -1,5 +1,3 @@
-% %w[fp int].product(%w[div]).each do |dtype, fname|
 % c_dtype = dtype_to_c_type(dtype)
-% op = operator_to_c(fname)
-<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: c_dtype %>
-% end
+% op = operator_to_c('div')
+<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'div', dtype: dtype, result_t: c_dtype %>

data/lib/tensor_stream/evaluator/kernels/exp.cl CHANGED Viewed

@@ -1,4 +1,5 @@
-__kernel void exp_fp(const int M, const int N, __global const float *A, __global float *C) {
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void exp_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)

data/lib/tensor_stream/evaluator/kernels/gemm.cl CHANGED Viewed

@@ -1,49 +1,18 @@
 // First naive implementation
-__kernel void gemm_fp(const int M, const int N, const int K,
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
                       const int A_transpose,
                       const int B_transpose,
-                      const __global float* A,
-                      const __global float* B,
-                      __global float* C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    // Compute a single element (loop over K)
-    float acc = 0.0f;
-    for (int k=0; k<K; k++) {
-        int a_index = globalRow*K + k;
-        int b_index = k*N + globalCol;
-        if (A_transpose) {
-            a_index = M*k + globalRow;
-        }
-        if (B_transpose) {
-            b_index = globalCol*K + k;
-        }
-        acc += A[a_index] * B[b_index];
-    }
-    // Store the result
-    C[globalRow*N + globalCol] = acc;
-}
+                      const __global <%= c_dtype %>* A,
+                      const __global <%= c_dtype %>* B,
+                      __global <%= c_dtype %>* C) {
-// First naive implementation
-__kernel void gemm_int(const int M, const int N, const int K,
-                      const int A_transpose,
-                      const int B_transpose,
-                      const __global int* A,
-                      const __global int* B,
-                      __global int* C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
     // Compute a single element (loop over K)
-    int acc = 0;
+    <%= c_dtype %> acc = 0.0f;
     for (int k=0; k<K; k++) {
         int a_index = globalRow*K + k;
         int b_index = k*N + globalCol;
@@ -57,7 +26,7 @@ __kernel void gemm_int(const int M, const int N, const int K,
         }
         acc += A[a_index] * B[b_index];
     }
     // Store the result
     C[globalRow*N + globalCol] = acc;
 }

data/lib/tensor_stream/evaluator/kernels/log.cl CHANGED Viewed

@@ -1,4 +1,5 @@
-__kernel void log_fp(const int M, const int N, __global const float *A, __global float *C) {
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void log_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)

data/lib/tensor_stream/evaluator/kernels/log1p.cl CHANGED Viewed

@@ -1,4 +1,5 @@
-__kernel void log1p_fp(const int M, const int N, __global const float *A, __global float *C) {
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void log1p_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)

data/lib/tensor_stream/evaluator/kernels/max.cl CHANGED Viewed

@@ -1,5 +1,6 @@
  // same dimension add floating point op
- __kernel void max_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+% c_dtype = dtype_to_c_type(dtype)
+ __kernel void max_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -8,7 +9,7 @@
 }
  // 1D + Scalar floating point add op
- __kernel void max_c_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void max_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -21,7 +22,7 @@
 }
  // 1D + Scalar floating point add op broadcast
- __kernel void max_b_fp(const int M, const int N, const int M2, const int N2, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void max_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -42,50 +43,4 @@
     } else {
       C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] > A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] :  A[globalRow * N + globalCol];
     }
-}
- // same dimension add floating point op
- __kernel void max_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] > B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
-}
- // 1D + Scalar floating point add op
- __kernel void max_c_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] > B[0] ?  A[globalRow * N + globalCol] : B[0];
-    } else {
-      C[globalRow * N + globalCol] = B[0] > A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
-    }
-}
- // 1D + Scalar floating point add op broadcast
- __kernel void max_b_int(const int M, const int N, const int M2, const int N2, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-    if ( b_m_index >= M2) {
-      b_m_index = b_m_index % M2;
-    };
-    if (b_n_index >= N2) {
-      b_n_index = b_n_index % N2;
-    }
-    if (switch_op == 0) {
-      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] > B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
-    } else {
-      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] > A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
-    }
 }

data/lib/tensor_stream/evaluator/kernels/mul.cl CHANGED Viewed

@@ -1,5 +1,3 @@
-% %w[fp int].product(%w[mul]).each do |dtype, fname|
 % c_dtype = dtype_to_c_type(dtype)
-% op = operator_to_c(fname)
-<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: c_dtype %>
-% end
+% op = operator_to_c('mul')
+<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mul', dtype: dtype, result_t: c_dtype %>

data/lib/tensor_stream/evaluator/kernels/negate.cl CHANGED Viewed

@@ -1,12 +1,5 @@
-__kernel void negate_fp(const int M, const int N, __global const float *A, __global float *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = -A[globalRow * N + globalCol];
-}
-__kernel void negate_int(const int M, const int N, __global const int *A, __global int *C) {
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void negate_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)

data/lib/tensor_stream/evaluator/kernels/pow.cl CHANGED Viewed

@@ -1,5 +1,6 @@
  // same dimension add floating point op
- __kernel void pow_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+% c_dtype = dtype_to_c_type(dtype)
+ __kernel void pow_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -8,7 +9,7 @@
 }
  // 1D + Scalar floating point add op
- __kernel void pow_c_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void pow_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -21,7 +22,7 @@
 }
  // 1D + Scalar floating point add op broadcast
- __kernel void pow_b_fp(const int M, const int N, const int M2, const int N2, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+ __kernel void pow_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -42,89 +43,4 @@
     } else {
       C[globalRow * N + globalCol] = pow((float)B[b_m_index * N2 + b_n_index], (float)A[globalRow * N + globalCol]);
     }
-}
- // same dimension add floating point op
- __kernel void pow_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    int acc = A[globalRow * N + globalCol];
-    const int count = B[globalRow * N + globalCol];
-    const int c = A[globalRow * N + globalCol];
-    if (count < 4) {
-      for(int i = 0; i < count - 1; i++) {
-        acc *= c;
-      }
-      C[globalRow * N + globalCol] = acc;
-    } else {
-      C[globalRow * N + globalCol] = pow((float)c, (float)count);
-    }
-}
- // 1D + Scalar floating point add op
- __kernel void pow_c_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    int acc, count, c;
-    if (switch_op == 0) {
-      acc = A[globalRow * N + globalCol];
-      count = B[0];
-      c = A[globalRow * N + globalCol];
-    } else {
-      acc = B[0];
-      count = A[globalRow * N + globalCol];
-      c = B[0];
-    }
-    if (count < 4) {
-      for(int i =0; i < count - 1; i++) {
-        acc *= c;
-      }
-      C[globalRow * N + globalCol] = acc;
-    } else {
-      C[globalRow * N + globalCol] = pow((float)c, (float)count);
-    }
-}
- // 1D + Scalar floating point add op broadcast
- __kernel void pow_b_int(const int M, const int N, const int M2, const int N2, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    int b_m_index = globalRow;
-    int b_n_index = globalCol;
-    if ( b_m_index >= M2) {
-      b_m_index = b_m_index % M2;
-    };
-    if (b_n_index >= N2) {
-      b_n_index = b_n_index % N2;
-    }
-    int acc, count, c;
-    if (switch_op == 0) {
-      acc = A[globalRow * N + globalCol];
-      count = B[b_m_index * N2 + b_n_index];
-      c = A[globalRow * N + globalCol];
-    } else {
-      acc = B[b_m_index * N2 + b_n_index];
-      count = A[globalRow * N + globalCol];
-      c = B[b_m_index * N2 + b_n_index];
-    }
-    if (count < 4) {
-      for (int i = 0; i < count - 1; i++) {
-        acc *= c;
-      }
-      C[globalRow * N + globalCol] = acc;
-    } else {
-      C[globalRow * N + globalCol] = pow((float)c, (float)count);
-    }
 }

data/lib/tensor_stream/evaluator/kernels/reciprocal.cl CHANGED Viewed

@@ -1,12 +1,5 @@
-__kernel void reciprocal_fp(const int M, const int N, __global const float *A, __global float *C) {
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-    C[globalRow * N + globalCol] = 1.0f / A[globalRow * N + globalCol];
-}
-__kernel void reciprocal_int(const int M, const int N, __global const int *A, __global int *C) {
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void reciprocal_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)