tensor_stream 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +2 -1
  3. data/CHANGELOG.md +5 -0
  4. data/README.md +28 -1
  5. data/benchmark/benchmark.rb +129 -0
  6. data/lib/tensor_stream.rb +7 -4
  7. data/lib/tensor_stream/evaluator/buffer.rb +10 -0
  8. data/lib/tensor_stream/evaluator/evaluator.rb +1 -0
  9. data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl +45 -0
  10. data/lib/tensor_stream/evaluator/kernels/_operand.cl +45 -0
  11. data/lib/tensor_stream/evaluator/kernels/abs.cl +16 -0
  12. data/lib/tensor_stream/evaluator/kernels/add.cl +5 -0
  13. data/lib/tensor_stream/evaluator/kernels/argmax.cl +15 -0
  14. data/lib/tensor_stream/evaluator/kernels/argmin.cl +15 -0
  15. data/lib/tensor_stream/evaluator/kernels/cast.cl +15 -0
  16. data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +5 -0
  17. data/lib/tensor_stream/evaluator/kernels/cos.cl +7 -0
  18. data/lib/tensor_stream/evaluator/kernels/div.cl.erb +5 -0
  19. data/lib/tensor_stream/evaluator/kernels/exp.cl +7 -0
  20. data/lib/tensor_stream/evaluator/kernels/gemm.cl +63 -0
  21. data/lib/tensor_stream/evaluator/kernels/log.cl +7 -0
  22. data/lib/tensor_stream/evaluator/kernels/log1p.cl +7 -0
  23. data/lib/tensor_stream/evaluator/kernels/max.cl +91 -0
  24. data/lib/tensor_stream/evaluator/kernels/mul.cl +5 -0
  25. data/lib/tensor_stream/evaluator/kernels/negate.cl +15 -0
  26. data/lib/tensor_stream/evaluator/kernels/pow.cl +130 -0
  27. data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +15 -0
  28. data/lib/tensor_stream/evaluator/kernels/round.cl +7 -0
  29. data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +8 -0
  30. data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +54 -0
  31. data/lib/tensor_stream/evaluator/kernels/sign.cl +23 -0
  32. data/lib/tensor_stream/evaluator/kernels/sin.cl +8 -0
  33. data/lib/tensor_stream/evaluator/kernels/sqrt.cl +8 -0
  34. data/lib/tensor_stream/evaluator/kernels/square.cl +15 -0
  35. data/lib/tensor_stream/evaluator/kernels/sub.cl +5 -0
  36. data/lib/tensor_stream/evaluator/kernels/tan.cl +7 -0
  37. data/lib/tensor_stream/evaluator/kernels/tanh.cl +7 -0
  38. data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +6 -0
  39. data/lib/tensor_stream/evaluator/kernels/where.cl +15 -0
  40. data/lib/tensor_stream/evaluator/opencl_buffer.rb +30 -0
  41. data/lib/tensor_stream/evaluator/opencl_evaluator.rb +1095 -0
  42. data/lib/tensor_stream/evaluator/opencl_template_helper.rb +58 -0
  43. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +27 -0
  44. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -31
  45. data/lib/tensor_stream/graph.rb +4 -2
  46. data/lib/tensor_stream/math_gradients.rb +3 -0
  47. data/lib/tensor_stream/operation.rb +29 -2
  48. data/lib/tensor_stream/ops.rb +14 -2
  49. data/lib/tensor_stream/placeholder.rb +1 -1
  50. data/lib/tensor_stream/session.rb +10 -3
  51. data/lib/tensor_stream/tensor_shape.rb +1 -1
  52. data/lib/tensor_stream/train/saver.rb +1 -1
  53. data/lib/tensor_stream/variable.rb +7 -1
  54. data/lib/tensor_stream/version.rb +1 -1
  55. data/samples/logistic_regression.rb +2 -1
  56. data/samples/nearest_neighbor.rb +54 -0
  57. data/tensor_stream.gemspec +3 -1
  58. metadata +107 -28
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA256:
- metadata.gz: a5a6dce7a4317dee2e9fba536056d0d346ac9fd4a2763396a5735204f77b90c6
- data.tar.gz: 4a6d2973badfa0f2ac20850f6885181c4f19b6fe00e416e9f8394f89ac267f77
+ SHA1:
+ metadata.gz: 7f758a576604fad842e40ff2c64f5d7deef05e5c
+ data.tar.gz: f94636819dc9fab55bf53bc198a8ad4854728249
  SHA512:
- metadata.gz: 285c9deb129680a9050a2afa9811de1d3cbcf2250cb78b4b6eab06f225bf8a5532d301a4cad7c0e9d39b7b16bcffb0527c7b6cadaad8b840a78cadc8fcf92c80
- data.tar.gz: 2021fc72c95a8aad4e8b7f8a5b25fa7ce64e715ae08be9d3c325f058155f1a2ea9d58be38aae8f4849dabe067c75e140f174efe498503a86c7a73820dc367e5d
+ metadata.gz: b167b5160330e4421ebbbce9c7e2235db0d145f1f99b37731b63a9e32182dbdc38d473c86318be8023d86bb4ae0aae4104f66856911404dcdd478aaed8df9f43
+ data.tar.gz: aa1264ef33f8e7550dc917cb29d6c183e932b9637e8262e6c87eca4cea337d640d5576d9cf58b2bcf9714304fe8fa16fc95a3be6e0f879ab3678f0bccb6233ae
data/.circleci/config.yml CHANGED
@@ -42,8 +42,9 @@ jobs:
  command: |
  mkdir /tmp/test-results
  TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb" | circleci tests split --split-by=timings)"
-
+
  bundle exec rspec -r rspec_junit_formatter --format progress \
+   --exclude-pattern "spec/tensor_stream/evaluators/opencl_*.rb" \
  --format RspecJunitFormatter \
  --out /tmp/test-results/rspec.xml \
  --format progress \
data/CHANGELOG.md CHANGED
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+ ## [0.3.0] - 2018-06-05
+ ### Added
+ - hardware acceleration using OpenCL
+ - working nearest neighbor sample (use opencl evaluator for best performance)
+
  ## [0.2.0] - 2018-05-27
  ### Added
  - working logistic regression sample
data/README.md CHANGED
@@ -4,7 +4,7 @@

  # TensorStream

- A reimplementation of TensorFlow for ruby. This is a ground up implementation with no dependency on TensorFlow. Effort has been made to make the programming style as near to TensorFlow as possible, comes with a pure ruby evaluator by default as well with support for an opencl evaluator.
+ A reimplementation of TensorFlow for ruby. This is a ground-up implementation with no dependency on TensorFlow. Effort has been made to keep the programming style as close to TensorFlow as possible. It comes with a pure ruby evaluator by default, with support for an OpenCL evaluator for large models and datasets.

  The goal of this gem is to have a high performance machine learning and compute solution for ruby with support for a wide range of hardware and software configuration.

@@ -176,6 +176,33 @@ f = tf.matmul(a, b).breakpoint! { |tensor, a, b, result_value| binding.pry }
  tf.session.run(f)
  ```

+ ### OpenCL
+
+ For OpenCL support, make sure that the required OpenCL drivers for your hardware are correctly installed on your system.
+ Note that OpenCL is only supported on ruby-mri at the moment.
+
+ Include the following gem in your project:
+
+ ```
+ gem 'opencl_ruby_ffi'
+ ```
+
+ To use the OpenCL evaluator instead of the ruby evaluator:
+
+ ```ruby
+ require 'tensor_stream/evaluator/opencl_evaluator'
+
+ # set session to use the opencl evaluator
+ sess = tf.session(:opencl_evaluator)
+
+ sess.run(....) # do stuff
+
+ ```
+
+ Note that the OpenCL evaluator only provides a speedup for large tensors; models that operate mostly on scalars, like the linear regression sample, will actually run slower.
+
+ samples/nearest_neighbor.rb contains a sample that uses OpenCL.
+
  # Visualization

  tensorstream does not support tensorboard yet, but a graphml generator is included:
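For reference, the README snippets above combine into the following minimal script. This is a sketch only: it assumes working OpenCL drivers and the opencl_ruby_ffi gem, and the tiny matrices are purely illustrative (far too small to see any speedup).

```ruby
require 'tensor_stream'
require 'tensor_stream/evaluator/opencl_evaluator' # needs the opencl_ruby_ffi gem

tf = TensorStream

# tiny constants purely for illustration; OpenCL only pays off on large tensors
a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
b = tf.constant([[5.0, 6.0], [7.0, 8.0]])

sess = tf.session(:opencl_evaluator) # use the OpenCL evaluator
puts sess.run(tf.matmul(a, b)).inspect
```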
data/benchmark/benchmark.rb ADDED
@@ -0,0 +1,129 @@
+ require "bundler/setup"
+ require 'tensor_stream'
+ require 'benchmark'
+ require 'pry-byebug'
+ require 'awesome_print'
+ require 'tensor_stream/evaluator/opencl_evaluator'
+
+ def tr(t, places = 1)
+   if t.is_a?(Array)
+     return t.collect do |v|
+       tr(v)
+     end
+   end
+
+   return t unless t.kind_of?(Float)
+
+   t.round(places)
+ end
+
+ tf = TensorStream
+
+ srand(5)
+ seed = 5
+ tf.set_random_seed(seed)
+
+ a = tf.constant([
+   [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+   [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+   [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+ ])
+
+ a_int = tf.constant([
+   [1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
+   [2, 2, 3, 4, 4, 1, 1, 1, 1, 4, 1, 1],
+   [3, 2, 3, 4, 0, 1, 1, 2, 1, 1, 2, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 3, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 4, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 5, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 6, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 0, 0, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 6, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
+ ])
+
+ b = tf.constant([
+   [1.1, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+   [2.1, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+   [3.1, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 0.3, 1.0, 2.0, 1.3],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 1.11, 1.4, 5.1, 1.4],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.22, 1.1, 6.1, 1.5],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 0.8, 0.25, 1.6],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 6.5, 1.7],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.8],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.9],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.9],
+ ])
+
+ c = tf.constant([
+   [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+   [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+   [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+ ])
+
+ d = tf.constant([
+   [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+   [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+   [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+ ])
+
+ p = tf.placeholder("float")
+ q = tf.placeholder("float")
+
+ model = -tf.sin(a.dot(b + p) + c).dot(a) + tf.cos(a.dot(d + q))
+ single_function_test = (tf.sigmoid(a * p) * tf.sigmoid(b * q)) + c
+ pow_f = tf.pow(a, 3)
+ pow_i = tf.pow(a_int, 3)
+
+ sess = tf.session
+ sess2 = tf.session(:opencl_evaluator)
+
+ # verify correctness
+ 10.times do
+   feed = { p => rand, q => rand }
+   x = sess.run(model, feed_dict: feed )
+   y = sess2.run(model, feed_dict: feed )
+   fail "not equal #{tr(x.first)} != #{tr(y.first)}" if tr(x) != tr(y)
+ end
+
+ Benchmark.bmbm do |x|
+   x.report("pure ruby :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
+   x.report("opencl :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
+   x.report("pure ruby single function:") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+   x.report("opencl singlefunction:") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+   x.report("pure ruby pow float:") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+   x.report("opencl pow float:") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+   x.report("pure ruby pow int:") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+   x.report("opencl pow int:") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+ end
data/lib/tensor_stream.rb CHANGED
@@ -62,14 +62,15 @@ module TensorStream
  end

  def self.variable(value, name: nil, initializer: nil, graph: nil, dtype: nil, trainable: true)
+ op = Operation.new(:assign, nil, value)
  common_options = {
- initializer: initializer || Operation.new(:assign, nil, value),
+ initializer: initializer || op,
  name: name,
  graph: graph,
  dtype: dtype,
  trainable: trainable
  }
- if value.is_a?(String)
+ tensor = if value.is_a?(String)
  TensorStream::Variable.new(dtype || :string, 0, [], common_options)
  elsif value.is_a?(Integer)
  TensorStream::Variable.new(dtype || :int32, 0, [], common_options)
@@ -78,6 +79,8 @@ module TensorStream
  else
  TensorStream::Variable.new(dtype || :float32, 0, nil, common_options)
  end
+ op.items[0] = tensor
+ tensor
  end

  def self.variable_scope(scope = nil, reuse: nil, initializer: nil)
@@ -166,8 +169,8 @@ module TensorStream
  Graph.get_default_graph.get_collection(name, options)
  end

- def self.placeholder(dtype, options = {})
- TensorStream::Placeholder.new(dtype, nil, options[:shape])
+ def self.placeholder(dtype, shape: nil)
+ TensorStream::Placeholder.new(dtype, nil, shape)
  end

  def self.global_variables_initializer
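The placeholder change above replaces the options hash with a shape keyword argument, and the variable change wires the implicit assign Operation back to the Variable it initializes. A minimal sketch of the call style the new placeholder signature implies (the shape value here is illustrative):

```ruby
tf = TensorStream

# shape is now a real keyword argument instead of an options-hash entry
x = tf.placeholder(:float32, shape: [2, 2])

# dtype can also be passed as a string, as the bundled samples do
y = tf.placeholder("float")
```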
data/lib/tensor_stream/evaluator/buffer.rb ADDED
@@ -0,0 +1,10 @@
+ module TensorStream
+   # this class represents an evaluator specific native buffer
+   class Buffer
+     attr_accessor :dirty, :name
+
+     def to_ruby
+       raise "not implemented"
+     end
+   end
+ end
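Buffer acts as an abstract base class: concrete evaluators subclass it and implement to_ruby to copy device-side data back into plain ruby values (opencl_buffer.rb in the file list above does this for OpenCL). A hypothetical sketch of the contract — DummyBuffer and its internals are illustrative, not part of the gem:

```ruby
module TensorStream
  # hypothetical buffer backed by a plain ruby array, for illustration only
  class DummyBuffer < Buffer
    def initialize(name, data)
      @name = name
      @data = data
      @dirty = false # would be set when device memory diverges from @data
    end

    def to_ruby
      @data # a real evaluator would read back from device memory here
    end
  end
end
```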
data/lib/tensor_stream/evaluator/evaluator.rb CHANGED
@@ -1,5 +1,6 @@

  require 'tensor_stream/evaluator/ruby_evaluator'
+ require 'tensor_stream/evaluator/buffer'

  module TensorStream
  module Evaluator
data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl ADDED
@@ -0,0 +1,45 @@
+ // same dimension add floating point op
+ __kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol] ? 1 : 0;
+ }
+
+ // 1D + Scalar floating point add op
+ __kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0] ? 1 : 0;
+   } else {
+     C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+   }
+ }
+
+ // 1D + Scalar floating point add op broadcast
+ __kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   int b_m_index = globalRow;
+   int b_n_index = globalCol;
+
+   if ( b_m_index >= M2) {
+     b_m_index = b_m_index % M2;
+   };
+
+   if (b_n_index >= N2) {
+     b_n_index = b_n_index % N2;
+   }
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index] ? 1 : 0;
+   } else {
+     C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+   }
+ }
data/lib/tensor_stream/evaluator/kernels/_operand.cl ADDED
@@ -0,0 +1,45 @@
+ // same dimension add floating point op
+ __kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol];
+ }
+
+ // 1D + Scalar floating point add op
+ __kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0];
+   } else {
+     C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol];
+   }
+ }
+
+ // 1D + Scalar floating point add op broadcast
+ __kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   int b_m_index = globalRow;
+   int b_n_index = globalCol;
+
+   if ( b_m_index >= M2) {
+     b_m_index = b_m_index % M2;
+   };
+
+   if (b_n_index >= N2) {
+     b_n_index = b_n_index % N2;
+   }
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index];
+   } else {
+     C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol];
+   }
+ }
data/lib/tensor_stream/evaluator/kernels/abs.cl ADDED
@@ -0,0 +1,16 @@
+
+ __kernel void abs_fp(const int M, const int N, __global const float *A, __global float *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
+ }
+
+ __kernel void abs_int(const int M, const int N, __global const int *A, __global int *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/evaluator/kernels/add.cl ADDED
@@ -0,0 +1,5 @@
+ % %w[fp int].product(%w[add]).each do |dtype, fname|
+ % c_dtype = dtype_to_c_type(dtype)
+ % op = operator_to_c(fname)
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: c_dtype %>
+ % end
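add.cl is itself an ERB template: the %-prefixed lines execute Ruby during rendering, so the _operand.cl partial above is expanded once per dtype (fp and int). A rough sketch of how such a template could be rendered standalone — the helper bodies below are assumptions inferred from how the kernels use them (the real dtype_to_c_type and operator_to_c live in opencl_template_helper.rb, which is not shown in this excerpt):

```ruby
require 'erb'

# hypothetical stand-ins for the template helpers
def dtype_to_c_type(dtype)
  { 'fp' => 'float', 'int' => 'int' }.fetch(dtype)
end

def operator_to_c(fname)
  { 'add' => '+', 'sub' => '-', 'mul' => '*' }.fetch(fname)
end

fname, dtype = 'add', 'fp'
c_dtype = result_t = dtype_to_c_type(dtype)
op = operator_to_c(fname)

# render the _operand.cl partial directly (path as laid out in this gem);
# the output defines the concrete kernels add_fp, add_c_fp and add_b_fp
template = File.read('lib/tensor_stream/evaluator/kernels/_operand.cl')
puts ERB.new(template).result(binding)
```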
data/lib/tensor_stream/evaluator/kernels/argmax.cl ADDED
@@ -0,0 +1,15 @@
+ __kernel void argmax_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ }
+
+ __kernel void argmax_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/evaluator/kernels/argmin.cl ADDED
@@ -0,0 +1,15 @@
+ __kernel void argmin_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ }
+
+ __kernel void argmin_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/evaluator/kernels/cast.cl ADDED
@@ -0,0 +1,15 @@
+ __kernel void cast_int_fp(const int M, const int N, __global const int *A, __global float *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
+ }
+
+ __kernel void cast_fp_int(const int M, const int N,__global const float *A, __global int *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/evaluator/kernels/cond.cl.erb ADDED
@@ -0,0 +1,5 @@
+ % %w[fp int].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
+ % c_dtype = dtype_to_c_type(dtype)
+ % op = operator_to_c(fname)
+ <%= render 'bool_operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: 'int' %>
+ % end