tensor_stream 0.2.0 → 0.3.0

Files changed (58)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +2 -1
  3. data/CHANGELOG.md +5 -0
  4. data/README.md +28 -1
  5. data/benchmark/benchmark.rb +129 -0
  6. data/lib/tensor_stream.rb +7 -4
  7. data/lib/tensor_stream/evaluator/buffer.rb +10 -0
  8. data/lib/tensor_stream/evaluator/evaluator.rb +1 -0
  9. data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl +45 -0
  10. data/lib/tensor_stream/evaluator/kernels/_operand.cl +45 -0
  11. data/lib/tensor_stream/evaluator/kernels/abs.cl +16 -0
  12. data/lib/tensor_stream/evaluator/kernels/add.cl +5 -0
  13. data/lib/tensor_stream/evaluator/kernels/argmax.cl +15 -0
  14. data/lib/tensor_stream/evaluator/kernels/argmin.cl +15 -0
  15. data/lib/tensor_stream/evaluator/kernels/cast.cl +15 -0
  16. data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +5 -0
  17. data/lib/tensor_stream/evaluator/kernels/cos.cl +7 -0
  18. data/lib/tensor_stream/evaluator/kernels/div.cl.erb +5 -0
  19. data/lib/tensor_stream/evaluator/kernels/exp.cl +7 -0
  20. data/lib/tensor_stream/evaluator/kernels/gemm.cl +63 -0
  21. data/lib/tensor_stream/evaluator/kernels/log.cl +7 -0
  22. data/lib/tensor_stream/evaluator/kernels/log1p.cl +7 -0
  23. data/lib/tensor_stream/evaluator/kernels/max.cl +91 -0
  24. data/lib/tensor_stream/evaluator/kernels/mul.cl +5 -0
  25. data/lib/tensor_stream/evaluator/kernels/negate.cl +15 -0
  26. data/lib/tensor_stream/evaluator/kernels/pow.cl +130 -0
  27. data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +15 -0
  28. data/lib/tensor_stream/evaluator/kernels/round.cl +7 -0
  29. data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +8 -0
  30. data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +54 -0
  31. data/lib/tensor_stream/evaluator/kernels/sign.cl +23 -0
  32. data/lib/tensor_stream/evaluator/kernels/sin.cl +8 -0
  33. data/lib/tensor_stream/evaluator/kernels/sqrt.cl +8 -0
  34. data/lib/tensor_stream/evaluator/kernels/square.cl +15 -0
  35. data/lib/tensor_stream/evaluator/kernels/sub.cl +5 -0
  36. data/lib/tensor_stream/evaluator/kernels/tan.cl +7 -0
  37. data/lib/tensor_stream/evaluator/kernels/tanh.cl +7 -0
  38. data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +6 -0
  39. data/lib/tensor_stream/evaluator/kernels/where.cl +15 -0
  40. data/lib/tensor_stream/evaluator/opencl_buffer.rb +30 -0
  41. data/lib/tensor_stream/evaluator/opencl_evaluator.rb +1095 -0
  42. data/lib/tensor_stream/evaluator/opencl_template_helper.rb +58 -0
  43. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +27 -0
  44. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -31
  45. data/lib/tensor_stream/graph.rb +4 -2
  46. data/lib/tensor_stream/math_gradients.rb +3 -0
  47. data/lib/tensor_stream/operation.rb +29 -2
  48. data/lib/tensor_stream/ops.rb +14 -2
  49. data/lib/tensor_stream/placeholder.rb +1 -1
  50. data/lib/tensor_stream/session.rb +10 -3
  51. data/lib/tensor_stream/tensor_shape.rb +1 -1
  52. data/lib/tensor_stream/train/saver.rb +1 -1
  53. data/lib/tensor_stream/variable.rb +7 -1
  54. data/lib/tensor_stream/version.rb +1 -1
  55. data/samples/logistic_regression.rb +2 -1
  56. data/samples/nearest_neighbor.rb +54 -0
  57. data/tensor_stream.gemspec +3 -1
  58. metadata +107 -28
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
-SHA256:
-  metadata.gz: a5a6dce7a4317dee2e9fba536056d0d346ac9fd4a2763396a5735204f77b90c6
-  data.tar.gz: 4a6d2973badfa0f2ac20850f6885181c4f19b6fe00e416e9f8394f89ac267f77
+SHA1:
+  metadata.gz: 7f758a576604fad842e40ff2c64f5d7deef05e5c
+  data.tar.gz: f94636819dc9fab55bf53bc198a8ad4854728249
 SHA512:
-  metadata.gz: 285c9deb129680a9050a2afa9811de1d3cbcf2250cb78b4b6eab06f225bf8a5532d301a4cad7c0e9d39b7b16bcffb0527c7b6cadaad8b840a78cadc8fcf92c80
-  data.tar.gz: 2021fc72c95a8aad4e8b7f8a5b25fa7ce64e715ae08be9d3c325f058155f1a2ea9d58be38aae8f4849dabe067c75e140f174efe498503a86c7a73820dc367e5d
+  metadata.gz: b167b5160330e4421ebbbce9c7e2235db0d145f1f99b37731b63a9e32182dbdc38d473c86318be8023d86bb4ae0aae4104f66856911404dcdd478aaed8df9f43
+  data.tar.gz: aa1264ef33f8e7550dc917cb29d6c183e932b9637e8262e6c87eca4cea337d640d5576d9cf58b2bcf9714304fe8fa16fc95a3be6e0f879ab3678f0bccb6233ae
data/.circleci/config.yml CHANGED
@@ -42,8 +42,9 @@ jobs:
           command: |
             mkdir /tmp/test-results
             TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb" | circleci tests split --split-by=timings)"
-
+
             bundle exec rspec -r rspec_junit_formatter --format progress \
+              --exclude-pattern "spec/tensor_stream/evaluators/opencl_*.rb" \
              --format RspecJunitFormatter \
              --out /tmp/test-results/rspec.xml \
              --format progress \
data/CHANGELOG.md CHANGED
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.3.0] - 2018-06-05
+### Added
+- hardware acceleration using OpenCL
+- working nearest neighbor sample (use the opencl evaluator for best performance)
+
 ## [0.2.0] - 2018-05-27
 ### Added
 - working logistic regression sample
data/README.md CHANGED
@@ -4,7 +4,7 @@
 
 # TensorStream
 
-A reimplementation of TensorFlow for ruby. This is a ground up implementation with no dependency on TensorFlow. Effort has been made to make the programming style as near to TensorFlow as possible, comes with a pure ruby evaluator by default as well with support for an opencl evaluator.
+A reimplementation of TensorFlow for ruby. This is a ground up implementation with no dependency on TensorFlow. Effort has been made to make the programming style as near to TensorFlow as possible. It comes with a pure ruby evaluator by default, with support for an opencl evaluator for large models and datasets.
 
 The goal of this gem is to have a high performance machine learning and compute solution for ruby with support for a wide range of hardware and software configuration.
 
@@ -176,6 +176,33 @@ f = tf.matmul(a, b).breakpoint! { |tensor, a, b, result_value| binding.pry }
 tf.session.run(f)
 ```
 
+### OpenCL
+
+For OpenCL support, make sure that the required OpenCL drivers for your hardware are correctly installed on your system.
+Note that OpenCL is only supported on ruby-mri at the moment.
+
+Also include the following gem in your project:
+
+```
+gem 'opencl_ruby_ffi'
+```
+
+To use the opencl evaluator instead of the ruby evaluator:
+
+```ruby
+require 'tensor_stream/evaluator/opencl_evaluator'
+
+# set session to use the opencl evaluator
+sess = tf.session(:opencl_evaluator)
+
+sess.run(....) # do stuff
+```
+
+Note that the OpenCL evaluator only provides a speedup for large tensors; graphs that operate mainly on scalars, like the linear regression sample, will actually run slower.
+
+samples/nearest_neighbor.rb contains a sample that uses opencl.
+
 # Visualization
 
 tensorstream does not support tensorboard yet, but a graphml generator is included:
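One practical note on the OpenCL setup described above: since opencl_ruby_ffi is an optional, MRI-only dependency, a script can guard the require and fall back to the default pure ruby evaluator when the bindings are unavailable. A minimal sketch, assuming the require raises LoadError in that case:

```ruby
require 'tensor_stream'

begin
  # optional: only works when the opencl_ruby_ffi gem is installed
  require 'tensor_stream/evaluator/opencl_evaluator'
  evaluator = :opencl_evaluator
rescue LoadError
  evaluator = nil # fall back to the default pure ruby evaluator
end

tf = TensorStream
sess = evaluator ? tf.session(evaluator) : tf.session
sess.run(tf.constant(1.0) + tf.constant(2.0)) # => 3.0
```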
data/benchmark/benchmark.rb ADDED
@@ -0,0 +1,129 @@
+require "bundler/setup"
+require 'tensor_stream'
+require 'benchmark'
+require 'pry-byebug'
+require 'awesome_print'
+require 'tensor_stream/evaluator/opencl_evaluator'
+
+def tr(t, places = 1)
+  if t.is_a?(Array)
+    return t.collect do |v|
+      tr(v)
+    end
+  end
+
+  return t unless t.kind_of?(Float)
+
+  t.round(places)
+end
+
+tf = TensorStream
+
+srand(5)
+seed = 5
+tf.set_random_seed(seed)
+
+a = tf.constant([
+  [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+])
+
+a_int = tf.constant([
+  [1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
+  [2, 2, 3, 4, 4, 1, 1, 1, 1, 4, 1, 1],
+  [3, 2, 3, 4, 0, 1, 1, 2, 1, 1, 2, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 3, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 4, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 5, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 6, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 0, 0, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 6, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
+])
+
+b = tf.constant([
+  [1.1, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.1, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.1, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 0.3, 1.0, 2.0, 1.3],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 1.11, 1.4, 5.1, 1.4],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.22, 1.1, 6.1, 1.5],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 0.8, 0.25, 1.6],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 6.5, 1.7],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.8],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.9],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.9],
+])
+
+c = tf.constant([
+  [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+])
+
+d = tf.constant([
+  [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+])
+
+p = tf.placeholder("float")
+q = tf.placeholder("float")
+
+model = -tf.sin(a.dot(b + p) + c).dot(a) + tf.cos(a.dot(d + q))
+single_function_test = (tf.sigmoid(a * p) * tf.sigmoid(b * q)) + c
+pow_f = tf.pow(a, 3)
+pow_i = tf.pow(a_int, 3)
+
+sess = tf.session
+sess2 = tf.session(:opencl_evaluator)
+
+# verify correctness
+10.times do
+  feed = { p => rand, q => rand }
+  x = sess.run(model, feed_dict: feed)
+  y = sess2.run(model, feed_dict: feed)
+  fail "not equal #{tr(x.first)} != #{tr(y.first)}" if tr(x) != tr(y)
+end
+
+Benchmark.bmbm do |x|
+  x.report("pure ruby :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
+  x.report("pure ruby single function:") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl single function:") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+  x.report("pure ruby pow float:") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl pow float:") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+  x.report("pure ruby pow int:") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl pow int:") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+end
data/lib/tensor_stream.rb CHANGED
@@ -62,14 +62,15 @@ module TensorStream
   end
 
   def self.variable(value, name: nil, initializer: nil, graph: nil, dtype: nil, trainable: true)
+    op = Operation.new(:assign, nil, value)
     common_options = {
-      initializer: initializer || Operation.new(:assign, nil, value),
+      initializer: initializer || op,
       name: name,
       graph: graph,
       dtype: dtype,
       trainable: trainable
     }
-    if value.is_a?(String)
+    tensor = if value.is_a?(String)
       TensorStream::Variable.new(dtype || :string, 0, [], common_options)
     elsif value.is_a?(Integer)
       TensorStream::Variable.new(dtype || :int32, 0, [], common_options)
@@ -78,6 +79,8 @@ module TensorStream
     else
       TensorStream::Variable.new(dtype || :float32, 0, nil, common_options)
     end
+    op.items[0] = tensor
+    tensor
   end
 
   def self.variable_scope(scope = nil, reuse: nil, initializer: nil)
@@ -166,8 +169,8 @@ module TensorStream
     Graph.get_default_graph.get_collection(name, options)
   end
 
-  def self.placeholder(dtype, options = {})
-    TensorStream::Placeholder.new(dtype, nil, options[:shape])
+  def self.placeholder(dtype, shape: nil)
+    TensorStream::Placeholder.new(dtype, nil, shape)
   end
 
   def self.global_variables_initializer
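About the variable hunk above: the back-patching (`op.items[0] = tensor`) makes the default `:assign` initializer point back at the variable it creates, so running the initializer assigns the initial value into that variable. A usage sketch of the affected path, assuming `sess.run` on a variable returns its current value:

```ruby
require 'tensor_stream'

tf = TensorStream

w = tf.variable(0.2, name: 'weight')    # default initializer is an :assign op on 0.2
init = tf.global_variables_initializer  # collects each variable's initializer

sess = tf.session
sess.run(init)  # runs each :assign, giving w its initial value
sess.run(w)     # => 0.2
```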
data/lib/tensor_stream/evaluator/buffer.rb ADDED
@@ -0,0 +1,10 @@
+module TensorStream
+  # this class represents an evaluator specific native buffer
+  class Buffer
+    attr_accessor :dirty, :name
+
+    def to_ruby
+      raise "not implemented"
+    end
+  end
+end
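`Buffer` is an abstract base; evaluator-specific buffers (such as the OpenCL buffer added in this changeset, see opencl_buffer.rb in the file list) subclass it and implement `to_ruby`, which converts the native buffer back into plain ruby data. A hypothetical minimal subclass for illustration only; the real implementation lives in lib/tensor_stream/evaluator/opencl_buffer.rb:

```ruby
module TensorStream
  # hypothetical example subclass, not part of the gem
  class ExampleBuffer < Buffer
    def initialize(name, data)
      @name = name
      @data = data      # stand-in for a native/device buffer
      @dirty = false    # set when device memory diverges from the host copy
    end

    def to_ruby
      @data.dup         # return a plain ruby representation of the buffer
    end
  end
end
```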
data/lib/tensor_stream/evaluator/evaluator.rb CHANGED
@@ -1,5 +1,6 @@
 
 require 'tensor_stream/evaluator/ruby_evaluator'
+require 'tensor_stream/evaluator/buffer'
 
 module TensorStream
   module Evaluator
data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl ADDED
@@ -0,0 +1,45 @@
+// same dimension element-wise comparison op, writes 1 or 0 into the int result
+__kernel void <%= fname %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol] ? 1 : 0;
+}
+
+// tensor vs scalar comparison (switch_op swaps the operand order)
+__kernel void <%= fname %>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    if (switch_op == 0) {
+        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0] ? 1 : 0;
+    } else {
+        C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+    }
+}
+
+// broadcast comparison: B's indices wrap when B is smaller than A
+__kernel void <%= fname %>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    int b_m_index = globalRow;
+    int b_n_index = globalCol;
+
+    if (b_m_index >= M2) {
+        b_m_index = b_m_index % M2;
+    }
+
+    if (b_n_index >= N2) {
+        b_n_index = b_n_index % N2;
+    }
+
+    if (switch_op == 0) {
+        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index] ? 1 : 0;
+    } else {
+        C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+    }
+}
data/lib/tensor_stream/evaluator/kernels/_operand.cl ADDED
@@ -0,0 +1,45 @@
+// same dimension element-wise binary op
+__kernel void <%= fname %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol];
+}
+
+// tensor vs scalar variant (switch_op swaps the operand order)
+__kernel void <%= fname %>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    if (switch_op == 0) {
+        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0];
+    } else {
+        C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol];
+    }
+}
+
+// broadcast variant: B's indices wrap when B is smaller than A
+__kernel void <%= fname %>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    int b_m_index = globalRow;
+    int b_n_index = globalCol;
+
+    if (b_m_index >= M2) {
+        b_m_index = b_m_index % M2;
+    }
+
+    if (b_n_index >= N2) {
+        b_n_index = b_n_index % N2;
+    }
+
+    if (switch_op == 0) {
+        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index];
+    } else {
+        C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol];
+    }
+}
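The `_b_` kernels in both templates above implement a simple modulo broadcast: when B has fewer rows or columns than the M x N output, its indices wrap around. The same index math in ruby, as a sketch of what each work-item computes (`broadcast_read` is illustrative, not part of the gem):

```ruby
# mirror of the broadcast index math in the *_b_* kernels:
# each work-item (row, col) of the M x N output reads B at the wrapped index
def broadcast_read(b, row, col, m2, n2)
  b_m = row >= m2 ? row % m2 : row
  b_n = col >= n2 ? col % n2 : col
  b[b_m * n2 + b_n]
end

b = [10, 20] # a 1 x 2 tensor, flattened (M2 = 1, N2 = 2)
# broadcasting against a 2 x 2 output:
(0...2).map { |i| (0...2).map { |j| broadcast_read(b, i, j, 1, 2) } }
# => [[10, 20], [10, 20]]
```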
data/lib/tensor_stream/evaluator/kernels/abs.cl ADDED
@@ -0,0 +1,16 @@
+
+__kernel void abs_fp(const int M, const int N, __global const float *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
+}
+
+__kernel void abs_int(const int M, const int N, __global const int *A, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
+}
data/lib/tensor_stream/evaluator/kernels/add.cl ADDED
@@ -0,0 +1,5 @@
+% %w[fp int].product(%w[add]).each do |dtype, fname|
+% c_dtype = dtype_to_c_type(dtype)
+% op = operator_to_c(fname)
+<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: c_dtype %>
+% end
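The `%`-prefixed lines above are ERB directives: for each (dtype, fname) pair the template inlines `_operand.cl` with the C type and operator substituted, yielding kernels named `add_fp` and `add_int`. A rough sketch of the expansion, assuming `dtype_to_c_type('fp')` returns `'float'` and `operator_to_c('add')` returns `'+'` (the real helpers live in opencl_template_helper.rb; the one-line template here is a simplified stand-in):

```ruby
require 'erb'

# hypothetical stand-ins for the helpers in opencl_template_helper.rb
def dtype_to_c_type(dtype)
  { 'fp' => 'float', 'int' => 'int' }[dtype]
end

def operator_to_c(op)
  { 'add' => '+' }[op]
end

# simplified one-line stand-in for _operand.cl
template = "__kernel void <%= fname %>_<%= dtype %>(__global const <%= c_dtype %> *A, " \
           "__global <%= c_dtype %> *B, __global <%= result_t %> *C) { C[0] = A[0] <%= op %> B[0]; }"

fname, dtype = 'add', 'fp'
c_dtype  = dtype_to_c_type(dtype) # => "float"
op       = operator_to_c(fname)   # => "+"
result_t = c_dtype

# render with local bindings, as the evaluator's template helper does
puts ERB.new(template).result(binding)
# => __kernel void add_fp(__global const float *A, __global float *B,
#    __global float *C) { C[0] = A[0] + B[0]; }
```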
data/lib/tensor_stream/evaluator/kernels/argmax.cl ADDED
@@ -0,0 +1,15 @@
+__kernel void argmax_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
+
+__kernel void argmax_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
data/lib/tensor_stream/evaluator/kernels/argmin.cl ADDED
@@ -0,0 +1,15 @@
+__kernel void argmin_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
+
+__kernel void argmin_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
data/lib/tensor_stream/evaluator/kernels/cast.cl ADDED
@@ -0,0 +1,15 @@
+__kernel void cast_int_fp(const int M, const int N, __global const int *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
+}
+
+__kernel void cast_fp_int(const int M, const int N, __global const float *A, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
+}
data/lib/tensor_stream/evaluator/kernels/cond.cl.erb ADDED
@@ -0,0 +1,5 @@
+% %w[fp int].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
+% c_dtype = dtype_to_c_type(dtype)
+% op = operator_to_c(fname)
+<%= render 'bool_operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: 'int' %>
+% end