tensor_stream 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +2 -1
  3. data/CHANGELOG.md +5 -0
  4. data/README.md +28 -1
  5. data/benchmark/benchmark.rb +129 -0
  6. data/lib/tensor_stream.rb +7 -4
  7. data/lib/tensor_stream/evaluator/buffer.rb +10 -0
  8. data/lib/tensor_stream/evaluator/evaluator.rb +1 -0
  9. data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl +45 -0
  10. data/lib/tensor_stream/evaluator/kernels/_operand.cl +45 -0
  11. data/lib/tensor_stream/evaluator/kernels/abs.cl +16 -0
  12. data/lib/tensor_stream/evaluator/kernels/add.cl +5 -0
  13. data/lib/tensor_stream/evaluator/kernels/argmax.cl +15 -0
  14. data/lib/tensor_stream/evaluator/kernels/argmin.cl +15 -0
  15. data/lib/tensor_stream/evaluator/kernels/cast.cl +15 -0
  16. data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +5 -0
  17. data/lib/tensor_stream/evaluator/kernels/cos.cl +7 -0
  18. data/lib/tensor_stream/evaluator/kernels/div.cl.erb +5 -0
  19. data/lib/tensor_stream/evaluator/kernels/exp.cl +7 -0
  20. data/lib/tensor_stream/evaluator/kernels/gemm.cl +63 -0
  21. data/lib/tensor_stream/evaluator/kernels/log.cl +7 -0
  22. data/lib/tensor_stream/evaluator/kernels/log1p.cl +7 -0
  23. data/lib/tensor_stream/evaluator/kernels/max.cl +91 -0
  24. data/lib/tensor_stream/evaluator/kernels/mul.cl +5 -0
  25. data/lib/tensor_stream/evaluator/kernels/negate.cl +15 -0
  26. data/lib/tensor_stream/evaluator/kernels/pow.cl +130 -0
  27. data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +15 -0
  28. data/lib/tensor_stream/evaluator/kernels/round.cl +7 -0
  29. data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +8 -0
  30. data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +54 -0
  31. data/lib/tensor_stream/evaluator/kernels/sign.cl +23 -0
  32. data/lib/tensor_stream/evaluator/kernels/sin.cl +8 -0
  33. data/lib/tensor_stream/evaluator/kernels/sqrt.cl +8 -0
  34. data/lib/tensor_stream/evaluator/kernels/square.cl +15 -0
  35. data/lib/tensor_stream/evaluator/kernels/sub.cl +5 -0
  36. data/lib/tensor_stream/evaluator/kernels/tan.cl +7 -0
  37. data/lib/tensor_stream/evaluator/kernels/tanh.cl +7 -0
  38. data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +6 -0
  39. data/lib/tensor_stream/evaluator/kernels/where.cl +15 -0
  40. data/lib/tensor_stream/evaluator/opencl_buffer.rb +30 -0
  41. data/lib/tensor_stream/evaluator/opencl_evaluator.rb +1095 -0
  42. data/lib/tensor_stream/evaluator/opencl_template_helper.rb +58 -0
  43. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +27 -0
  44. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -31
  45. data/lib/tensor_stream/graph.rb +4 -2
  46. data/lib/tensor_stream/math_gradients.rb +3 -0
  47. data/lib/tensor_stream/operation.rb +29 -2
  48. data/lib/tensor_stream/ops.rb +14 -2
  49. data/lib/tensor_stream/placeholder.rb +1 -1
  50. data/lib/tensor_stream/session.rb +10 -3
  51. data/lib/tensor_stream/tensor_shape.rb +1 -1
  52. data/lib/tensor_stream/train/saver.rb +1 -1
  53. data/lib/tensor_stream/variable.rb +7 -1
  54. data/lib/tensor_stream/version.rb +1 -1
  55. data/samples/logistic_regression.rb +2 -1
  56. data/samples/nearest_neighbor.rb +54 -0
  57. data/tensor_stream.gemspec +3 -1
  58. metadata +107 -28
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA256:
- metadata.gz: a5a6dce7a4317dee2e9fba536056d0d346ac9fd4a2763396a5735204f77b90c6
- data.tar.gz: 4a6d2973badfa0f2ac20850f6885181c4f19b6fe00e416e9f8394f89ac267f77
+ SHA1:
+ metadata.gz: 7f758a576604fad842e40ff2c64f5d7deef05e5c
+ data.tar.gz: f94636819dc9fab55bf53bc198a8ad4854728249
  SHA512:
- metadata.gz: 285c9deb129680a9050a2afa9811de1d3cbcf2250cb78b4b6eab06f225bf8a5532d301a4cad7c0e9d39b7b16bcffb0527c7b6cadaad8b840a78cadc8fcf92c80
- data.tar.gz: 2021fc72c95a8aad4e8b7f8a5b25fa7ce64e715ae08be9d3c325f058155f1a2ea9d58be38aae8f4849dabe067c75e140f174efe498503a86c7a73820dc367e5d
+ metadata.gz: b167b5160330e4421ebbbce9c7e2235db0d145f1f99b37731b63a9e32182dbdc38d473c86318be8023d86bb4ae0aae4104f66856911404dcdd478aaed8df9f43
+ data.tar.gz: aa1264ef33f8e7550dc917cb29d6c183e932b9637e8262e6c87eca4cea337d640d5576d9cf58b2bcf9714304fe8fa16fc95a3be6e0f879ab3678f0bccb6233ae
data/.circleci/config.yml CHANGED
@@ -42,8 +42,9 @@ jobs:
  command: |
  mkdir /tmp/test-results
  TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb" | circleci tests split --split-by=timings)"
-
+
  bundle exec rspec -r rspec_junit_formatter --format progress \
+   --exclude-pattern "spec/tensor_stream/evaluators/opencl_*.rb" \
  --format RspecJunitFormatter \
  --out /tmp/test-results/rspec.xml \
  --format progress \
data/CHANGELOG.md CHANGED
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+ ## [0.3.0] - 2018-06-05
+ ### Added
+ - hardware acceleration using OpenCL
+ - working nearest neighbor sample (use opencl evaluator for best performance)
+
  ## [0.2.0] - 2018-05-27
  ### Added
  - working logistic regression sample
data/README.md CHANGED
@@ -4,7 +4,7 @@

  # TensorStream

- A reimplementation of TensorFlow for ruby. This is a ground up implementation with no dependency on TensorFlow. Effort has been made to make the programming style as near to TensorFlow as possible, comes with a pure ruby evaluator by default as well with support for an opencl evaluator.
+ A reimplementation of TensorFlow for ruby. This is a ground-up implementation with no dependency on TensorFlow. Effort has been made to keep the programming style as close to TensorFlow as possible. It comes with a pure ruby evaluator by default, with support for an OpenCL evaluator for large models and datasets.

  The goal of this gem is to have a high performance machine learning and compute solution for ruby with support for a wide range of hardware and software configuration.

@@ -176,6 +176,33 @@ f = tf.matmul(a, b).breakpoint! { |tensor, a, b, result_value| binding.pry }
  tf.session.run(f)
  ```

+ ### OpenCL
+
+ For OpenCL support, make sure that the required OpenCL drivers for your hardware are correctly installed on your system.
+ Note that OpenCL is only supported on ruby-mri at the moment.
+
+ Include the following gem in your project:
+
+ ```
+ gem 'opencl_ruby_ffi'
+ ```
+
+ To use the OpenCL evaluator instead of the ruby evaluator:
+
+ ```ruby
+ require 'tensor_stream/evaluator/opencl_evaluator'
+
+ # set session to use the opencl evaluator
+ sess = tf.session(:opencl_evaluator)
+
+ sess.run(....) # do stuff
+
+ ```
+
+ Note that the OpenCL evaluator only provides a speedup for large tensors; models that operate mostly on scalars, like the linear regression sample, will actually run slower.
+
+ samples/nearest_neighbor.rb contains a sample that uses OpenCL.
+
  # Visualization

  tensorstream does not support tensorboard yet, but a graphml generator is included:
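For reference, the README snippets above combine into the following minimal script. This is a sketch only: it assumes working OpenCL drivers and the opencl_ruby_ffi gem, and the tiny matrices are purely illustrative (far too small to see any speedup).

```ruby
require 'tensor_stream'
require 'tensor_stream/evaluator/opencl_evaluator' # needs the opencl_ruby_ffi gem

tf = TensorStream

# tiny constants purely for illustration; OpenCL only pays off on large tensors
a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
b = tf.constant([[5.0, 6.0], [7.0, 8.0]])

sess = tf.session(:opencl_evaluator) # use the OpenCL evaluator
puts sess.run(tf.matmul(a, b)).inspect
```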
data/benchmark/benchmark.rb ADDED
@@ -0,0 +1,129 @@
+ require "bundler/setup"
+ require 'tensor_stream'
+ require 'benchmark'
+ require 'pry-byebug'
+ require 'awesome_print'
+ require 'tensor_stream/evaluator/opencl_evaluator'
+
+ def tr(t, places = 1)
+   if t.is_a?(Array)
+     return t.collect do |v|
+       tr(v)
+     end
+   end
+
+   return t unless t.kind_of?(Float)
+
+   t.round(places)
+ end
+
+ tf = TensorStream
+
+ srand(5)
+ seed = 5
+ tf.set_random_seed(seed)
+
+ a = tf.constant([
+   [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+   [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+   [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+ ])
+
+ a_int = tf.constant([
+   [1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
+   [2, 2, 3, 4, 4, 1, 1, 1, 1, 4, 1, 1],
+   [3, 2, 3, 4, 0, 1, 1, 2, 1, 1, 2, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 3, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 4, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 5, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 6, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 0, 0, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 6, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 1],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
+   [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
+ ])
+
+ b = tf.constant([
+   [1.1, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+   [2.1, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+   [3.1, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 0.3, 1.0, 2.0, 1.3],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 1.11, 1.4, 5.1, 1.4],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.22, 1.1, 6.1, 1.5],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 0.8, 0.25, 1.6],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 6.5, 1.7],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.8],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.9],
+   [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.9],
+ ])
+
+ c = tf.constant([
+   [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+   [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+   [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+ ])
+
+ d = tf.constant([
+   [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+   [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+   [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+   [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+ ])
+
+ p = tf.placeholder("float")
+ q = tf.placeholder("float")
+
+ model = -tf.sin(a.dot(b + p) + c).dot(a) + tf.cos(a.dot(d + q))
+ single_function_test = (tf.sigmoid(a * p) * tf.sigmoid(b * q)) + c
+ pow_f = tf.pow(a, 3)
+ pow_i = tf.pow(a_int, 3)
+
+ sess = tf.session
+ sess2 = tf.session(:opencl_evaluator)
+
+ # verify correctness
+ 10.times do
+   feed = { p => rand, q => rand }
+   x = sess.run(model, feed_dict: feed )
+   y = sess2.run(model, feed_dict: feed )
+   fail "not equal #{tr(x.first)} != #{tr(y.first)}" if tr(x) != tr(y)
+ end
+
+ Benchmark.bmbm do |x|
+   x.report("pure ruby :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
+   x.report("opencl :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
+   x.report("pure ruby single function:") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+   x.report("opencl singlefunction:") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+   x.report("pure ruby pow float:") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+   x.report("opencl pow float:") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+   x.report("pure ruby pow int:") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+   x.report("opencl pow int:") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+ end
data/lib/tensor_stream.rb CHANGED
@@ -62,14 +62,15 @@ module TensorStream
  end

  def self.variable(value, name: nil, initializer: nil, graph: nil, dtype: nil, trainable: true)
+ op = Operation.new(:assign, nil, value)
  common_options = {
- initializer: initializer || Operation.new(:assign, nil, value),
+ initializer: initializer || op,
  name: name,
  graph: graph,
  dtype: dtype,
  trainable: trainable
  }
- if value.is_a?(String)
+ tensor = if value.is_a?(String)
  TensorStream::Variable.new(dtype || :string, 0, [], common_options)
  elsif value.is_a?(Integer)
  TensorStream::Variable.new(dtype || :int32, 0, [], common_options)
@@ -78,6 +79,8 @@ module TensorStream
  else
  TensorStream::Variable.new(dtype || :float32, 0, nil, common_options)
  end
+ op.items[0] = tensor
+ tensor
  end

  def self.variable_scope(scope = nil, reuse: nil, initializer: nil)
@@ -166,8 +169,8 @@ module TensorStream
  Graph.get_default_graph.get_collection(name, options)
  end

- def self.placeholder(dtype, options = {})
- TensorStream::Placeholder.new(dtype, nil, options[:shape])
+ def self.placeholder(dtype, shape: nil)
+ TensorStream::Placeholder.new(dtype, nil, shape)
  end

  def self.global_variables_initializer
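The placeholder change above replaces the options hash with a shape keyword argument, and the variable change wires the implicit assign Operation back to the Variable it initializes. A minimal sketch of the call style the new placeholder signature implies (the shape value here is illustrative):

```ruby
tf = TensorStream

# shape is now a real keyword argument instead of an options-hash entry
x = tf.placeholder(:float32, shape: [2, 2])

# dtype can also be passed as a string, as the bundled samples do
y = tf.placeholder("float")
```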
data/lib/tensor_stream/evaluator/buffer.rb ADDED
@@ -0,0 +1,10 @@
+ module TensorStream
+   # this class represents an evaluator specific native buffer
+   class Buffer
+     attr_accessor :dirty, :name
+
+     def to_ruby
+       raise "not implemented"
+     end
+   end
+ end
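Buffer acts as an abstract base class: concrete evaluators subclass it and implement to_ruby to copy device-side data back into plain ruby values (opencl_buffer.rb in the file list above does this for OpenCL). A hypothetical sketch of the contract — DummyBuffer and its internals are illustrative, not part of the gem:

```ruby
module TensorStream
  # hypothetical buffer backed by a plain ruby array, for illustration only
  class DummyBuffer < Buffer
    def initialize(name, data)
      @name = name
      @data = data
      @dirty = false # would be set when device memory diverges from @data
    end

    def to_ruby
      @data # a real evaluator would read back from device memory here
    end
  end
end
```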
data/lib/tensor_stream/evaluator/evaluator.rb CHANGED
@@ -1,5 +1,6 @@

  require 'tensor_stream/evaluator/ruby_evaluator'
+ require 'tensor_stream/evaluator/buffer'

  module TensorStream
  module Evaluator
data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl ADDED
@@ -0,0 +1,45 @@
+ // same dimension add floating point op
+ __kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol] ? 1 : 0;
+ }
+
+ // 1D + Scalar floating point add op
+ __kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0] ? 1 : 0;
+   } else {
+     C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+   }
+ }
+
+ // 1D + Scalar floating point add op broadcast
+ __kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   int b_m_index = globalRow;
+   int b_n_index = globalCol;
+
+   if ( b_m_index >= M2) {
+     b_m_index = b_m_index % M2;
+   };
+
+   if (b_n_index >= N2) {
+     b_n_index = b_n_index % N2;
+   }
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index] ? 1 : 0;
+   } else {
+     C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+   }
+ }
data/lib/tensor_stream/evaluator/kernels/_operand.cl ADDED
@@ -0,0 +1,45 @@
+ // same dimension add floating point op
+ __kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol];
+ }
+
+ // 1D + Scalar floating point add op
+ __kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0];
+   } else {
+     C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol];
+   }
+ }
+
+ // 1D + Scalar floating point add op broadcast
+ __kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   int b_m_index = globalRow;
+   int b_n_index = globalCol;
+
+   if ( b_m_index >= M2) {
+     b_m_index = b_m_index % M2;
+   };
+
+   if (b_n_index >= N2) {
+     b_n_index = b_n_index % N2;
+   }
+
+   if (switch_op == 0) {
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index];
+   } else {
+     C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol];
+   }
+ }
data/lib/tensor_stream/evaluator/kernels/abs.cl ADDED
@@ -0,0 +1,16 @@
+
+ __kernel void abs_fp(const int M, const int N, __global const float *A, __global float *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
+ }
+
+ __kernel void abs_int(const int M, const int N, __global const int *A, __global int *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/evaluator/kernels/add.cl ADDED
@@ -0,0 +1,5 @@
+ % %w[fp int].product(%w[add]).each do |dtype, fname|
+ % c_dtype = dtype_to_c_type(dtype)
+ % op = operator_to_c(fname)
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: c_dtype %>
+ % end
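add.cl is itself an ERB template: the %-prefixed lines execute Ruby during rendering, so the _operand.cl partial above is expanded once per dtype (fp and int). A rough sketch of how such a template could be rendered standalone — the helper bodies below are assumptions inferred from how the kernels use them (the real dtype_to_c_type and operator_to_c live in opencl_template_helper.rb, which is not shown in this excerpt):

```ruby
require 'erb'

# hypothetical stand-ins for the template helpers
def dtype_to_c_type(dtype)
  { 'fp' => 'float', 'int' => 'int' }.fetch(dtype)
end

def operator_to_c(fname)
  { 'add' => '+', 'sub' => '-', 'mul' => '*' }.fetch(fname)
end

fname, dtype = 'add', 'fp'
c_dtype = result_t = dtype_to_c_type(dtype)
op = operator_to_c(fname)

# render the _operand.cl partial directly (path as laid out in this gem);
# the output defines the concrete kernels add_fp, add_c_fp and add_b_fp
template = File.read('lib/tensor_stream/evaluator/kernels/_operand.cl')
puts ERB.new(template).result(binding)
```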
data/lib/tensor_stream/evaluator/kernels/argmax.cl ADDED
@@ -0,0 +1,15 @@
+ __kernel void argmax_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ }
+
+ __kernel void argmax_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/evaluator/kernels/argmin.cl ADDED
@@ -0,0 +1,15 @@
+ __kernel void argmin_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ }
+
+ __kernel void argmin_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/evaluator/kernels/cast.cl ADDED
@@ -0,0 +1,15 @@
+ __kernel void cast_int_fp(const int M, const int N, __global const int *A, __global float *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
+ }
+
+ __kernel void cast_fp_int(const int M, const int N,__global const float *A, __global int *C) {
+   // Get the index of the current element to be processed
+   const int globalRow = get_global_id(0); // Row ID of C (0..M)
+   const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+   C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
+ }
data/lib/tensor_stream/evaluator/kernels/cond.cl.erb ADDED
@@ -0,0 +1,5 @@
+ % %w[fp int].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
+ % c_dtype = dtype_to_c_type(dtype)
+ % op = operator_to_c(fname)
+ <%= render 'bool_operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: 'int' %>
+ % end