tensor_stream 0.2.0 → 0.3.0
- checksums.yaml +5 -5
- data/.circleci/config.yml +2 -1
- data/CHANGELOG.md +5 -0
- data/README.md +28 -1
- data/benchmark/benchmark.rb +129 -0
- data/lib/tensor_stream.rb +7 -4
- data/lib/tensor_stream/evaluator/buffer.rb +10 -0
- data/lib/tensor_stream/evaluator/evaluator.rb +1 -0
- data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl +45 -0
- data/lib/tensor_stream/evaluator/kernels/_operand.cl +45 -0
- data/lib/tensor_stream/evaluator/kernels/abs.cl +16 -0
- data/lib/tensor_stream/evaluator/kernels/add.cl +5 -0
- data/lib/tensor_stream/evaluator/kernels/argmax.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/argmin.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/cast.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +5 -0
- data/lib/tensor_stream/evaluator/kernels/cos.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/div.cl.erb +5 -0
- data/lib/tensor_stream/evaluator/kernels/exp.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/gemm.cl +63 -0
- data/lib/tensor_stream/evaluator/kernels/log.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/log1p.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/max.cl +91 -0
- data/lib/tensor_stream/evaluator/kernels/mul.cl +5 -0
- data/lib/tensor_stream/evaluator/kernels/negate.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/pow.cl +130 -0
- data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/round.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +8 -0
- data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +54 -0
- data/lib/tensor_stream/evaluator/kernels/sign.cl +23 -0
- data/lib/tensor_stream/evaluator/kernels/sin.cl +8 -0
- data/lib/tensor_stream/evaluator/kernels/sqrt.cl +8 -0
- data/lib/tensor_stream/evaluator/kernels/square.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/sub.cl +5 -0
- data/lib/tensor_stream/evaluator/kernels/tan.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/tanh.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +6 -0
- data/lib/tensor_stream/evaluator/kernels/where.cl +15 -0
- data/lib/tensor_stream/evaluator/opencl_buffer.rb +30 -0
- data/lib/tensor_stream/evaluator/opencl_evaluator.rb +1095 -0
- data/lib/tensor_stream/evaluator/opencl_template_helper.rb +58 -0
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +27 -0
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -31
- data/lib/tensor_stream/graph.rb +4 -2
- data/lib/tensor_stream/math_gradients.rb +3 -0
- data/lib/tensor_stream/operation.rb +29 -2
- data/lib/tensor_stream/ops.rb +14 -2
- data/lib/tensor_stream/placeholder.rb +1 -1
- data/lib/tensor_stream/session.rb +10 -3
- data/lib/tensor_stream/tensor_shape.rb +1 -1
- data/lib/tensor_stream/train/saver.rb +1 -1
- data/lib/tensor_stream/variable.rb +7 -1
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/logistic_regression.rb +2 -1
- data/samples/nearest_neighbor.rb +54 -0
- data/tensor_stream.gemspec +3 -1
- metadata +107 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz:
-  data.tar.gz:
+SHA1:
+  metadata.gz: 7f758a576604fad842e40ff2c64f5d7deef05e5c
+  data.tar.gz: f94636819dc9fab55bf53bc198a8ad4854728249
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b167b5160330e4421ebbbce9c7e2235db0d145f1f99b37731b63a9e32182dbdc38d473c86318be8023d86bb4ae0aae4104f66856911404dcdd478aaed8df9f43
+  data.tar.gz: aa1264ef33f8e7550dc917cb29d6c183e932b9637e8262e6c87eca4cea337d640d5576d9cf58b2bcf9714304fe8fa16fc95a3be6e0f879ab3678f0bccb6233ae
data/.circleci/config.yml
CHANGED
@@ -42,8 +42,9 @@ jobs:
           command: |
             mkdir /tmp/test-results
             TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb" | circleci tests split --split-by=timings)"
-
+
             bundle exec rspec -r rspec_junit_formatter --format progress \
+              --exclude-pattern "spec/tensor_stream/evaluators/opencl_*.rb" \
               --format RspecJunitFormatter \
               --out /tmp/test-results/rspec.xml \
               --format progress \
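(On a machine with a working OpenCL runtime, the excluded specs can presumably still be run directly, e.g. `bundle exec rspec spec/tensor_stream/evaluators/opencl_*.rb`; the path here is simply the exclude pattern above, not a command taken from this diff.)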
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.3.0] - 2018-06-05
+### Added
+- hardware acceleration using OpenCL
+- working nearest neighbor sample (use opencl evaluator for best performance)
+
 ## [0.2.0] - 2018-05-27
 ### Added
 - working logistic regression sample
data/README.md
CHANGED
@@ -4,7 +4,7 @@
 
 # TensorStream
 
-A reimplementation of TensorFlow for ruby. This is a ground up implementation with no dependency on TensorFlow. Effort has been made to make the programming style as near to TensorFlow as possible, comes with a pure ruby evaluator by default
+A reimplementation of TensorFlow for ruby. This is a ground up implementation with no dependency on TensorFlow. Effort has been made to make the programming style as near to TensorFlow as possible, comes with a pure ruby evaluator by default with support for an opencl evaluator for large models and datasets.
 
 The goal of this gem is to have a high performance machine learning and compute solution for ruby with support for a wide range of hardware and software configuration.
 
@@ -176,6 +176,33 @@ f = tf.matmul(a, b).breakpoint! { |tensor, a, b, result_value| binding.pry }
 tf.session.run(f)
 ```
 
+### OpenCL
+
+For OpenCL support, make sure that the required OpenCL drivers for your hardware are correctly installed on your system.
+Also OpenCL only supports ruby-mri at the moment.
+
+Also include the following gem in your project:
+
+```
+gem 'opencl_ruby_ffi'
+```
+
+To use the opencl evaluator instead of the ruby evaluator:
+
+```ruby
+require 'tensor_stream/evaluator/opencl_evaluator'
+
+# set session to use the opencl evaluator
+sess = tf.session(:opencl_evaluator)
+
+sess.run(....) # do stuff
+
+```
+
+Note that the OpenCL evaluator provides speedup if you are using large tensors, tensors that are only using scalars like the linear regression sample will actually be slower.
+
+samples/nearest_neighbor.rb contains a sample that uses opencl.
+
 # Visualization
 
 tensorstream does not support tensorboard yet, but a graphml generator is included:
data/benchmark/benchmark.rb
ADDED
@@ -0,0 +1,129 @@
+require "bundler/setup"
+require 'tensor_stream'
+require 'benchmark'
+require 'pry-byebug'
+require 'awesome_print'
+require 'tensor_stream/evaluator/opencl_evaluator'
+
+def tr(t, places = 1)
+  if t.is_a?(Array)
+    return t.collect do |v|
+      tr(v)
+    end
+  end
+
+  return t unless t.kind_of?(Float)
+
+  t.round(places)
+end
+
+tf = TensorStream
+
+srand(5)
+seed = 5
+tf.set_random_seed(seed)
+
+a = tf.constant([
+  [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+])
+
+a_int = tf.constant([
+  [1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
+  [2, 2, 3, 4, 4, 1, 1, 1, 1, 4, 1, 1],
+  [3, 2, 3, 4, 0, 1, 1, 2, 1, 1, 2, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 3, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 4, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 5, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 6, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 0, 0, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 6, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
+])
+
+b = tf.constant([
+  [1.1, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.1, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.1, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 0.3, 1.0, 2.0, 1.3],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 1.11, 1.4, 5.1, 1.4],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.22, 1.1, 6.1, 1.5],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 0.8, 0.25, 1.6],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 6.5, 1.7],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.8],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.9],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.9],
+])
+
+c = tf.constant([
+  [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+])
+
+d = tf.constant([
+  [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+])
+
+p = tf.placeholder("float")
+q = tf.placeholder("float")
+
+model = -tf.sin(a.dot(b + p) + c).dot(a) + tf.cos(a.dot(d + q))
+single_function_test = (tf.sigmoid(a * p) * tf.sigmoid(b * q)) + c
+pow_f = tf.pow(a, 3)
+pow_i = tf.pow(a_int, 3)
+
+sess = tf.session
+sess2 = tf.session(:opencl_evaluator)
+
+# verify correctness
+10.times do
+  feed = { p => rand, q => rand }
+  x = sess.run(model, feed_dict: feed)
+  y = sess2.run(model, feed_dict: feed)
+  fail "not equal #{tr(x.first)} != #{tr(y.first)}" if tr(x) != tr(y)
+end
+
+Benchmark.bmbm do |x|
+  x.report("pure ruby :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
+  x.report("pure ruby single function:") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl singlefunction:") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+  x.report("pure ruby pow float:") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl pow float:") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+  x.report("pure ruby pow int:") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl pow int:") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+end
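(Assuming a checkout of the gem source, the script above can presumably be run with `bundle exec ruby benchmark/benchmark.rb`; it first cross-checks the ruby and opencl evaluators for agreement on `model`, failing if the rounded results differ, then times each workload over 100 iterations with `Benchmark.bmbm`.)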
data/lib/tensor_stream.rb
CHANGED
@@ -62,14 +62,15 @@ module TensorStream
   end
 
   def self.variable(value, name: nil, initializer: nil, graph: nil, dtype: nil, trainable: true)
+    op = Operation.new(:assign, nil, value)
     common_options = {
-      initializer: initializer ||
+      initializer: initializer || op,
       name: name,
       graph: graph,
       dtype: dtype,
       trainable: trainable
     }
-    if value.is_a?(String)
+    tensor = if value.is_a?(String)
       TensorStream::Variable.new(dtype || :string, 0, [], common_options)
     elsif value.is_a?(Integer)
       TensorStream::Variable.new(dtype || :int32, 0, [], common_options)
@@ -78,6 +79,8 @@ module TensorStream
     else
       TensorStream::Variable.new(dtype || :float32, 0, nil, common_options)
     end
+    op.items[0] = tensor
+    tensor
   end
 
   def self.variable_scope(scope = nil, reuse: nil, initializer: nil)
@@ -166,8 +169,8 @@ module TensorStream
     Graph.get_default_graph.get_collection(name, options)
   end
 
-  def self.placeholder(dtype,
-    TensorStream::Placeholder.new(dtype, nil,
+  def self.placeholder(dtype, shape: nil)
+    TensorStream::Placeholder.new(dtype, nil, shape)
   end
 
   def self.global_variables_initializer
data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl
ADDED
@@ -0,0 +1,45 @@
+// same dimension add floating point op
+__kernel void <%= fname %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol] ? 1 : 0;
+}
+
+// 1D + Scalar floating point add op
+__kernel void <%= fname %>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0] ? 1 : 0;
+    } else {
+      C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+    }
+}
+
+// 1D + Scalar floating point add op broadcast
+__kernel void <%= fname %>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    int b_m_index = globalRow;
+    int b_n_index = globalCol;
+
+    if (b_m_index >= M2) {
+      b_m_index = b_m_index % M2;
+    }
+
+    if (b_n_index >= N2) {
+      b_n_index = b_n_index % N2;
+    }
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index] ? 1 : 0;
+    } else {
+      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+    }
+}
data/lib/tensor_stream/evaluator/kernels/_operand.cl
ADDED
@@ -0,0 +1,45 @@
+// same dimension add floating point op
+__kernel void <%= fname %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol];
+}
+
+// 1D + Scalar floating point add op
+__kernel void <%= fname %>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0];
+    } else {
+      C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol];
+    }
+}
+
+// 1D + Scalar floating point add op broadcast
+__kernel void <%= fname %>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    int b_m_index = globalRow;
+    int b_n_index = globalCol;
+
+    if (b_m_index >= M2) {
+      b_m_index = b_m_index % M2;
+    }
+
+    if (b_n_index >= N2) {
+      b_n_index = b_n_index % N2;
+    }
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index];
+    } else {
+      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol];
+    }
+}
data/lib/tensor_stream/evaluator/kernels/abs.cl
ADDED
@@ -0,0 +1,16 @@
+
+__kernel void abs_fp(const int M, const int N, __global const float *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
+}
+
+__kernel void abs_int(const int M, const int N, __global const int *A, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
+}
data/lib/tensor_stream/evaluator/kernels/argmax.cl
ADDED
@@ -0,0 +1,15 @@
+__kernel void argmax_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
+
+__kernel void argmax_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
data/lib/tensor_stream/evaluator/kernels/argmin.cl
ADDED
@@ -0,0 +1,15 @@
+__kernel void argmin_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
+
+__kernel void argmin_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
data/lib/tensor_stream/evaluator/kernels/cast.cl
ADDED
@@ -0,0 +1,15 @@
+__kernel void cast_int_fp(const int M, const int N, __global const int *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
+}
+
+__kernel void cast_fp_int(const int M, const int N, __global const float *A, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
+}
data/lib/tensor_stream/evaluator/kernels/cond.cl.erb
ADDED
@@ -0,0 +1,5 @@
+% %w[fp int].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
+% c_dtype = dtype_to_c_type(dtype)
+% op = operator_to_c(fname)
+<%= render 'bool_operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: 'int' %>
+% end