tensor_stream 0.2.0 → 0.3.0
This diff shows the content of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- checksums.yaml +5 -5
- data/.circleci/config.yml +2 -1
- data/CHANGELOG.md +5 -0
- data/README.md +28 -1
- data/benchmark/benchmark.rb +129 -0
- data/lib/tensor_stream.rb +7 -4
- data/lib/tensor_stream/evaluator/buffer.rb +10 -0
- data/lib/tensor_stream/evaluator/evaluator.rb +1 -0
- data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl +45 -0
- data/lib/tensor_stream/evaluator/kernels/_operand.cl +45 -0
- data/lib/tensor_stream/evaluator/kernels/abs.cl +16 -0
- data/lib/tensor_stream/evaluator/kernels/add.cl +5 -0
- data/lib/tensor_stream/evaluator/kernels/argmax.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/argmin.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/cast.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +5 -0
- data/lib/tensor_stream/evaluator/kernels/cos.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/div.cl.erb +5 -0
- data/lib/tensor_stream/evaluator/kernels/exp.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/gemm.cl +63 -0
- data/lib/tensor_stream/evaluator/kernels/log.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/log1p.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/max.cl +91 -0
- data/lib/tensor_stream/evaluator/kernels/mul.cl +5 -0
- data/lib/tensor_stream/evaluator/kernels/negate.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/pow.cl +130 -0
- data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/round.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +8 -0
- data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +54 -0
- data/lib/tensor_stream/evaluator/kernels/sign.cl +23 -0
- data/lib/tensor_stream/evaluator/kernels/sin.cl +8 -0
- data/lib/tensor_stream/evaluator/kernels/sqrt.cl +8 -0
- data/lib/tensor_stream/evaluator/kernels/square.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/sub.cl +5 -0
- data/lib/tensor_stream/evaluator/kernels/tan.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/tanh.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +6 -0
- data/lib/tensor_stream/evaluator/kernels/where.cl +15 -0
- data/lib/tensor_stream/evaluator/opencl_buffer.rb +30 -0
- data/lib/tensor_stream/evaluator/opencl_evaluator.rb +1095 -0
- data/lib/tensor_stream/evaluator/opencl_template_helper.rb +58 -0
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +27 -0
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -31
- data/lib/tensor_stream/graph.rb +4 -2
- data/lib/tensor_stream/math_gradients.rb +3 -0
- data/lib/tensor_stream/operation.rb +29 -2
- data/lib/tensor_stream/ops.rb +14 -2
- data/lib/tensor_stream/placeholder.rb +1 -1
- data/lib/tensor_stream/session.rb +10 -3
- data/lib/tensor_stream/tensor_shape.rb +1 -1
- data/lib/tensor_stream/train/saver.rb +1 -1
- data/lib/tensor_stream/variable.rb +7 -1
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/logistic_regression.rb +2 -1
- data/samples/nearest_neighbor.rb +54 -0
- data/tensor_stream.gemspec +3 -1
- metadata +107 -28
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz:
-  data.tar.gz:
+SHA1:
+  metadata.gz: 7f758a576604fad842e40ff2c64f5d7deef05e5c
+  data.tar.gz: f94636819dc9fab55bf53bc198a8ad4854728249
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b167b5160330e4421ebbbce9c7e2235db0d145f1f99b37731b63a9e32182dbdc38d473c86318be8023d86bb4ae0aae4104f66856911404dcdd478aaed8df9f43
+  data.tar.gz: aa1264ef33f8e7550dc917cb29d6c183e932b9637e8262e6c87eca4cea337d640d5576d9cf58b2bcf9714304fe8fa16fc95a3be6e0f879ab3678f0bccb6233ae
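The published digests can be checked against a downloaded copy of the gem's artifacts. A minimal sketch using Ruby's standard Digest library; the file path is illustrative and assumes the archives have been extracted from the .gem file:

```ruby
require 'digest'

# Sketch: recompute the SHA512 of the extracted data.tar.gz and compare it
# with the value published in checksums.yaml above. The path is illustrative.
expected = 'aa1264ef33f8e7550dc917cb29d6c183e932b9637e8262e6c87eca4cea337d640d5576d9cf58b2bcf9714304fe8fa16fc95a3be6e0f879ab3678f0bccb6233ae'
actual   = Digest::SHA512.file('data.tar.gz').hexdigest
abort 'checksum mismatch' unless actual == expected
```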
data/.circleci/config.yml
CHANGED

@@ -42,8 +42,9 @@ jobs:
       command: |
         mkdir /tmp/test-results
         TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb" | circleci tests split --split-by=timings)"
-
+
         bundle exec rspec -r rspec_junit_formatter --format progress \
+                          --exclude-pattern "spec/tensor_stream/evaluators/opencl_*.rb" \
                           --format RspecJunitFormatter \
                           --out /tmp/test-results/rspec.xml \
                           --format progress \
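The new `--exclude-pattern` flag keeps the OpenCL evaluator specs out of the CI run, presumably because the CircleCI containers have no OpenCL device. As a sketch, the same exclusion could also live in the spec helper via RSpec's configuration API instead of the command line:

```ruby
# spec/spec_helper.rb (hypothetical placement) - equivalent of passing
# --exclude-pattern "spec/tensor_stream/evaluators/opencl_*.rb" to rspec.
RSpec.configure do |config|
  config.exclude_pattern = 'spec/tensor_stream/evaluators/opencl_*.rb'
end
```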
data/CHANGELOG.md
CHANGED

@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.3.0] - 2018-06-05
+### Added
+- hardware acceleration using OpenCL
+- working nearest neighbor sample (use opencl evaluator for best performance)
+
 ## [0.2.0] - 2018-05-27
 ### Added
 - working logistic regression sample
data/README.md
CHANGED

@@ -4,7 +4,7 @@
 
 # TensorStream
 
-A reimplementation of TensorFlow for ruby. This is a ground up implementation with no dependency on TensorFlow. Effort has been made to make the programming style as near to TensorFlow as possible, comes with a pure ruby evaluator by default
+A reimplementation of TensorFlow for ruby. This is a ground up implementation with no dependency on TensorFlow. Effort has been made to make the programming style as near to TensorFlow as possible, comes with a pure ruby evaluator by default with support for an opencl evaluator for large models and datasets.
 
 The goal of this gem is to have a high performance machine learning and compute solution for ruby with support for a wide range of hardware and software configuration.
 
@@ -176,6 +176,33 @@ f = tf.matmul(a, b).breakpoint! { |tensor, a, b, result_value| binding.pry }
 tf.session.run(f)
 ```
 
+### OpenCL
+
+For OpenCL support, make sure that the required OpenCL drivers for your hardware are correctly installed on your system.
+Also OpenCL only supports ruby-mri at the moment.
+
+Also include the following gem in your project:
+
+```
+gem 'opencl_ruby_ffi'
+```
+
+To use the opencl evaluator instead of the ruby evaluator:
+
+```ruby
+require 'tensor_stream/evaluator/opencl_evaluator'
+
+# set session to use the opencl evaluator
+sess = tf.session(:opencl_evaluator)
+
+sess.run(....) # do stuff
+
+```
+
+Note that the OpenCL evaluator provides speedup if you are using large tensors, tensors that are only using scalars like the linear regression sample will actually be slower.
+
+samples/nearest_neighbor.rb contains a sample that uses opencl.
+
 # Visualization
 
 tensorstream does not support tensorboard yet, but a graphml generator is included:
data/benchmark/benchmark.rb
ADDED

@@ -0,0 +1,129 @@
+require "bundler/setup"
+require 'tensor_stream'
+require 'benchmark'
+require 'pry-byebug'
+require 'awesome_print'
+require 'tensor_stream/evaluator/opencl_evaluator'
+
+def tr(t, places = 1)
+  if t.is_a?(Array)
+    return t.collect do |v|
+      tr(v)
+    end
+  end
+
+  return t unless t.kind_of?(Float)
+
+  t.round(places)
+end
+
+tf = TensorStream
+
+srand(5)
+seed = 5
+tf.set_random_seed(seed)
+
+a = tf.constant([
+  [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+])
+
+a_int = tf.constant([
+  [1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
+  [2, 2, 3, 4, 4, 1, 1, 1, 1, 4, 1, 1],
+  [3, 2, 3, 4, 0, 1, 1, 2, 1, 1, 2, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 3, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 1, 1, 4, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 5, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 1, 6, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 0, 0, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 6, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 1],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
+  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
+])
+
+b = tf.constant([
+  [1.1, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.1, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.1, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 0.3, 1.0, 2.0, 1.3],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 1.11, 1.4, 5.1, 1.4],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.22, 1.1, 6.1, 1.5],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 0.8, 0.25, 1.6],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 6.5, 1.7],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.8],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.9],
+  [4.1, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 1.3, 2.0, 1.1, 1.9],
+])
+
+c = tf.constant([
+  [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+])
+
+d = tf.constant([
+  [1.0, 2.0, 3.0, 4.0, 4.1, 1.1, 4.1, 8.1, 3.2, 4.3, 1.1, 1.1],
+  [2.0, 2.1, 3.0, 4.0, 4.2, 1.1, 1.1, 1.1, 1.2, 4.4, 1.0, 1.1],
+  [3.0, 2.2, 3.0, 4.0, 0.1, 1.1, 1.2, 2.1, 1.3, 1.0, 2.0, 1.3],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.3, 0.5, 1.5, 1.2, 3.0, 1.1],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.4, 0.1, 1.6, 1.3, 4.0, 1.2],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.4, 0.11, 1.4, 5.1, 1.4],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.22, 1.1, 6.1, 1.5],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 0.8, 0.25, 1.6],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 6.5, 1.7],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.8],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+  [4.0, 2.3, 3.0, 4.0, 0.3, 1.1, 1.5, 0.6, 0.3, 2.0, 1.1, 1.9],
+])
+
+p = tf.placeholder("float")
+q = tf.placeholder("float")
+
+model = -tf.sin(a.dot(b + p) + c).dot(a) + tf.cos(a.dot(d + q))
+single_function_test = (tf.sigmoid(a * p) * tf.sigmoid(b * q)) + c
+pow_f = tf.pow(a, 3)
+pow_i = tf.pow(a_int, 3)
+
+sess = tf.session
+sess2 = tf.session(:opencl_evaluator)
+
+# verify correctness
+10.times do
+  feed = { p => rand, q => rand }
+  x = sess.run(model, feed_dict: feed )
+  y = sess2.run(model, feed_dict: feed )
+  fail "not equal #{tr(x.first)} != #{tr(y.first)}" if tr(x) != tr(y)
+end
+
+Benchmark.bmbm do |x|
+  x.report("pure ruby :") { 100.times do sess.run(model, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl :") { 100.times do sess2.run(model, feed_dict: { p => rand, q => rand }) end }
+  x.report("pure ruby single function:") { 100.times do sess.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl singlefunction:") { 100.times do sess2.run(single_function_test, feed_dict: { p => rand, q => rand }) end }
+  x.report("pure ruby pow float:") { 100.times do sess.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl pow float:") { 100.times do sess2.run(pow_f, feed_dict: { p => rand, q => rand }) end }
+  x.report("pure ruby pow int:") { 100.times do sess.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+  x.report("opencl pow int:") { 100.times do sess2.run(pow_i, feed_dict: { p => rand, q => rand }) end }
+end
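One detail of the benchmark worth noting: `Benchmark.bmbm` runs every report block twice, a rehearsal pass followed by the measured pass, which keeps warm-up costs out of the ruby-vs-opencl comparison. A standalone sketch of the idiom:

```ruby
require 'benchmark'

# bmbm prints a "Rehearsal" section first, then the real measurements, so
# allocation and interpreter warm-up land in the rehearsal pass.
Benchmark.bmbm do |x|
  x.report('example:') { 1_000.times { Math.sin(rand) } }
end
```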
data/lib/tensor_stream.rb
CHANGED

@@ -62,14 +62,15 @@ module TensorStream
   end
 
   def self.variable(value, name: nil, initializer: nil, graph: nil, dtype: nil, trainable: true)
+    op = Operation.new(:assign, nil, value)
     common_options = {
-      initializer: initializer ||
+      initializer: initializer || op,
       name: name,
       graph: graph,
       dtype: dtype,
       trainable: trainable
     }
-    if value.is_a?(String)
+    tensor = if value.is_a?(String)
       TensorStream::Variable.new(dtype || :string, 0, [], common_options)
     elsif value.is_a?(Integer)
       TensorStream::Variable.new(dtype || :int32, 0, [], common_options)
@@ -78,6 +79,8 @@ module TensorStream
     else
       TensorStream::Variable.new(dtype || :float32, 0, nil, common_options)
     end
+    op.items[0] = tensor
+    tensor
   end
 
   def self.variable_scope(scope = nil, reuse: nil, initializer: nil)
@@ -166,8 +169,8 @@ module TensorStream
     Graph.get_default_graph.get_collection(name, options)
   end
 
-  def self.placeholder(dtype,
-    TensorStream::Placeholder.new(dtype, nil,
+  def self.placeholder(dtype, shape: nil)
+    TensorStream::Placeholder.new(dtype, nil, shape)
   end
 
   def self.global_variables_initializer
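The `variable` hunk above creates an `:assign` operation up front, uses it as the default initializer, and then patches the op to point back at the variable it just built (`op.items[0] = tensor`). A usage sketch, under the assumption that `global_variables_initializer` runs these implicit assigns as the gem's samples do:

```ruby
require 'tensor_stream'

tf = TensorStream

# With no explicit initializer, tf.variable now wires up an :assign op
# that writes 0.5 into the variable when the initializer is run.
weights = tf.variable(0.5, name: 'weights')

sess = tf.session
sess.run(tf.global_variables_initializer)
puts sess.run(weights) # expected: 0.5
```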
data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl
ADDED

@@ -0,0 +1,45 @@
+// same dimension add floating point op
+__kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol] ? 1 : 0;
+}
+
+// 1D + Scalar floating point add op
+__kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0] ? 1 : 0;
+    } else {
+      C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+    }
+}
+
+// 1D + Scalar floating point add op broadcast
+__kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    int b_m_index = globalRow;
+    int b_n_index = globalCol;
+
+    if ( b_m_index >= M2) {
+      b_m_index = b_m_index % M2;
+    };
+
+    if (b_n_index >= N2) {
+      b_n_index = b_n_index % N2;
+    }
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index] ? 1 : 0;
+    } else {
+      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+    }
+}
data/lib/tensor_stream/evaluator/kernels/_operand.cl
ADDED

@@ -0,0 +1,45 @@
+// same dimension add floating point op
+__kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol];
+}
+
+// 1D + Scalar floating point add op
+__kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0];
+    } else {
+      C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol];
+    }
+}
+
+// 1D + Scalar floating point add op broadcast
+__kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op,__global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    int b_m_index = globalRow;
+    int b_n_index = globalCol;
+
+    if ( b_m_index >= M2) {
+      b_m_index = b_m_index % M2;
+    };
+
+    if (b_n_index >= N2) {
+      b_n_index = b_n_index % N2;
+    }
+
+    if (switch_op == 0) {
+      C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index];
+    } else {
+      C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol];
+    }
+}
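Both kernel templates above are plain ERB: the evaluator expands them once per operator/dtype combination before compiling the result as OpenCL C. A sketch of that expansion step with hypothetical bindings (the gem's real glue lives in data/lib/tensor_stream/evaluator/opencl_template_helper.rb, which this diff lists but does not show):

```ruby
require 'erb'

# Hypothetical expansion of _operand.cl for a float32 "add" kernel.
# fname/dtype/c_dtype/result_t/op mirror the <%= ... %> slots in the template.
template = File.read('lib/tensor_stream/evaluator/kernels/_operand.cl')

fname    = 'add'
dtype    = 'fp'
c_dtype  = 'float'
result_t = 'float'
op       = '+'

# Produces OpenCL C source, e.g. "__kernel void add_fp(const int M, ..."
puts ERB.new(template).result(binding)
```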
data/lib/tensor_stream/evaluator/kernels/abs.cl
ADDED

@@ -0,0 +1,16 @@
+
+__kernel void abs_fp(const int M, const int N, __global const float *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
+}
+
+__kernel void abs_int(const int M, const int N, __global const int *A, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
+}
data/lib/tensor_stream/evaluator/kernels/argmax.cl
ADDED

@@ -0,0 +1,15 @@
+__kernel void argmax_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
+
+__kernel void argmax_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
data/lib/tensor_stream/evaluator/kernels/argmin.cl
ADDED

@@ -0,0 +1,15 @@
+__kernel void argmin_fp(const int M, const int N, const int switch_op, __global const float *A, __global const float *B, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
+
+__kernel void argmin_int(const int M, const int N, const int switch_op, __global const int *A, __global const int *B, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+}
data/lib/tensor_stream/evaluator/kernels/cast.cl
ADDED

@@ -0,0 +1,15 @@
+__kernel void cast_int_fp(const int M, const int N, __global const int *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
+}
+
+__kernel void cast_fp_int(const int M, const int N,__global const float *A, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol];
+}
data/lib/tensor_stream/evaluator/kernels/cond.cl.erb
ADDED

@@ -0,0 +1,5 @@
+% %w[fp int].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
+% c_dtype = dtype_to_c_type(dtype)
+% op = operator_to_c(fname)
+<%= render 'bool_operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: 'int' %>
+% end
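This template leans on two helpers, `dtype_to_c_type` and `operator_to_c`, whose definitions live in opencl_template_helper.rb (listed in this diff but not shown here). A plausible sketch of what they map, offered as an assumption rather than the gem's actual code:

```ruby
# Hypothetical reconstructions - the real versions are in
# data/lib/tensor_stream/evaluator/opencl_template_helper.rb.
def dtype_to_c_type(dtype)
  { 'fp' => 'float', 'int' => 'int' }.fetch(dtype)
end

def operator_to_c(fname)
  {
    'less'          => '<',
    'less_equal'    => '<=',
    'equal'         => '==',
    'not_equal'     => '!=',
    'greater'       => '>',
    'greater_equal' => '>=',
    'logical_and'   => '&&'
  }.fetch(fname)
end
```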