tensor_stream-opencl 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/benchmark/benchmark.rb +23 -1
- data/benchmark_ryzen.txt +56 -0
- data/lib/tensor_stream/opencl/array_ops.rb +3 -3
- data/lib/tensor_stream/opencl/images_ops.rb +30 -0
- data/lib/tensor_stream/opencl/kernels/conv2d.cl +27 -0
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl +32 -0
- data/lib/tensor_stream/opencl/kernels/gemm.cl +2 -10
- data/lib/tensor_stream/opencl/kernels/max.cl +5 -13
- data/lib/tensor_stream/opencl/kernels/mean.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/min.cl +3 -11
- data/lib/tensor_stream/opencl/kernels/prod.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/relu6.cl +7 -0
- data/lib/tensor_stream/opencl/kernels/round.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sum.cl +26 -0
- data/lib/tensor_stream/opencl/math_ops.rb +86 -29
- data/lib/tensor_stream/opencl/nn_ops.rb +89 -5
- data/lib/tensor_stream/opencl/opencl_buffer.rb +6 -2
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +97 -92
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/iris.rb +2 -2
- data/samples/logistic_regression.rb +84 -0
- data/samples/mnist_data_2.1.rb +9 -4
- data/samples/mnist_data_2.2.rb +12 -7
- data/samples/mnist_data_2.3.rb +111 -0
- data/samples/rnn.rb +1 -1
- data/tensor_stream-opencl.gemspec +2 -1
- metadata +28 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7888f5385dcec4b9a747128d1b25d0cb6d7fb01ab12f1c1b70706d0df5770903
|
4
|
+
data.tar.gz: 1129196d93a7b194d9de888fe49b43b0839c00589fb0987c0841ad1ac1693a4c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 727d97b9aa1402ed9681eb71fe0c0cdcc966e8a9e2f08b7480fdefa503509e26872b187a0b184d31a5b0cee86dc96e1d97f4fce14b378fa4a7c8ce4679b06421
|
7
|
+
data.tar.gz: 0e2f1601543d582042d0911222c1209f2df2c08139b7acc8ee347b5d7e236cd451a1b00032fb75d6f532105fb8f4ee49e53614071d111e0fabc8ed44010c4301
|
data/.gitignore
CHANGED
data/benchmark/benchmark.rb
CHANGED
@@ -26,7 +26,7 @@ tf.set_random_seed(seed)
|
|
26
26
|
SHAPES = [32, 32]
|
27
27
|
|
28
28
|
sess = tf.session(:ruby_evaluator)
|
29
|
-
|
29
|
+
large_tensor = tf.constant(sess.run(tf.random_uniform([256, 256])))
|
30
30
|
a = tf.constant(sess.run(tf.random_uniform(SHAPES)))
|
31
31
|
a_int = tf.constant([
|
32
32
|
[1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
|
@@ -49,6 +49,9 @@ c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
|
|
49
49
|
|
50
50
|
d = tf.constant(sess.run(tf.random_uniform(SHAPES)))
|
51
51
|
|
52
|
+
sample_image = tf.constant(sess.run(tf.random_uniform([10, 8, 8, 3])))
|
53
|
+
sample_filter = tf.constant(sess.run(tf.random_uniform([2, 2, 3, 3])))
|
54
|
+
|
52
55
|
p = tf.placeholder('float')
|
53
56
|
q = tf.placeholder('float')
|
54
57
|
|
@@ -61,6 +64,13 @@ out_of_order = tf.matmul(a, b) + tf.matmul(c, d)
|
|
61
64
|
softmax = tf.nn.softmax(a)
|
62
65
|
add_n = tf.add_n([a,b,c,d])
|
63
66
|
split = tf.split(a, 4)
|
67
|
+
sum = tf.reduce_sum(large_tensor)
|
68
|
+
sum_axis_1 = tf.reduce_sum(large_tensor, 1)
|
69
|
+
min = tf.min(large_tensor, 1)
|
70
|
+
index = large_tensor[0]
|
71
|
+
|
72
|
+
conv2d = tf.nn.conv2d(sample_image, sample_filter, [1, 1, 1, 1], 'SAME')
|
73
|
+
conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
|
64
74
|
|
65
75
|
puts TensorStream::Evaluator.default_evaluators
|
66
76
|
|
@@ -70,6 +80,18 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
|
|
70
80
|
device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
|
71
81
|
puts "OpenCL device #{device.platform.to_s} #{device.name}"
|
72
82
|
Benchmark.bmbm do |x|
|
83
|
+
x.report("pure ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
|
84
|
+
x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
|
85
|
+
x.report("pure ruby conv2d :") { 100.times do sess.run(conv2d) end }
|
86
|
+
x.report("opencl conv2d :") { 100.times do sess2.run(conv2d) end }
|
87
|
+
x.report("pure ruby arr index :") { 100.times do sess.run(index) end }
|
88
|
+
x.report("opencl arr index :") { 100.times do sess2.run(index) end }
|
89
|
+
x.report("pure ruby min :") { 100.times do sess.run(min) end }
|
90
|
+
x.report("opencl min :") { 100.times do sess2.run(min) end }
|
91
|
+
x.report("pure ruby sum :") { 100.times do sess.run(sum) end }
|
92
|
+
x.report("opencl sum :") { 100.times do sess2.run(sum) end }
|
93
|
+
x.report("pure ruby sum axis 1 :") { 100.times do sess.run(sum_axis_1) end }
|
94
|
+
x.report("opencl sum axis 1 :") { 100.times do sess2.run(sum_axis_1) end }
|
73
95
|
x.report("pure ruby split :") { 100.times do sess.run(split) end }
|
74
96
|
x.report("opencl split :") { 100.times do sess2.run(split) end }
|
75
97
|
x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
|
data/benchmark_ryzen.txt
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
TensorStream::Evaluator::OpenclEvaluator
|
2
|
+
TensorStream::Evaluator::RubyEvaluator
|
3
|
+
model name : AMD Ryzen 3 1300X Quad-Core Processor
|
4
|
+
OpenCL device NVIDIA CUDA GeForce GTX 1060 6GB
|
5
|
+
Rehearsal --------------------------------------------------------------
|
6
|
+
pure ruby arr index : 0.005448 0.003557 0.009005 ( 0.008999)
|
7
|
+
opencl arr index : 0.074642 0.190132 0.264774 ( 0.275557)
|
8
|
+
pure ruby min : 0.256004 0.000777 0.256781 ( 0.256682)
|
9
|
+
opencl min : 0.017543 0.004523 0.022066 ( 0.018797)
|
10
|
+
pure ruby sum : 0.313039 0.000565 0.313604 ( 0.313535)
|
11
|
+
opencl sum : 0.009037 0.004249 0.013286 ( 0.011073)
|
12
|
+
pure ruby split : 0.017223 0.000300 0.017523 ( 0.017542)
|
13
|
+
opencl split : 0.033489 0.014394 0.047883 ( 0.038798)
|
14
|
+
pure ruby add_n : 0.159864 0.000153 0.160017 ( 0.159992)
|
15
|
+
opencl add_n : 0.018535 0.000563 0.019098 ( 0.016168)
|
16
|
+
pure ruby ooo matmul : 1.390970 0.000304 1.391274 ( 1.390790)
|
17
|
+
opencl ooo matmul : 0.014119 0.000229 0.014348 ( 0.011738)
|
18
|
+
pure ruby softmax : 0.024103 0.000014 0.024117 ( 0.024135)
|
19
|
+
opencl softmax : 0.010602 0.004277 0.014879 ( 0.011941)
|
20
|
+
pure ruby matmul : 0.668126 0.000006 0.668132 ( 0.667778)
|
21
|
+
opencl matmul : 0.006672 0.007527 0.014199 ( 0.011594)
|
22
|
+
pure ruby : 2.388817 0.000005 2.388822 ( 2.387870)
|
23
|
+
opencl : 0.152289 0.007804 0.160093 ( 0.156279)
|
24
|
+
pure ruby single function: 0.356575 0.000062 0.356637 ( 0.356488)
|
25
|
+
opencl singlefunction: 0.120073 0.000210 0.120283 ( 0.116378)
|
26
|
+
pure ruby pow float: 0.088966 0.000051 0.089017 ( 0.088996)
|
27
|
+
opencl pow float: 0.018054 0.000100 0.018154 ( 0.015429)
|
28
|
+
pure ruby pow int: 0.025430 0.000070 0.025500 ( 0.025524)
|
29
|
+
opencl pow int: 0.015652 0.003880 0.019532 ( 0.017386)
|
30
|
+
----------------------------------------------------- total: 6.429024sec
|
31
|
+
|
32
|
+
user system total real
|
33
|
+
pure ruby arr index : 0.003564 0.000110 0.003674 ( 0.003636)
|
34
|
+
opencl arr index : 0.007966 0.003974 0.011940 ( 0.009775)
|
35
|
+
pure ruby min : 0.246153 0.000102 0.246255 ( 0.246172)
|
36
|
+
opencl min : 0.011787 0.007785 0.019572 ( 0.016169)
|
37
|
+
pure ruby sum : 0.294371 0.000000 0.294371 ( 0.294335)
|
38
|
+
opencl sum : 0.008266 0.003879 0.012145 ( 0.009315)
|
39
|
+
pure ruby split : 0.014552 0.000000 0.014552 ( 0.014539)
|
40
|
+
opencl split : 0.037984 0.004037 0.042021 ( 0.033276)
|
41
|
+
pure ruby add_n : 0.146300 0.000053 0.146353 ( 0.146319)
|
42
|
+
opencl add_n : 0.006426 0.007827 0.014253 ( 0.011461)
|
43
|
+
pure ruby ooo matmul : 1.373232 0.000096 1.373328 ( 1.372788)
|
44
|
+
opencl ooo matmul : 0.013838 0.000000 0.013838 ( 0.011088)
|
45
|
+
pure ruby softmax : 0.024478 0.000000 0.024478 ( 0.024493)
|
46
|
+
opencl softmax : 0.014117 0.000022 0.014139 ( 0.011246)
|
47
|
+
pure ruby matmul : 0.653146 0.000054 0.653200 ( 0.652889)
|
48
|
+
opencl matmul : 0.002750 0.011934 0.014684 ( 0.011729)
|
49
|
+
pure ruby : 2.392733 0.000058 2.392791 ( 2.391726)
|
50
|
+
opencl : 0.140118 0.016001 0.156119 ( 0.151788)
|
51
|
+
pure ruby single function: 0.352515 0.000000 0.352515 ( 0.352443)
|
52
|
+
opencl singlefunction: 0.093955 0.011813 0.105768 ( 0.102301)
|
53
|
+
pure ruby pow float: 0.083659 0.000000 0.083659 ( 0.083623)
|
54
|
+
opencl pow float: 0.017433 0.000125 0.017558 ( 0.014508)
|
55
|
+
pure ruby pow int: 0.018381 0.000000 0.018381 ( 0.018391)
|
56
|
+
opencl pow int: 0.008186 0.003755 0.011941 ( 0.009828)
|
@@ -197,6 +197,7 @@ module TensorStream
|
|
197
197
|
ops = if axis.zero? # fast path
|
198
198
|
inputs.each_with_index.map do |input, index|
|
199
199
|
next if input.empty_value?
|
200
|
+
|
200
201
|
start = index * input.buffer.size * input.buffer.element_size
|
201
202
|
region = [input.buffer.size * input.buffer.element_size, 1, 1]
|
202
203
|
event_wait_list = build_event_wait_list(input)
|
@@ -339,7 +340,7 @@ module TensorStream
|
|
339
340
|
|
340
341
|
register_op :index, noop: true do |context, tensor, inputs|
|
341
342
|
a = _run(inputs[0], context)
|
342
|
-
index = read_final_result(_run(inputs[1], context))
|
343
|
+
index = inputs[1].value || read_final_result(_run(inputs[1], context))
|
343
344
|
|
344
345
|
if a.is_a?(TensorStream::Evaluator::OutputGroup)
|
345
346
|
a.outputs[index]
|
@@ -348,8 +349,7 @@ module TensorStream
|
|
348
349
|
else
|
349
350
|
new_shape = a.shape.dup
|
350
351
|
new_shape.shift
|
351
|
-
|
352
|
-
convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
|
352
|
+
_create_result_sub_buffer(a, index, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}")
|
353
353
|
end
|
354
354
|
end
|
355
355
|
|
@@ -7,9 +7,23 @@ module TensorStream
|
|
7
7
|
register_op :decode_png do |context, tensor, inputs|
|
8
8
|
content = _run(inputs[0], context)
|
9
9
|
channels = tensor.options[:channels]
|
10
|
+
resample_new_shape = tensor.options[:new_shape]
|
11
|
+
resample_method = tensor.options[:resample_method] || :bilinear
|
10
12
|
channels = 4 if channels.zero?
|
11
13
|
|
12
14
|
image = ChunkyPNG::Image.from_blob(content.buffer.to_a.pack('C*'))
|
15
|
+
|
16
|
+
if resample_new_shape
|
17
|
+
case resample_method
|
18
|
+
when :bilinear
|
19
|
+
image.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, # height
|
20
|
+
when :nearest_neighbor
|
21
|
+
image.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
|
22
|
+
else
|
23
|
+
raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
13
27
|
output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
|
14
28
|
|
15
29
|
image.grayscale! if channels == 1
|
@@ -38,6 +52,10 @@ module TensorStream
|
|
38
52
|
|
39
53
|
register_op :encode_png do |_context, tensor, inputs|
|
40
54
|
image_data = inputs[0]
|
55
|
+
|
56
|
+
resample_new_shape = tensor.options[:new_shape]
|
57
|
+
resample_method = tensor.options[:resample_method] || :bilinear
|
58
|
+
|
41
59
|
height, width, channels = image_data.shape
|
42
60
|
image_buffer = image_data.buffer.reshape(*image_data.shape.reverse).to_a
|
43
61
|
\
|
@@ -53,6 +71,18 @@ module TensorStream
|
|
53
71
|
end
|
54
72
|
end
|
55
73
|
end
|
74
|
+
|
75
|
+
if resample_new_shape
|
76
|
+
case resample_method
|
77
|
+
when :bilinear
|
78
|
+
png.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, # height
|
79
|
+
when :nearest_neighbor
|
80
|
+
png.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
|
81
|
+
else
|
82
|
+
raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
56
86
|
convert_to_opencl(png.to_s, [], data_type: :string, name: tensor.name)
|
57
87
|
end
|
58
88
|
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
% ctype = dtype_to_c_type(dtype)
|
2
|
+
|
3
|
+
__kernel void conv2d(const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
const int batch_index = get_global_id(0);
|
6
|
+
const int h_index = get_global_id(1);
|
7
|
+
const int w_index = get_global_id(2);
|
8
|
+
const int h_index_with_stride = h_index * <%= stride[0] %>;
|
9
|
+
const int w_index_with_stride = w_index * <%= stride[1] %>;
|
10
|
+
|
11
|
+
const int image_index = batch_index * height * width * <%= ch %>;
|
12
|
+
const int image_row_width = width * <%= ch %>;
|
13
|
+
|
14
|
+
for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
|
15
|
+
<%= ctype %> sum = 0;
|
16
|
+
for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
|
17
|
+
for(int y = 0; y < <%= fh %>; y++) {
|
18
|
+
for (int x = 0; x < <%= fw %>; x++) {
|
19
|
+
if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width) {
|
20
|
+
sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
|
21
|
+
}
|
22
|
+
}
|
23
|
+
}
|
24
|
+
}
|
25
|
+
output[batch_index * (height/<%= stride[0] %>) * (width/<%= stride[1] %>) * <%= out_ch %> + h_index * (width/<%= stride[1] %>) * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index ] = sum;
|
26
|
+
}
|
27
|
+
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
% ctype = dtype_to_c_type(dtype)
|
2
|
+
|
3
|
+
__kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
const int fh_index = get_global_id(0);
|
6
|
+
const int fw_index = get_global_id(1);
|
7
|
+
const int f_out_channel = get_global_id(2);
|
8
|
+
const int image_size = height * width * <%= ch %>;
|
9
|
+
const int grad_image_row_width = width * <%= out_ch %>;
|
10
|
+
|
11
|
+
for(int channel = 0; channel < <%= ch %>; channel++) {
|
12
|
+
<%= ctype %> grad_sum = 0.0;
|
13
|
+
for(int batch = 0; batch < batch_size; batch++) {
|
14
|
+
const int image_index = batch * height * width * <%= out_ch %>;
|
15
|
+
for(int y = 0; y < height; y++) {
|
16
|
+
for (int x = 0; x < width; x++) {
|
17
|
+
if ( ((y - fh_index) % <%= stride[0]%>) == 0 && ((x - fw_index) % <%= stride[1]%>) == 0 && fh_index <= y && fw_index <= x) {
|
18
|
+
const <%= ctype %> image_grad = grad[image_index + ((y - fh_index) / <%= stride[0] %>) * grad_image_row_width + ((x - fw_index) / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
|
19
|
+
grad_sum += images[batch * image_size + y * width * <%= ch %> + x * <%= ch %> + channel] * image_grad;
|
20
|
+
}
|
21
|
+
}
|
22
|
+
}
|
23
|
+
}
|
24
|
+
output[fh_index * <%= fw * ch * out_ch %> + fw_index * <%= ch * out_ch %> + channel * <%= out_ch %> + f_out_channel] = grad_sum;
|
25
|
+
}
|
26
|
+
}
|
@@ -0,0 +1,32 @@
|
|
1
|
+
% ctype = dtype_to_c_type(dtype)
|
2
|
+
|
3
|
+
__kernel void conv2d_backprop_input(const int height, const int width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
int batch_index = get_global_id(0);
|
6
|
+
int h_index = get_global_id(1); // orig image y
|
7
|
+
int w_index = get_global_id(2); // orig image x
|
8
|
+
|
9
|
+
int h_index_with_stride = h_index / <%= stride[0] %>;
|
10
|
+
int w_index_with_stride = w_index / <%= stride[1] %>;
|
11
|
+
int grad_height = height / <%= stride[0] %>;
|
12
|
+
int grad_width = width / <%= stride[1] %>;
|
13
|
+
|
14
|
+
int image_index = batch_index * grad_height * grad_width * <%= out_ch %>;
|
15
|
+
int image_row_width = grad_width * <%= out_ch %>;
|
16
|
+
|
17
|
+
for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
|
18
|
+
<%= ctype %> g = 0.0;
|
19
|
+
for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
|
20
|
+
for(int y = 0; y < <%= fh %>; y++) {
|
21
|
+
for (int x = 0; x < <%= fw %>; x++) {
|
22
|
+
if ( (y <= h_index) && (x <= w_index) && ( (h_index - y) % <%= stride[0]%> == 0) && ( (w_index - x) % <%= stride[1]%> == 0)) {
|
23
|
+
<%= ctype %> imag_grad = grad[image_index + ( (h_index - y) / <%= stride[0] %>) * image_row_width + ( (w_index - x) / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
|
24
|
+
g += imag_grad * filter[y * <%= fw * ch * out_ch %> + x * <%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
|
25
|
+
}
|
26
|
+
}
|
27
|
+
}
|
28
|
+
}
|
29
|
+
|
30
|
+
output[batch_index * height * width * <%= ch %> + h_index * width * <%= ch %> + w_index * <%= ch %> + channel_index ] = g;
|
31
|
+
}
|
32
|
+
}
|
@@ -1,8 +1,6 @@
|
|
1
1
|
// First naive implementation
|
2
2
|
% c_dtype = dtype_to_c_type(dtype)
|
3
3
|
__kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
|
4
|
-
const int A_transpose,
|
5
|
-
const int B_transpose,
|
6
4
|
const __global <%= c_dtype %>* A,
|
7
5
|
const __global <%= c_dtype %>* B,
|
8
6
|
__global <%= c_dtype %>* C) {
|
@@ -16,14 +14,8 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
|
|
16
14
|
for (int k=0; k<K; k++) {
|
17
15
|
int a_index = globalRow*K + k;
|
18
16
|
int b_index = k*N + globalCol;
|
19
|
-
|
20
|
-
|
21
|
-
a_index = M*k + globalRow;
|
22
|
-
}
|
23
|
-
|
24
|
-
if (B_transpose) {
|
25
|
-
b_index = globalCol*K + k;
|
26
|
-
}
|
17
|
+
<% if ta %>a_index = M*k + globalRow;<% end %>
|
18
|
+
<% if tb %>b_index = globalCol*K + k;<% end %>
|
27
19
|
acc += A[a_index] * B[b_index];
|
28
20
|
}
|
29
21
|
|
@@ -5,7 +5,7 @@
|
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
7
|
|
8
|
-
C[globalRow * N + globalCol] =
|
8
|
+
C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[globalRow * N + globalCol]);
|
9
9
|
}
|
10
10
|
|
11
11
|
// 1D + Scalar floating point add op
|
@@ -13,12 +13,8 @@
|
|
13
13
|
// Get the index of the current element to be processed
|
14
14
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
16
|
-
|
17
|
-
|
18
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
|
19
|
-
} else {
|
20
|
-
C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
|
21
|
-
}
|
16
|
+
|
17
|
+
C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[0]);
|
22
18
|
}
|
23
19
|
|
24
20
|
// 1D + Scalar floating point add op broadcast
|
@@ -26,7 +22,7 @@
|
|
26
22
|
// Get the index of the current element to be processed
|
27
23
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
28
24
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
29
|
-
|
25
|
+
|
30
26
|
int b_m_index = globalRow;
|
31
27
|
int b_n_index = globalCol;
|
32
28
|
|
@@ -38,9 +34,5 @@
|
|
38
34
|
b_n_index = b_n_index % N2;
|
39
35
|
}
|
40
36
|
|
41
|
-
|
42
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
|
43
|
-
} else {
|
44
|
-
C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
|
45
|
-
}
|
37
|
+
C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol],(<%= c_dtype %>) B[b_m_index * N2 + b_n_index]);
|
46
38
|
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void mean_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
|
+
// Get the index of the current element to be processed
|
4
|
+
const int id = get_global_id(0);
|
5
|
+
int offset = (id + <%= index %>) * <%= w %>;
|
6
|
+
<%= c_dtype %> sum = 0;
|
7
|
+
<% if n > 4 %>
|
8
|
+
for(int i = 0; i < <%= n/4 %> ; i++) {
|
9
|
+
<% sums = 4.times.map do |i|
|
10
|
+
"A[offset + #{i}]"
|
11
|
+
end %>
|
12
|
+
sum += <%= sums.join(' + ') %>;
|
13
|
+
offset += 4;
|
14
|
+
}
|
15
|
+
<% if n%4!=0 %>
|
16
|
+
<% (n % 4).times do |i| %>
|
17
|
+
sum += A[offset + <%= i %>];
|
18
|
+
<% end %>
|
19
|
+
<% end %>
|
20
|
+
<% else %>
|
21
|
+
<% n.times do |i| %>
|
22
|
+
sum += A[offset + <%= i %>];
|
23
|
+
<% end %>
|
24
|
+
<% end %>
|
25
|
+
C[id] = sum / <%= n %>;
|
26
|
+
}
|
@@ -5,7 +5,7 @@
|
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
7
|
|
8
|
-
C[globalRow * N + globalCol] =
|
8
|
+
C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol],(<%= c_dtype %>) B[globalRow * N + globalCol]);
|
9
9
|
}
|
10
10
|
|
11
11
|
// 1D + Scalar floating point add op
|
@@ -14,11 +14,7 @@
|
|
14
14
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
16
16
|
|
17
|
-
|
18
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[0] ? A[globalRow * N + globalCol] : B[0];
|
19
|
-
} else {
|
20
|
-
C[globalRow * N + globalCol] = B[0] <= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
|
21
|
-
}
|
17
|
+
C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>) B[0]);
|
22
18
|
}
|
23
19
|
|
24
20
|
// 1D + Scalar floating point add op broadcast
|
@@ -38,9 +34,5 @@
|
|
38
34
|
b_n_index = b_n_index % N2;
|
39
35
|
}
|
40
36
|
|
41
|
-
|
42
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
|
43
|
-
} else {
|
44
|
-
C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
|
45
|
-
}
|
37
|
+
C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[b_m_index * N2 + b_n_index]);
|
46
38
|
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void prod_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
|
+
// Get the index of the current element to be processed
|
4
|
+
int id = get_global_id(0);
|
5
|
+
int offset = (id + <%= index %>) * <%= w %>;
|
6
|
+
<%= c_dtype %> prod = 1;
|
7
|
+
<% if n > 4 %>
|
8
|
+
for(int i = 0; i < <%= n/4 %> ; i++) {
|
9
|
+
<% sums = 4.times.map do |i|
|
10
|
+
"A[offset + #{i}]"
|
11
|
+
end %>
|
12
|
+
prod *= <%= sums.join(' * ') %>;
|
13
|
+
offset += 4;
|
14
|
+
}
|
15
|
+
<% if n%4!=0 %>
|
16
|
+
<% (n % 4).times do |i| %>
|
17
|
+
prod *= A[offset + <%= i %>];
|
18
|
+
<% end %>
|
19
|
+
<% end %>
|
20
|
+
<% else %>
|
21
|
+
<% n.times do |i| %>
|
22
|
+
prod *= A[offset + <%= i %>];
|
23
|
+
<% end %>
|
24
|
+
<% end %>
|
25
|
+
C[id] = prod;
|
26
|
+
}
|