tensor_stream-opencl 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/benchmark/benchmark.rb +23 -1
- data/benchmark_ryzen.txt +56 -0
- data/lib/tensor_stream/opencl/array_ops.rb +3 -3
- data/lib/tensor_stream/opencl/images_ops.rb +30 -0
- data/lib/tensor_stream/opencl/kernels/conv2d.cl +27 -0
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl +32 -0
- data/lib/tensor_stream/opencl/kernels/gemm.cl +2 -10
- data/lib/tensor_stream/opencl/kernels/max.cl +5 -13
- data/lib/tensor_stream/opencl/kernels/mean.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/min.cl +3 -11
- data/lib/tensor_stream/opencl/kernels/prod.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/relu6.cl +7 -0
- data/lib/tensor_stream/opencl/kernels/round.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sum.cl +26 -0
- data/lib/tensor_stream/opencl/math_ops.rb +86 -29
- data/lib/tensor_stream/opencl/nn_ops.rb +89 -5
- data/lib/tensor_stream/opencl/opencl_buffer.rb +6 -2
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +97 -92
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/iris.rb +2 -2
- data/samples/logistic_regression.rb +84 -0
- data/samples/mnist_data_2.1.rb +9 -4
- data/samples/mnist_data_2.2.rb +12 -7
- data/samples/mnist_data_2.3.rb +111 -0
- data/samples/rnn.rb +1 -1
- data/tensor_stream-opencl.gemspec +2 -1
- metadata +28 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7888f5385dcec4b9a747128d1b25d0cb6d7fb01ab12f1c1b70706d0df5770903
|
4
|
+
data.tar.gz: 1129196d93a7b194d9de888fe49b43b0839c00589fb0987c0841ad1ac1693a4c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 727d97b9aa1402ed9681eb71fe0c0cdcc966e8a9e2f08b7480fdefa503509e26872b187a0b184d31a5b0cee86dc96e1d97f4fce14b378fa4a7c8ce4679b06421
|
7
|
+
data.tar.gz: 0e2f1601543d582042d0911222c1209f2df2c08139b7acc8ee347b5d7e236cd451a1b00032fb75d6f532105fb8f4ee49e53614071d111e0fabc8ed44010c4301
|
data/.gitignore
CHANGED
data/benchmark/benchmark.rb
CHANGED
@@ -26,7 +26,7 @@ tf.set_random_seed(seed)
|
|
26
26
|
SHAPES = [32, 32]
|
27
27
|
|
28
28
|
sess = tf.session(:ruby_evaluator)
|
29
|
-
|
29
|
+
large_tensor = tf.constant(sess.run(tf.random_uniform([256, 256])))
|
30
30
|
a = tf.constant(sess.run(tf.random_uniform(SHAPES)))
|
31
31
|
a_int = tf.constant([
|
32
32
|
[1, 2, 3, 4, 4, 1, 4, 8, 3, 4, 1, 1],
|
@@ -49,6 +49,9 @@ c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
|
|
49
49
|
|
50
50
|
d = tf.constant(sess.run(tf.random_uniform(SHAPES)))
|
51
51
|
|
52
|
+
sample_image = tf.constant(sess.run(tf.random_uniform([10, 8, 8, 3])))
|
53
|
+
sample_filter = tf.constant(sess.run(tf.random_uniform([2, 2, 3, 3])))
|
54
|
+
|
52
55
|
p = tf.placeholder('float')
|
53
56
|
q = tf.placeholder('float')
|
54
57
|
|
@@ -61,6 +64,13 @@ out_of_order = tf.matmul(a, b) + tf.matmul(c, d)
|
|
61
64
|
softmax = tf.nn.softmax(a)
|
62
65
|
add_n = tf.add_n([a,b,c,d])
|
63
66
|
split = tf.split(a, 4)
|
67
|
+
sum = tf.reduce_sum(large_tensor)
|
68
|
+
sum_axis_1 = tf.reduce_sum(large_tensor, 1)
|
69
|
+
min = tf.min(large_tensor, 1)
|
70
|
+
index = large_tensor[0]
|
71
|
+
|
72
|
+
conv2d = tf.nn.conv2d(sample_image, sample_filter, [1, 1, 1, 1], 'SAME')
|
73
|
+
conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
|
64
74
|
|
65
75
|
puts TensorStream::Evaluator.default_evaluators
|
66
76
|
|
@@ -70,6 +80,18 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
|
|
70
80
|
device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
|
71
81
|
puts "OpenCL device #{device.platform.to_s} #{device.name}"
|
72
82
|
Benchmark.bmbm do |x|
|
83
|
+
x.report("pure ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
|
84
|
+
x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
|
85
|
+
x.report("pure ruby conv2d :") { 100.times do sess.run(conv2d) end }
|
86
|
+
x.report("opencl conv2d :") { 100.times do sess2.run(conv2d) end }
|
87
|
+
x.report("pure ruby arr index :") { 100.times do sess.run(index) end }
|
88
|
+
x.report("opencl arr index :") { 100.times do sess2.run(index) end }
|
89
|
+
x.report("pure ruby min :") { 100.times do sess.run(min) end }
|
90
|
+
x.report("opencl min :") { 100.times do sess2.run(min) end }
|
91
|
+
x.report("pure ruby sum :") { 100.times do sess.run(sum) end }
|
92
|
+
x.report("opencl sum :") { 100.times do sess2.run(sum) end }
|
93
|
+
x.report("pure ruby sum axis 1 :") { 100.times do sess.run(sum_axis_1) end }
|
94
|
+
x.report("opencl sum axis 1 :") { 100.times do sess2.run(sum_axis_1) end }
|
73
95
|
x.report("pure ruby split :") { 100.times do sess.run(split) end }
|
74
96
|
x.report("opencl split :") { 100.times do sess2.run(split) end }
|
75
97
|
x.report("pure ruby add_n :") { 100.times do sess.run(add_n) end }
|
data/benchmark_ryzen.txt
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
TensorStream::Evaluator::OpenclEvaluator
|
2
|
+
TensorStream::Evaluator::RubyEvaluator
|
3
|
+
model name : AMD Ryzen 3 1300X Quad-Core Processor
|
4
|
+
OpenCL device NVIDIA CUDA GeForce GTX 1060 6GB
|
5
|
+
Rehearsal --------------------------------------------------------------
|
6
|
+
pure ruby arr index : 0.005448 0.003557 0.009005 ( 0.008999)
|
7
|
+
opencl arr index : 0.074642 0.190132 0.264774 ( 0.275557)
|
8
|
+
pure ruby min : 0.256004 0.000777 0.256781 ( 0.256682)
|
9
|
+
opencl min : 0.017543 0.004523 0.022066 ( 0.018797)
|
10
|
+
pure ruby sum : 0.313039 0.000565 0.313604 ( 0.313535)
|
11
|
+
opencl sum : 0.009037 0.004249 0.013286 ( 0.011073)
|
12
|
+
pure ruby split : 0.017223 0.000300 0.017523 ( 0.017542)
|
13
|
+
opencl split : 0.033489 0.014394 0.047883 ( 0.038798)
|
14
|
+
pure ruby add_n : 0.159864 0.000153 0.160017 ( 0.159992)
|
15
|
+
opencl add_n : 0.018535 0.000563 0.019098 ( 0.016168)
|
16
|
+
pure ruby ooo matmul : 1.390970 0.000304 1.391274 ( 1.390790)
|
17
|
+
opencl ooo matmul : 0.014119 0.000229 0.014348 ( 0.011738)
|
18
|
+
pure ruby softmax : 0.024103 0.000014 0.024117 ( 0.024135)
|
19
|
+
opencl softmax : 0.010602 0.004277 0.014879 ( 0.011941)
|
20
|
+
pure ruby matmul : 0.668126 0.000006 0.668132 ( 0.667778)
|
21
|
+
opencl matmul : 0.006672 0.007527 0.014199 ( 0.011594)
|
22
|
+
pure ruby : 2.388817 0.000005 2.388822 ( 2.387870)
|
23
|
+
opencl : 0.152289 0.007804 0.160093 ( 0.156279)
|
24
|
+
pure ruby single function: 0.356575 0.000062 0.356637 ( 0.356488)
|
25
|
+
opencl singlefunction: 0.120073 0.000210 0.120283 ( 0.116378)
|
26
|
+
pure ruby pow float: 0.088966 0.000051 0.089017 ( 0.088996)
|
27
|
+
opencl pow float: 0.018054 0.000100 0.018154 ( 0.015429)
|
28
|
+
pure ruby pow int: 0.025430 0.000070 0.025500 ( 0.025524)
|
29
|
+
opencl pow int: 0.015652 0.003880 0.019532 ( 0.017386)
|
30
|
+
----------------------------------------------------- total: 6.429024sec
|
31
|
+
|
32
|
+
user system total real
|
33
|
+
pure ruby arr index : 0.003564 0.000110 0.003674 ( 0.003636)
|
34
|
+
opencl arr index : 0.007966 0.003974 0.011940 ( 0.009775)
|
35
|
+
pure ruby min : 0.246153 0.000102 0.246255 ( 0.246172)
|
36
|
+
opencl min : 0.011787 0.007785 0.019572 ( 0.016169)
|
37
|
+
pure ruby sum : 0.294371 0.000000 0.294371 ( 0.294335)
|
38
|
+
opencl sum : 0.008266 0.003879 0.012145 ( 0.009315)
|
39
|
+
pure ruby split : 0.014552 0.000000 0.014552 ( 0.014539)
|
40
|
+
opencl split : 0.037984 0.004037 0.042021 ( 0.033276)
|
41
|
+
pure ruby add_n : 0.146300 0.000053 0.146353 ( 0.146319)
|
42
|
+
opencl add_n : 0.006426 0.007827 0.014253 ( 0.011461)
|
43
|
+
pure ruby ooo matmul : 1.373232 0.000096 1.373328 ( 1.372788)
|
44
|
+
opencl ooo matmul : 0.013838 0.000000 0.013838 ( 0.011088)
|
45
|
+
pure ruby softmax : 0.024478 0.000000 0.024478 ( 0.024493)
|
46
|
+
opencl softmax : 0.014117 0.000022 0.014139 ( 0.011246)
|
47
|
+
pure ruby matmul : 0.653146 0.000054 0.653200 ( 0.652889)
|
48
|
+
opencl matmul : 0.002750 0.011934 0.014684 ( 0.011729)
|
49
|
+
pure ruby : 2.392733 0.000058 2.392791 ( 2.391726)
|
50
|
+
opencl : 0.140118 0.016001 0.156119 ( 0.151788)
|
51
|
+
pure ruby single function: 0.352515 0.000000 0.352515 ( 0.352443)
|
52
|
+
opencl singlefunction: 0.093955 0.011813 0.105768 ( 0.102301)
|
53
|
+
pure ruby pow float: 0.083659 0.000000 0.083659 ( 0.083623)
|
54
|
+
opencl pow float: 0.017433 0.000125 0.017558 ( 0.014508)
|
55
|
+
pure ruby pow int: 0.018381 0.000000 0.018381 ( 0.018391)
|
56
|
+
opencl pow int: 0.008186 0.003755 0.011941 ( 0.009828)
|
@@ -197,6 +197,7 @@ module TensorStream
|
|
197
197
|
ops = if axis.zero? # fast path
|
198
198
|
inputs.each_with_index.map do |input, index|
|
199
199
|
next if input.empty_value?
|
200
|
+
|
200
201
|
start = index * input.buffer.size * input.buffer.element_size
|
201
202
|
region = [input.buffer.size * input.buffer.element_size, 1, 1]
|
202
203
|
event_wait_list = build_event_wait_list(input)
|
@@ -339,7 +340,7 @@ module TensorStream
|
|
339
340
|
|
340
341
|
register_op :index, noop: true do |context, tensor, inputs|
|
341
342
|
a = _run(inputs[0], context)
|
342
|
-
index = read_final_result(_run(inputs[1], context))
|
343
|
+
index = inputs[1].value || read_final_result(_run(inputs[1], context))
|
343
344
|
|
344
345
|
if a.is_a?(TensorStream::Evaluator::OutputGroup)
|
345
346
|
a.outputs[index]
|
@@ -348,8 +349,7 @@ module TensorStream
|
|
348
349
|
else
|
349
350
|
new_shape = a.shape.dup
|
350
351
|
new_shape.shift
|
351
|
-
|
352
|
-
convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
|
352
|
+
_create_result_sub_buffer(a, index, tensor.data_type, new_shape, "#{tensor.name}/out_#{index}")
|
353
353
|
end
|
354
354
|
end
|
355
355
|
|
@@ -7,9 +7,23 @@ module TensorStream
|
|
7
7
|
register_op :decode_png do |context, tensor, inputs|
|
8
8
|
content = _run(inputs[0], context)
|
9
9
|
channels = tensor.options[:channels]
|
10
|
+
resample_new_shape = tensor.options[:new_shape]
|
11
|
+
resample_method = tensor.options[:resample_method] || :bilinear
|
10
12
|
channels = 4 if channels.zero?
|
11
13
|
|
12
14
|
image = ChunkyPNG::Image.from_blob(content.buffer.to_a.pack('C*'))
|
15
|
+
|
16
|
+
if resample_new_shape
|
17
|
+
case resample_method
|
18
|
+
when :bilinear
|
19
|
+
image.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, # height
|
20
|
+
when :nearest_neighbor
|
21
|
+
image.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
|
22
|
+
else
|
23
|
+
raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
13
27
|
output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")
|
14
28
|
|
15
29
|
image.grayscale! if channels == 1
|
@@ -38,6 +52,10 @@ module TensorStream
|
|
38
52
|
|
39
53
|
register_op :encode_png do |_context, tensor, inputs|
|
40
54
|
image_data = inputs[0]
|
55
|
+
|
56
|
+
resample_new_shape = tensor.options[:new_shape]
|
57
|
+
resample_method = tensor.options[:resample_method] || :bilinear
|
58
|
+
|
41
59
|
height, width, channels = image_data.shape
|
42
60
|
image_buffer = image_data.buffer.reshape(*image_data.shape.reverse).to_a
|
43
61
|
\
|
@@ -53,6 +71,18 @@ module TensorStream
|
|
53
71
|
end
|
54
72
|
end
|
55
73
|
end
|
74
|
+
|
75
|
+
if resample_new_shape
|
76
|
+
case resample_method
|
77
|
+
when :bilinear
|
78
|
+
png.resample_bilinear!(resample_new_shape[1], resample_new_shape[0]) # width, # height
|
79
|
+
when :nearest_neighbor
|
80
|
+
png.resample_nearest_neighbor!(resample_new_shape[1], resample_new_shape[0])
|
81
|
+
else
|
82
|
+
raise TensorStream::ValueError, "invalid resample method provided #{resample_method}. Available (:bilinear, :nearest_neighbor)"
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
56
86
|
convert_to_opencl(png.to_s, [], data_type: :string, name: tensor.name)
|
57
87
|
end
|
58
88
|
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
% ctype = dtype_to_c_type(dtype)
|
2
|
+
|
3
|
+
__kernel void conv2d(const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *filter, __global <%= ctype %> *output) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
const int batch_index = get_global_id(0);
|
6
|
+
const int h_index = get_global_id(1);
|
7
|
+
const int w_index = get_global_id(2);
|
8
|
+
const int h_index_with_stride = h_index * <%= stride[0] %>;
|
9
|
+
const int w_index_with_stride = w_index * <%= stride[1] %>;
|
10
|
+
|
11
|
+
const int image_index = batch_index * height * width * <%= ch %>;
|
12
|
+
const int image_row_width = width * <%= ch %>;
|
13
|
+
|
14
|
+
for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
|
15
|
+
<%= ctype %> sum = 0;
|
16
|
+
for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
|
17
|
+
for(int y = 0; y < <%= fh %>; y++) {
|
18
|
+
for (int x = 0; x < <%= fw %>; x++) {
|
19
|
+
if ( (h_index_with_stride + y) < height && (w_index_with_stride + x) < width) {
|
20
|
+
sum += images[image_index + (h_index_with_stride + y)*image_row_width + (w_index_with_stride + x)*<%= ch %> + channel_index] * filter[y*<%= fw * ch * out_ch %> + x*<%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
|
21
|
+
}
|
22
|
+
}
|
23
|
+
}
|
24
|
+
}
|
25
|
+
output[batch_index * (height/<%= stride[0] %>) * (width/<%= stride[1] %>) * <%= out_ch %> + h_index * (width/<%= stride[1] %>) * <%= out_ch %> + w_index * <%= out_ch %> + out_channel_index ] = sum;
|
26
|
+
}
|
27
|
+
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
% ctype = dtype_to_c_type(dtype)
|
2
|
+
|
3
|
+
__kernel void conv2d_backprop_filter(const int batch_size, const int height, const int width, __global const <%= ctype %> *images, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
const int fh_index = get_global_id(0);
|
6
|
+
const int fw_index = get_global_id(1);
|
7
|
+
const int f_out_channel = get_global_id(2);
|
8
|
+
const int image_size = height * width * <%= ch %>;
|
9
|
+
const int grad_image_row_width = width * <%= out_ch %>;
|
10
|
+
|
11
|
+
for(int channel = 0; channel < <%= ch %>; channel++) {
|
12
|
+
<%= ctype %> grad_sum = 0.0;
|
13
|
+
for(int batch = 0; batch < batch_size; batch++) {
|
14
|
+
const int image_index = batch * height * width * <%= out_ch %>;
|
15
|
+
for(int y = 0; y < height; y++) {
|
16
|
+
for (int x = 0; x < width; x++) {
|
17
|
+
if ( ((y - fh_index) % <%= stride[0]%>) == 0 && ((x - fw_index) % <%= stride[1]%>) == 0 && fh_index <= y && fw_index <= x) {
|
18
|
+
const <%= ctype %> image_grad = grad[image_index + ((y - fh_index) / <%= stride[0] %>) * grad_image_row_width + ((x - fw_index) / <%= stride[1] %>) * <%= out_ch %> + f_out_channel];
|
19
|
+
grad_sum += images[batch * image_size + y * width * <%= ch %> + x * <%= ch %> + channel] * image_grad;
|
20
|
+
}
|
21
|
+
}
|
22
|
+
}
|
23
|
+
}
|
24
|
+
output[fh_index * <%= fw * ch * out_ch %> + fw_index * <%= ch * out_ch %> + channel * <%= out_ch %> + f_out_channel] = grad_sum;
|
25
|
+
}
|
26
|
+
}
|
@@ -0,0 +1,32 @@
|
|
1
|
+
% ctype = dtype_to_c_type(dtype)
|
2
|
+
|
3
|
+
__kernel void conv2d_backprop_input(const int height, const int width, __global const <%= ctype %> *filter, __global const <%= ctype %> *grad, __global <%= ctype %> *output) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
int batch_index = get_global_id(0);
|
6
|
+
int h_index = get_global_id(1); // orig image y
|
7
|
+
int w_index = get_global_id(2); // orig image x
|
8
|
+
|
9
|
+
int h_index_with_stride = h_index / <%= stride[0] %>;
|
10
|
+
int w_index_with_stride = w_index / <%= stride[1] %>;
|
11
|
+
int grad_height = height / <%= stride[0] %>;
|
12
|
+
int grad_width = width / <%= stride[1] %>;
|
13
|
+
|
14
|
+
int image_index = batch_index * grad_height * grad_width * <%= out_ch %>;
|
15
|
+
int image_row_width = grad_width * <%= out_ch %>;
|
16
|
+
|
17
|
+
for (int channel_index = 0; channel_index < <%= ch %>; channel_index++) {
|
18
|
+
<%= ctype %> g = 0.0;
|
19
|
+
for (int out_channel_index = 0; out_channel_index < <%= out_ch %>; out_channel_index++) {
|
20
|
+
for(int y = 0; y < <%= fh %>; y++) {
|
21
|
+
for (int x = 0; x < <%= fw %>; x++) {
|
22
|
+
if ( (y <= h_index) && (x <= w_index) && ( (h_index - y) % <%= stride[0]%> == 0) && ( (w_index - x) % <%= stride[1]%> == 0)) {
|
23
|
+
<%= ctype %> imag_grad = grad[image_index + ( (h_index - y) / <%= stride[0] %>) * image_row_width + ( (w_index - x) / <%= stride[1] %>) * <%= out_ch %> + out_channel_index];
|
24
|
+
g += imag_grad * filter[y * <%= fw * ch * out_ch %> + x * <%= ch * out_ch %> + (channel_index*<%= out_ch %>) + out_channel_index];
|
25
|
+
}
|
26
|
+
}
|
27
|
+
}
|
28
|
+
}
|
29
|
+
|
30
|
+
output[batch_index * height * width * <%= ch %> + h_index * width * <%= ch %> + w_index * <%= ch %> + channel_index ] = g;
|
31
|
+
}
|
32
|
+
}
|
@@ -1,8 +1,6 @@
|
|
1
1
|
// First naive implementation
|
2
2
|
% c_dtype = dtype_to_c_type(dtype)
|
3
3
|
__kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
|
4
|
-
const int A_transpose,
|
5
|
-
const int B_transpose,
|
6
4
|
const __global <%= c_dtype %>* A,
|
7
5
|
const __global <%= c_dtype %>* B,
|
8
6
|
__global <%= c_dtype %>* C) {
|
@@ -16,14 +14,8 @@ __kernel void gemm_<%= dtype %>(const int M, const int N, const int K,
|
|
16
14
|
for (int k=0; k<K; k++) {
|
17
15
|
int a_index = globalRow*K + k;
|
18
16
|
int b_index = k*N + globalCol;
|
19
|
-
|
20
|
-
|
21
|
-
a_index = M*k + globalRow;
|
22
|
-
}
|
23
|
-
|
24
|
-
if (B_transpose) {
|
25
|
-
b_index = globalCol*K + k;
|
26
|
-
}
|
17
|
+
<% if ta %>a_index = M*k + globalRow;<% end %>
|
18
|
+
<% if tb %>b_index = globalCol*K + k;<% end %>
|
27
19
|
acc += A[a_index] * B[b_index];
|
28
20
|
}
|
29
21
|
|
@@ -5,7 +5,7 @@
|
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
7
|
|
8
|
-
C[globalRow * N + globalCol] =
|
8
|
+
C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[globalRow * N + globalCol]);
|
9
9
|
}
|
10
10
|
|
11
11
|
// 1D + Scalar floating point add op
|
@@ -13,12 +13,8 @@
|
|
13
13
|
// Get the index of the current element to be processed
|
14
14
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
16
|
-
|
17
|
-
|
18
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[0] ? A[globalRow * N + globalCol] : B[0];
|
19
|
-
} else {
|
20
|
-
C[globalRow * N + globalCol] = B[0] >= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
|
21
|
-
}
|
16
|
+
|
17
|
+
C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[0]);
|
22
18
|
}
|
23
19
|
|
24
20
|
// 1D + Scalar floating point add op broadcast
|
@@ -26,7 +22,7 @@
|
|
26
22
|
// Get the index of the current element to be processed
|
27
23
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
28
24
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
29
|
-
|
25
|
+
|
30
26
|
int b_m_index = globalRow;
|
31
27
|
int b_n_index = globalCol;
|
32
28
|
|
@@ -38,9 +34,5 @@
|
|
38
34
|
b_n_index = b_n_index % N2;
|
39
35
|
}
|
40
36
|
|
41
|
-
|
42
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] >= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
|
43
|
-
} else {
|
44
|
-
C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] >= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
|
45
|
-
}
|
37
|
+
C[globalRow * N + globalCol] = max((<%= c_dtype %>)A[globalRow * N + globalCol],(<%= c_dtype %>) B[b_m_index * N2 + b_n_index]);
|
46
38
|
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void mean_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
|
+
// Get the index of the current element to be processed
|
4
|
+
const int id = get_global_id(0);
|
5
|
+
int offset = (id + <%= index %>) * <%= w %>;
|
6
|
+
<%= c_dtype %> sum = 0;
|
7
|
+
<% if n > 4 %>
|
8
|
+
for(int i = 0; i < <%= n/4 %> ; i++) {
|
9
|
+
<% sums = 4.times.map do |i|
|
10
|
+
"A[offset + #{i}]"
|
11
|
+
end %>
|
12
|
+
sum += <%= sums.join(' + ') %>;
|
13
|
+
offset += 4;
|
14
|
+
}
|
15
|
+
<% if n%4!=0 %>
|
16
|
+
<% (n % 4).times do |i| %>
|
17
|
+
sum += A[offset + <%= i %>];
|
18
|
+
<% end %>
|
19
|
+
<% end %>
|
20
|
+
<% else %>
|
21
|
+
<% n.times do |i| %>
|
22
|
+
sum += A[offset + <%= i %>];
|
23
|
+
<% end %>
|
24
|
+
<% end %>
|
25
|
+
C[id] = sum / <%= n %>;
|
26
|
+
}
|
@@ -5,7 +5,7 @@
|
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
7
|
|
8
|
-
C[globalRow * N + globalCol] =
|
8
|
+
C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol],(<%= c_dtype %>) B[globalRow * N + globalCol]);
|
9
9
|
}
|
10
10
|
|
11
11
|
// 1D + Scalar floating point add op
|
@@ -14,11 +14,7 @@
|
|
14
14
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
16
16
|
|
17
|
-
|
18
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[0] ? A[globalRow * N + globalCol] : B[0];
|
19
|
-
} else {
|
20
|
-
C[globalRow * N + globalCol] = B[0] <= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
|
21
|
-
}
|
17
|
+
C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>) B[0]);
|
22
18
|
}
|
23
19
|
|
24
20
|
// 1D + Scalar floating point add op broadcast
|
@@ -38,9 +34,5 @@
|
|
38
34
|
b_n_index = b_n_index % N2;
|
39
35
|
}
|
40
36
|
|
41
|
-
|
42
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
|
43
|
-
} else {
|
44
|
-
C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
|
45
|
-
}
|
37
|
+
C[globalRow * N + globalCol] = min((<%= c_dtype %>)A[globalRow * N + globalCol], (<%= c_dtype %>)B[b_m_index * N2 + b_n_index]);
|
46
38
|
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void prod_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
|
+
// Get the index of the current element to be processed
|
4
|
+
int id = get_global_id(0);
|
5
|
+
int offset = (id + <%= index %>) * <%= w %>;
|
6
|
+
<%= c_dtype %> prod = 1;
|
7
|
+
<% if n > 4 %>
|
8
|
+
for(int i = 0; i < <%= n/4 %> ; i++) {
|
9
|
+
<% sums = 4.times.map do |i|
|
10
|
+
"A[offset + #{i}]"
|
11
|
+
end %>
|
12
|
+
prod *= <%= sums.join(' * ') %>;
|
13
|
+
offset += 4;
|
14
|
+
}
|
15
|
+
<% if n%4!=0 %>
|
16
|
+
<% (n % 4).times do |i| %>
|
17
|
+
prod *= A[offset + <%= i %>];
|
18
|
+
<% end %>
|
19
|
+
<% end %>
|
20
|
+
<% else %>
|
21
|
+
<% n.times do |i| %>
|
22
|
+
prod *= A[offset + <%= i %>];
|
23
|
+
<% end %>
|
24
|
+
<% end %>
|
25
|
+
C[id] = prod;
|
26
|
+
}
|