tensor_stream-opencl 0.2.10 → 0.3.0
- checksums.yaml +4 -4
- data/benchmark/benchmark.rb +12 -0
- data/benchmark_ryzen_nvidia.txt +80 -0
- data/lib/tensor_stream/opencl/kernels/arg_axis.cl +42 -0
- data/lib/tensor_stream/opencl/kernels/argmax.cl +12 -6
- data/lib/tensor_stream/opencl/kernels/argmin.cl +12 -6
- data/lib/tensor_stream/opencl/kernels/bias_add.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/bias_add_grad.cl +10 -0
- data/lib/tensor_stream/opencl/kernels/reduce_axis.cl +42 -0
- data/lib/tensor_stream/opencl/math_ops.rb +62 -38
- data/lib/tensor_stream/opencl/opencl_buffer.rb +4 -0
- data/lib/tensor_stream/opencl/opencl_template_helper.rb +21 -0
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/tensor_stream-opencl.gemspec +1 -1
- metadata +9 -4
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d2b7fa8bb0dcaa041e5ec390418ffac2bb40d777086e9ebe771f9f42d79ccabc
+  data.tar.gz: e21e0cf638e177da495e9748087808c51aee15ac10212c856ee789ae8c32d458
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 31a1cc671716dee18d6841e39f3bf43c5326a423be59e118e5acf35777aaed9b29acdfdd3425d6ce48f5f5c5f8e5692fae687a592828be0a458f337498965008
+  data.tar.gz: ecef0d771882d662996aa31cfa4eaf9e8cef720c2b05eb48f872ce9c3fbe9e07dfc99427332a6bcbf5fb8054f7f00d27fc55184462a559be834635aaad358c30
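For context, checksums.yaml records digests of the metadata.gz and data.tar.gz members inside the .gem archive. A minimal verification sketch using Ruby's standard digest and rubygems libraries; the local .gem path is a hypothetical download, not part of this release:

require 'digest'
require 'rubygems/package'

# A .gem file is a tar archive; compare these digests against checksums.yaml.
tar = Gem::Package::TarReader.new(File.open('tensor_stream-opencl-0.3.0.gem', 'rb'))
tar.each do |entry|
  next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
  puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
end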
data/benchmark/benchmark.rb
CHANGED

@@ -43,6 +43,8 @@ a_int = tf.constant([
   [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
 ])
 
+large_tensor_bias = tf.constant(sess.run(tf.random_uniform([256])))
+
 b = tf.constant(sess.run(tf.random_uniform(SHAPES)))
 
 c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
@@ -67,11 +69,15 @@ split = tf.split(a, 4)
 sum = tf.reduce_sum(large_tensor)
 sum_axis_1 = tf.reduce_sum(large_tensor, 1)
 min = tf.min(large_tensor, 1)
+argmin = tf.argmin(large_tensor)
 index = large_tensor[0]
 
 conv2d = tf.nn.conv2d(sample_image, sample_filter, [1, 1, 1, 1], 'SAME')
 conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
 
+bias_add = tf.nn.bias_add(large_tensor, large_tensor_bias)
+bias_add_grad = tf.gradients(bias_add, [large_tensor_bias])
+
 puts TensorStream::Evaluator.default_evaluators
 
 sess2 = tf.session
@@ -80,6 +86,12 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
 device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
 puts "OpenCL device #{device.platform.to_s} #{device.name}"
 Benchmark.bmbm do |x|
+  x.report("pure ruby argmin :") { 100.times do sess.run(argmin) end }
+  x.report("opencl argmin :") { 100.times do sess2.run(argmin) end }
+  x.report("pure ruby bias_add_grad :") { 100.times do sess.run(bias_add_grad) end }
+  x.report("opencl bias_add_grad :") { 100.times do sess2.run(bias_add_grad) end }
+  x.report("pure ruby bias_add :") { 100.times do sess.run(bias_add) end }
+  x.report("opencl bias_add :") { 100.times do sess2.run(bias_add) end }
   x.report("pure ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
   x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
   x.report("pure ruby conv2d :") { 100.times do sess.run(conv2d) end }
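A note on the harness: Benchmark.bmbm from the Ruby standard library runs each report block twice, first a rehearsal pass to warm caches and settle garbage collection, then the measured pass, which is why the results file below contains two tables. A self-contained illustration:

require 'benchmark'

# bmbm prints a "Rehearsal" table first, then the measured run.
Benchmark.bmbm do |x|
  x.report('example:') { 100.times { Math.sqrt(2) } }
end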
data/benchmark_ryzen_nvidia.txt
ADDED

@@ -0,0 +1,80 @@
+TensorStream::Evaluator::OpenclEvaluator
+TensorStream::Evaluator::RubyEvaluator
+model name : AMD Ryzen 3 1300X Quad-Core Processor
+OpenCL device NVIDIA CUDA GeForce GTX 1060 6GB
+Rehearsal ------------------------------------------------------------------------
+pure ruby argmin :            0.708414   0.007882   0.716296 (  0.717201)
+opencl argmin :               0.204186   0.222389   0.426575 (  0.447862)
+pure ruby bias_add_grad :     2.048097   0.005187   2.053284 (  2.057617)
+opencl bias_add_grad :        0.012482   0.000426   0.012908 (  0.013225)
+pure ruby bias_add :          2.406516   0.000087   2.406603 (  2.406307)
+opencl bias_add :             0.136466   0.008025   0.144491 (  0.134989)
+pure ruby conv2d_backprop :   3.685220   0.000155   3.685375 (  3.685049)
+opencl conv2d_backprop :      0.028940   0.008031   0.036971 (  0.029904)
+pure ruby conv2d :            0.788991   0.000041   0.789032 (  0.788881)
+opencl conv2d :               0.020150   0.000138   0.020288 (  0.016917)
+pure ruby arr index :         0.003036   0.000000   0.003036 (  0.003044)
+opencl arr index :            0.009626   0.000023   0.009649 (  0.006703)
+pure ruby min :               3.767836   0.007871   3.775707 (  3.773523)
+opencl min :                  0.141541   0.008039   0.149580 (  0.139246)
+pure ruby sum :               3.219801   0.000076   3.219877 (  3.218388)
+opencl sum :                  0.007480   0.004074   0.011554 (  0.008261)
+pure ruby sum axis 1 :        3.203423   0.000000   3.203423 (  3.201832)
+opencl sum axis 1 :           0.011710   0.000033   0.011743 (  0.008379)
+pure ruby split :             0.016504   0.000008   0.016512 (  0.016529)
+opencl split :                0.041059   0.012026   0.053085 (  0.043289)
+pure ruby add_n :             0.141810   0.000000   0.141810 (  0.141721)
+opencl add_n :                0.013751   0.000000   0.013751 (  0.012208)
+pure ruby ooo matmul :        1.395286   0.000000   1.395286 (  1.394697)
+opencl ooo matmul :           0.013448   0.000000   0.013448 (  0.009873)
+pure ruby softmax :           0.025362   0.000018   0.025380 (  0.025382)
+opencl softmax :              0.014999   0.000051   0.015050 (  0.011977)
+pure ruby matmul :            0.666863   0.000000   0.666863 (  0.666499)
+opencl matmul :               0.008572   0.003920   0.012492 (  0.009246)
+pure ruby :                   2.429792   0.000005   2.429797 (  2.428788)
+opencl :                      0.140862   0.004014   0.144876 (  0.137264)
+pure ruby single function:    0.340247   0.000000   0.340247 (  0.340184)
+opencl singlefunction:        0.084871   0.007956   0.092827 (  0.087899)
+pure ruby pow float:          0.083372   0.000000   0.083372 (  0.083339)
+opencl pow float:             0.013498   0.000014   0.013512 (  0.010353)
+pure ruby pow int:            0.018739   0.000000   0.018739 (  0.018753)
+opencl pow int:               0.007737   0.004041   0.011778 (  0.008817)
+-------------------------------------------------------------- total: 26.165217sec
+
+                                  user     system      total        real
+pure ruby argmin :            0.677097   0.000009   0.677106 (  0.676828)
+opencl argmin :               0.005919   0.003950   0.009869 (  0.006618)
+pure ruby bias_add_grad :     2.027326   0.000000   2.027326 (  2.026399)
+opencl bias_add_grad :        0.011544   0.000050   0.011594 (  0.008380)
+pure ruby bias_add :          2.378283   0.000000   2.378283 (  2.377411)
+opencl bias_add :             0.130993   0.011994   0.142987 (  0.132772)
+pure ruby conv2d_backprop :   3.738167   0.000000   3.738167 (  3.737946)
+opencl conv2d_backprop :      0.031267   0.003958   0.035225 (  0.030381)
+pure ruby conv2d :            0.794182   0.000000   0.794182 (  0.794100)
+opencl conv2d :               0.015865   0.004020   0.019885 (  0.016878)
+pure ruby arr index :         0.003112   0.000000   0.003112 (  0.003109)
+opencl arr index :            0.012100   0.000000   0.012100 (  0.009728)
+pure ruby min :               3.669509   0.003944   3.673453 (  3.671906)
+opencl min :                  0.137071   0.004055   0.141126 (  0.131802)
+pure ruby sum :               3.210619   0.000000   3.210619 (  3.210064)
+opencl sum :                  0.002431   0.008030   0.010461 (  0.007522)
+pure ruby sum axis 1 :        3.208789   0.000000   3.208789 (  3.208125)
+opencl sum axis 1 :           0.006075   0.003963   0.010038 (  0.007679)
+pure ruby split :             0.013985   0.000000   0.013985 (  0.013990)
+opencl split :                0.029464   0.011999   0.041463 (  0.030797)
+pure ruby add_n :             0.140984   0.000003   0.140987 (  0.140959)
+opencl add_n :                0.003146   0.007934   0.011080 (  0.007778)
+pure ruby ooo matmul :        1.416585   0.000000   1.416585 (  1.416290)
+opencl ooo matmul :           0.011156   0.000000   0.011156 (  0.008723)
+pure ruby softmax :           0.024724   0.000000   0.024724 (  0.024731)
+opencl softmax :              0.006237   0.003945   0.010182 (  0.009005)
+pure ruby matmul :            0.679538   0.000000   0.679538 (  0.680048)
+opencl matmul :               0.003456   0.007965   0.011421 (  0.008568)
+pure ruby :                   2.437790   0.004031   2.441821 (  2.443126)
+opencl :                      0.133039   0.003996   0.137035 (  0.130579)
+pure ruby single function:    0.332269   0.004003   0.336272 (  0.336201)
+opencl singlefunction:        0.078607   0.004009   0.082616 (  0.078640)
+pure ruby pow float:          0.081409   0.000000   0.081409 (  0.081364)
+opencl pow float:             0.011501   0.000000   0.011501 (  0.008471)
+pure ruby pow int:            0.016687   0.000000   0.016687 (  0.016711)
+opencl pow int:               0.007061   0.003950   0.011011 (  0.007819)
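For a sense of scale, rough speedups implied by the measured (post-rehearsal) table, computed from the "real" column:

# Ratios of pure-ruby real time to opencl real time, from the table above.
puts 2.377411 / 0.132772   # bias_add      => ~17.9x
puts 2.026399 / 0.008380   # bias_add_grad => ~241.8x
puts 0.676828 / 0.006618   # argmin        => ~102.3x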
data/lib/tensor_stream/opencl/kernels/arg_axis.cl
ADDED

@@ -0,0 +1,42 @@
+% c_dtype = dtype_to_c_type(dtype)
+% out_c_dtype = dtype_to_c_type(out_dtype)
+% o_multipliers = o_shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+% i_multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+% out_ops = o_multipliers.map.with_index { |m, index| "id_#{index} * #{m}" }.join(' + ')
+% axis = axis[0]
+% in_axis_multipliers = i_multipliers.select.with_index { |m, index| axis == index }
+% in_axis_ops = in_axis_multipliers.map.with_index { |m| "i * #{m}" }.join(' + ')
+% in_output_multipliers = i_multipliers.reject.with_index { |m, index| axis == index }
+% in_output_ops = in_output_multipliers.map.with_index { |m, index| "id_#{index} * #{m}" }.join(' + ')
+__kernel void arg_axis_<%= dtype %>(__global const <%= c_dtype %> *value, __global <%= out_c_dtype %> *output) {
+  // Get the index of the current element to be processed
+  <% o_multipliers.size.times.each_with_index do |s, index| %>
+  const int id_<%= index %> = get_global_id(<%= index %>);
+  <% end %>
+
+  <%= c_dtype %> min_or_max_value = <%= f == :argmax ? min_value_for(dtype) : max_value_for(dtype) %>;
+  int min_or_max_index = 0;
+
+  for (int i = 0; i < <%= shape[axis] %>; i++) {
+
+    int index = <%= in_axis_ops %>;
+
+    <% unless in_output_ops.empty? %>
+    index += <%= in_output_ops %>;
+    <% end %>
+    <%= case(f)
+        when :argmax
+          "if (value[index] > min_or_max_value) {"
+        when :argmin
+          "if (value[index] < min_or_max_value) {"
+        else
+          raise "unknown reduction func #{f}"
+        end
+    %>
+      min_or_max_index = i;
+      min_or_max_value = value[index];
+    }
+  }
+
+  output[<%= out_ops %>] = (<%= out_c_dtype %>)min_or_max_index;
+}
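The multiplier lines in this template compute row-major strides for each dimension of the input and output shapes. A quick check of the same expression in plain Ruby, using an arbitrary example shape not taken from the gem:

shape = [2, 3, 4] # example shape
strides = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
p strides # => [12, 4, 1], so element [i, j, k] sits at flat offset i*12 + j*4 + k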
data/lib/tensor_stream/opencl/kernels/argmax.cl
CHANGED

@@ -1,8 +1,14 @@
 % c_dtype = dtype_to_c_type(dtype)
-
-
-
-
-
-
+% out_c_dtype = dtype_to_c_type(out_dtype)
+__kernel void argmax_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+  <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+  <%= out_c_dtype %> max_index = 0;
+
+  for(int i = 0; i < <%= n %>; i++) {
+    if (A[i] > max) {
+      max = A[i];
+      max_index = i;
+    }
+  }
+  C[0] = max_index;
 }
data/lib/tensor_stream/opencl/kernels/argmin.cl
CHANGED

@@ -1,8 +1,14 @@
 % c_dtype = dtype_to_c_type(dtype)
-
-
-
-
-
-
+% out_c_dtype = dtype_to_c_type(out_dtype)
+__kernel void argmin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+  <%= c_dtype %> min = <%= max_value_for(dtype) %>;
+  <%= out_c_dtype %> min_index = 0;
+
+  for(int i = 0; i < <%= n %>; i++) {
+    if (A[i] < min) {
+      min = A[i];
+      min_index = i;
+    }
+  }
+  C[0] = min_index;
 }
data/lib/tensor_stream/opencl/kernels/bias_add.cl
ADDED

@@ -0,0 +1,9 @@
+% c_dtype = dtype_to_c_type(dtype)
+
+__kernel void bias_add_<%= dtype %>(__global const <%= c_dtype %> *value, __constant const <%= c_dtype %> *bias, __global <%= c_dtype %> *output) {
+  const int id = get_global_id(0);
+
+  for(int i = 0; i < <%= n %>; i++) {
+    output[<%= n %> * id + i] = value[<%= n %> * id + i] + bias[i];
+  }
+}
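To see what the indexing does, here is a plain-Ruby mirror of the kernel body: each work item id owns one length-n row of the flattened value buffer and adds the bias element-wise. The values are illustrative, not from the gem:

n = 3                                    # bias length (last dimension)
value = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]   # two flattened rows
bias  = [0.1, 0.2, 0.3]
output = Array.new(value.size)

2.times do |id|                          # one iteration per simulated work item
  n.times { |i| output[n * id + i] = value[n * id + i] + bias[i] }
end
p output # => [1.1, 2.2, 3.3, 4.1, 5.2, 6.3]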
data/lib/tensor_stream/opencl/kernels/bias_add_grad.cl
ADDED

@@ -0,0 +1,10 @@
+% c_dtype = dtype_to_c_type(dtype)
+
+__kernel void bias_add_grad_<%= dtype %>(__global const <%= c_dtype %> *received_grad, __global <%= c_dtype %> *output) {
+  const int id = get_global_id(0);
+  <%= c_dtype %> sum = 0;
+  for(int i = 0; i < <%= rows %>; i++) {
+    sum += received_grad[<%= n %> * i + id];
+  }
+  output[id] = sum;
+}
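The gradient kernel is the column-wise counterpart: work item id sums entry id of every row of the incoming gradient. A minimal Ruby mirror with illustrative values:

rows, n = 2, 3
received_grad = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] # rows x n, flattened row-major
output = Array.new(n) do |id|
  (0...rows).sum { |i| received_grad[n * i + id] }
end
p output # => [5.0, 7.0, 9.0]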
data/lib/tensor_stream/opencl/kernels/reduce_axis.cl
ADDED

@@ -0,0 +1,42 @@
+% c_dtype = dtype_to_c_type(dtype)
+% o_multipliers = o_shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+% i_multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+% out_ops = o_multipliers.map.with_index { |m, index| "id_#{index} * #{m}" }.join(' + ')
+% in_axis_multipliers = i_multipliers.select.with_index { |m, index| axis.include?(index) }
+% in_axis_ops = in_axis_multipliers.map.with_index { |m, index| "i_#{index} * #{m}" }.join(' + ')
+% in_output_multipliers = i_multipliers.reject.with_index { |m, index| axis.include?(index) }
+% in_output_ops = in_output_multipliers.map.with_index { |m, index| "id_#{index} * #{m}" }.join(' + ')
+__kernel void reduce_axis_<%= dtype %>(__global const <%= c_dtype %> *value, __global <%= c_dtype %> *output) {
+  // Get the index of the current element to be processed
+  <% o_multipliers.size.times.each_with_index do |s, index| %>
+  const int id_<%= index %> = get_global_id(<%= index %>);
+  <% end %>
+
+  <%= c_dtype %> sum = <%= f == :prod ? 1 : 0 %>;
+  <%= c_dtype %> item_size = 0;
+  <% axis.each_with_index do |axis, index| %>
+  for (int i_<%= index %> = 0; i_<%= index %> < <%= shape[axis] %>; i_<%= index %>++) {
+  <% end %>
+    int index = <%= in_axis_ops %>;
+    item_size += 1;
+    <% unless in_output_ops.empty? %>
+    index += <%= in_output_ops %>;
+    <% end %>
+    <%= case(f)
+        when :sum, :mean
+          "sum += value[index];"
+        when :prod
+          "sum *= value[index];"
+        else
+          raise "unknown reduction func #{f}"
+        end
+    %>
+  <% axis.each do |axis| %>
+  }
+  <% end %>
+  <% if f == :mean %>
+  output[<%= out_ops %>] = sum / item_size;
+  <% else %>
+  output[<%= out_ops %>] = sum;
+  <% end %>
+}
data/lib/tensor_stream/opencl/math_ops.rb
CHANGED

@@ -90,6 +90,36 @@ module TensorStream
       output_buffer
     end
 
+    register_op :bias_add do |context, tensor, inputs|
+      value, bias = inputs
+      output_buffer = _create_result_buffer(value.data_type, value.shape, tensor.name)
+      result_shape = value.shape.dup
+      bias_length = result_shape.pop
+      work_group = [result_shape.reduce(:*)]
+      event_wait_list = build_event_wait_list([value, bias])
+      dtype = tensor.data_type
+      output_buffer.op = _cl_program('bias_add', n: bias_length, dtype: dtype)
+                         .send(:"bias_add_#{dtype}", _opencl_queue, work_group, value.cl_buffer,
+                               bias.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+      output_buffer
+    end
+
+    register_op :bias_add_grad do |context, tensor, inputs|
+      received_grad = inputs[0]
+      bias_size = received_grad.shape.last
+      output_buffer = _create_result_buffer(received_grad.data_type, [bias_size], tensor.name)
+      work_group = [bias_size]
+
+      received_grad_shape = received_grad.shape.dup
+      received_grad_shape.pop
+      item_rows = received_grad_shape.reduce(:*)
+      dtype = tensor.data_type
+      output_buffer.op = _cl_program('bias_add_grad', n: bias_size, rows: item_rows, dtype: dtype)
+                         .send(:"bias_add_grad_#{dtype}", _opencl_queue, work_group, received_grad.cl_buffer,
+                               output_buffer.cl_buffer, event_wait_list: build_event_wait_list([received_grad]))
+      output_buffer
+    end
+
     %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil log].each do |op|
       register_op op, noop: true do |context, tensor, inputs|
         execute_func(op.to_s, tensor, inputs[0], context)
@@ -110,25 +140,18 @@ module TensorStream
       end
     end
 
-
-
-
-
-
-    #   arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
-    #   op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
-    #   convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
-    # end
+    %i[argmin argmax].each do |op|
+      register_op op do |context, tensor, inputs|
+        value, axis = inputs
+        rank = value.shape.size
+        axis = 0 if axis.nil?
 
-
-
-    #   rank = inputs[0].shape.size
-    #   raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+        axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+        raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
 
-
-
-
-    # end
+        reduce_multi_axis(context, tensor, value, axis, 'arg', op.to_sym)
+      end
+    end
 
     def reduction(child_context, tensor, value, axis, func)
       if axis.nil?
@@ -164,33 +187,34 @@ module TensorStream
         end
       end
     else
-
+      reduce_multi_axis(child_context, tensor, value, axis, 'reduce', func)
+    end
+  end
 
-
-
+  def reduce_multi_axis(child_context, tensor, value, axis, prog, func)
+    return value if value.shape.empty?
 
-
-    rank = input.shape.size - 1
+    rank = value.shape.size
 
-
-
-
-
-
-    value = value.send(func, rank - axis.abs)
-    end
+    axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+    axis = [axis] unless axis.is_a?(Array)
+    return value if axis.empty?
+    # remap negative values
+    axis.map! { |axis| axis < 0 ? rank - axis.abs : axis }
 
-
-    value.shape.reverse
-    else
-    value = [value]
-    []
-    end
+    new_shape = value.shape.collect.with_index { |v, index| axis.include?(index) ? nil : v }.compact
 
-
+    buffer_shape = tensor.options[:keepdims] ? _reduced_shape(value.shape.dup, axis) : new_shape
+    output_buffer = _create_result_buffer(tensor.options[:output_type] || tensor.data_type, buffer_shape, tensor.name)
 
-
-
+    work_group = new_shape.empty? ? [1] : new_shape
+    dtype = value.data_type
+
+    output_buffer.op = _cl_program("#{prog}_axis", f: func, axis: axis, shape: value.shape, o_shape: new_shape, dtype: dtype, out_dtype: tensor.options[:output_type])
+                       .send("#{prog}_axis_#{dtype}", _opencl_queue, work_group, value.cl_buffer,
+                             output_buffer.cl_buffer, event_wait_list: build_event_wait_list([value]))
+
+    output_buffer
   end
 end
 end
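With these registrations, argmin/argmax and multi-axis reductions dispatch to the generated arg_axis/reduce_axis kernels. A hedged sketch of how this surfaces through the tensor_stream API; the require line for the OpenCL plugin is assumed:

require 'tensor_stream'
require 'tensor_stream/opencl' # assumed entry point for this gem

tf = TensorStream
sess = tf.session
a = tf.constant([[1.0, 9.0, 3.0], [7.0, 2.0, 8.0]])

p sess.run(tf.argmin(a, 1))     # per-row index of the minimum -> arg_axis.cl
p sess.run(tf.reduce_sum(a, 1)) # per-row sum -> reduce_axis.cl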
data/lib/tensor_stream/opencl/opencl_buffer.rb
CHANGED

@@ -23,6 +23,10 @@ module TensorStream
       @shape == [0]
     end
 
+    def inspect
+      "CLBuffer(shape: #{shape || "?"} data_type: #{data_type}, cl_allocated: #{cl_buffer ? cl_buffer.size : 'unallocated'}) -> raw: #{buffer.to_a}"
+    end
+
     def to_ruby
       return [] if buffer.empty?
data/lib/tensor_stream/opencl/opencl_template_helper.rb
CHANGED

@@ -76,6 +76,27 @@ class OpenclTemplateHelper
     end
   end
 
+  def max_value_for(dtype)
+    case dtype.to_s
+    when 'float64'
+      'DBL_MAX'
+    when 'float32', 'float', 'float16'
+      'FLT_MAX'
+    when 'int32', 'int'
+      'INT_MAX'
+    when 'uint32', 'uint16'
+      '0'
+    when 'int16'
+      'SHRT_MAX'
+    when 'int8'
+      '256'
+    when 'boolean'
+      '1'
+    else
+      raise "unknown dtype #{dtype}"
+    end
+  end
+
   def operator_to_c(op)
     case op
     when 'less'
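For context, most of these return values (DBL_MAX, FLT_MAX, INT_MAX, SHRT_MAX) are macros predefined in OpenCL C, so they can be spliced straight into kernel source. Assuming dtype_to_c_type maps float32 to float, the argmin.cl template line <%= c_dtype %> min = <%= max_value_for(dtype) %>; renders as float min = FLT_MAX;.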
data/tensor_stream-opencl.gemspec
CHANGED

@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "awesome_print"
   spec.add_development_dependency "mnist-learn"
   spec.add_development_dependency "simplecov"
-  spec.add_dependency "tensor_stream", "1.0.
+  spec.add_dependency "tensor_stream", "1.0.6"
   spec.add_dependency "opencl_ruby_ffi"
   spec.add_dependency "oily_png"
 end
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream-opencl
 version: !ruby/object:Gem::Version
-  version: 0.2.10
+  version: 0.3.0
 platform: ruby
 authors:
 - Joseph Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-03-
+date: 2019-03-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -114,14 +114,14 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 1.0.
+        version: 1.0.6
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
      - !ruby/object:Gem::Version
-        version: 1.0.
+        version: 1.0.6
 - !ruby/object:Gem::Dependency
   name: opencl_ruby_ffi
   requirement: !ruby/object:Gem::Requirement
@@ -169,6 +169,7 @@ files:
 - benchmark/benchmark.rb
 - benchmark_intel.txt
 - benchmark_ryzen.txt
+- benchmark_ryzen_nvidia.txt
 - bin/console
 - bin/setup
 - lib/tensor_stream/opencl.rb
@@ -186,9 +187,12 @@ files:
 - lib/tensor_stream/opencl/kernels/apply_gradient.cl
 - lib/tensor_stream/opencl/kernels/apply_momentum.cl
 - lib/tensor_stream/opencl/kernels/apply_rms_prop.cl
+- lib/tensor_stream/opencl/kernels/arg_axis.cl
 - lib/tensor_stream/opencl/kernels/argmax.cl
 - lib/tensor_stream/opencl/kernels/argmin.cl
 - lib/tensor_stream/opencl/kernels/asin.cl
+- lib/tensor_stream/opencl/kernels/bias_add.cl
+- lib/tensor_stream/opencl/kernels/bias_add_grad.cl
 - lib/tensor_stream/opencl/kernels/cast.cl
 - lib/tensor_stream/opencl/kernels/ceil.cl
 - lib/tensor_stream/opencl/kernels/concat.cl
@@ -217,6 +221,7 @@ files:
 - lib/tensor_stream/opencl/kernels/prod.cl
 - lib/tensor_stream/opencl/kernels/real_div.cl
 - lib/tensor_stream/opencl/kernels/reciprocal.cl
+- lib/tensor_stream/opencl/kernels/reduce_axis.cl
 - lib/tensor_stream/opencl/kernels/relu6.cl
 - lib/tensor_stream/opencl/kernels/round.cl
 - lib/tensor_stream/opencl/kernels/sigmoid.cl