tensor_stream-opencl 0.2.10 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/benchmark/benchmark.rb +12 -0
- data/benchmark_ryzen_nvidia.txt +80 -0
- data/lib/tensor_stream/opencl/kernels/arg_axis.cl +42 -0
- data/lib/tensor_stream/opencl/kernels/argmax.cl +12 -6
- data/lib/tensor_stream/opencl/kernels/argmin.cl +12 -6
- data/lib/tensor_stream/opencl/kernels/bias_add.cl +9 -0
- data/lib/tensor_stream/opencl/kernels/bias_add_grad.cl +10 -0
- data/lib/tensor_stream/opencl/kernels/reduce_axis.cl +42 -0
- data/lib/tensor_stream/opencl/math_ops.rb +62 -38
- data/lib/tensor_stream/opencl/opencl_buffer.rb +4 -0
- data/lib/tensor_stream/opencl/opencl_template_helper.rb +21 -0
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/tensor_stream-opencl.gemspec +1 -1
- metadata +9 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d2b7fa8bb0dcaa041e5ec390418ffac2bb40d777086e9ebe771f9f42d79ccabc
+  data.tar.gz: e21e0cf638e177da495e9748087808c51aee15ac10212c856ee789ae8c32d458
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 31a1cc671716dee18d6841e39f3bf43c5326a423be59e118e5acf35777aaed9b29acdfdd3425d6ce48f5f5c5f8e5692fae687a592828be0a458f337498965008
+  data.tar.gz: ecef0d771882d662996aa31cfa4eaf9e8cef720c2b05eb48f872ce9c3fbe9e07dfc99427332a6bcbf5fb8054f7f00d27fc55184462a559be834635aaad358c30
data/benchmark/benchmark.rb
CHANGED
@@ -43,6 +43,8 @@ a_int = tf.constant([
   [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
 ])

+large_tensor_bias = tf.constant(sess.run(tf.random_uniform([256])))
+
 b = tf.constant(sess.run(tf.random_uniform(SHAPES)))

 c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
@@ -67,11 +69,15 @@ split = tf.split(a, 4)
 sum = tf.reduce_sum(large_tensor)
 sum_axis_1 = tf.reduce_sum(large_tensor, 1)
 min = tf.min(large_tensor, 1)
+argmin = tf.argmin(large_tensor)
 index = large_tensor[0]

 conv2d = tf.nn.conv2d(sample_image, sample_filter, [1, 1, 1, 1], 'SAME')
 conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])

+bias_add = tf.nn.bias_add(large_tensor, large_tensor_bias)
+bias_add_grad = tf.gradients(bias_add, [large_tensor_bias])
+
 puts TensorStream::Evaluator.default_evaluators

 sess2 = tf.session
@@ -80,6 +86,12 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
 device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
 puts "OpenCL device #{device.platform.to_s} #{device.name}"
 Benchmark.bmbm do |x|
+  x.report("pure ruby argmin :") { 100.times do sess.run(argmin) end }
+  x.report("opencl argmin :") { 100.times do sess2.run(argmin) end }
+  x.report("pure ruby bias_add_grad :") { 100.times do sess.run(bias_add_grad) end }
+  x.report("opencl bias_add_grad :") { 100.times do sess2.run(bias_add_grad) end }
+  x.report("pure ruby bias_add :") { 100.times do sess.run(bias_add) end }
+  x.report("opencl bias_add :") { 100.times do sess2.run(bias_add) end }
   x.report("pure ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
   x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
   x.report("pure ruby conv2d :") { 100.times do sess.run(conv2d) end }
data/benchmark_ryzen_nvidia.txt
ADDED
@@ -0,0 +1,80 @@
+TensorStream::Evaluator::OpenclEvaluator
+TensorStream::Evaluator::RubyEvaluator
+model name : AMD Ryzen 3 1300X Quad-Core Processor
+OpenCL device NVIDIA CUDA GeForce GTX 1060 6GB
+Rehearsal ------------------------------------------------------------------------
+pure ruby argmin : 0.708414 0.007882 0.716296 ( 0.717201)
+opencl argmin : 0.204186 0.222389 0.426575 ( 0.447862)
+pure ruby bias_add_grad : 2.048097 0.005187 2.053284 ( 2.057617)
+opencl bias_add_grad : 0.012482 0.000426 0.012908 ( 0.013225)
+pure ruby bias_add : 2.406516 0.000087 2.406603 ( 2.406307)
+opencl bias_add : 0.136466 0.008025 0.144491 ( 0.134989)
+pure ruby conv2d_backprop : 3.685220 0.000155 3.685375 ( 3.685049)
+opencl conv2d_backprop : 0.028940 0.008031 0.036971 ( 0.029904)
+pure ruby conv2d : 0.788991 0.000041 0.789032 ( 0.788881)
+opencl conv2d : 0.020150 0.000138 0.020288 ( 0.016917)
+pure ruby arr index : 0.003036 0.000000 0.003036 ( 0.003044)
+opencl arr index : 0.009626 0.000023 0.009649 ( 0.006703)
+pure ruby min : 3.767836 0.007871 3.775707 ( 3.773523)
+opencl min : 0.141541 0.008039 0.149580 ( 0.139246)
+pure ruby sum : 3.219801 0.000076 3.219877 ( 3.218388)
+opencl sum : 0.007480 0.004074 0.011554 ( 0.008261)
+pure ruby sum axis 1 : 3.203423 0.000000 3.203423 ( 3.201832)
+opencl sum axis 1 : 0.011710 0.000033 0.011743 ( 0.008379)
+pure ruby split : 0.016504 0.000008 0.016512 ( 0.016529)
+opencl split : 0.041059 0.012026 0.053085 ( 0.043289)
+pure ruby add_n : 0.141810 0.000000 0.141810 ( 0.141721)
+opencl add_n : 0.013751 0.000000 0.013751 ( 0.012208)
+pure ruby ooo matmul : 1.395286 0.000000 1.395286 ( 1.394697)
+opencl ooo matmul : 0.013448 0.000000 0.013448 ( 0.009873)
+pure ruby softmax : 0.025362 0.000018 0.025380 ( 0.025382)
+opencl softmax : 0.014999 0.000051 0.015050 ( 0.011977)
+pure ruby matmul : 0.666863 0.000000 0.666863 ( 0.666499)
+opencl matmul : 0.008572 0.003920 0.012492 ( 0.009246)
+pure ruby : 2.429792 0.000005 2.429797 ( 2.428788)
+opencl : 0.140862 0.004014 0.144876 ( 0.137264)
+pure ruby single function: 0.340247 0.000000 0.340247 ( 0.340184)
+opencl singlefunction: 0.084871 0.007956 0.092827 ( 0.087899)
+pure ruby pow float: 0.083372 0.000000 0.083372 ( 0.083339)
+opencl pow float: 0.013498 0.000014 0.013512 ( 0.010353)
+pure ruby pow int: 0.018739 0.000000 0.018739 ( 0.018753)
+opencl pow int: 0.007737 0.004041 0.011778 ( 0.008817)
+-------------------------------------------------------------- total: 26.165217sec
+
+ user system total real
+pure ruby argmin : 0.677097 0.000009 0.677106 ( 0.676828)
+opencl argmin : 0.005919 0.003950 0.009869 ( 0.006618)
+pure ruby bias_add_grad : 2.027326 0.000000 2.027326 ( 2.026399)
+opencl bias_add_grad : 0.011544 0.000050 0.011594 ( 0.008380)
+pure ruby bias_add : 2.378283 0.000000 2.378283 ( 2.377411)
+opencl bias_add : 0.130993 0.011994 0.142987 ( 0.132772)
+pure ruby conv2d_backprop : 3.738167 0.000000 3.738167 ( 3.737946)
+opencl conv2d_backprop : 0.031267 0.003958 0.035225 ( 0.030381)
+pure ruby conv2d : 0.794182 0.000000 0.794182 ( 0.794100)
+opencl conv2d : 0.015865 0.004020 0.019885 ( 0.016878)
+pure ruby arr index : 0.003112 0.000000 0.003112 ( 0.003109)
+opencl arr index : 0.012100 0.000000 0.012100 ( 0.009728)
+pure ruby min : 3.669509 0.003944 3.673453 ( 3.671906)
+opencl min : 0.137071 0.004055 0.141126 ( 0.131802)
+pure ruby sum : 3.210619 0.000000 3.210619 ( 3.210064)
+opencl sum : 0.002431 0.008030 0.010461 ( 0.007522)
+pure ruby sum axis 1 : 3.208789 0.000000 3.208789 ( 3.208125)
+opencl sum axis 1 : 0.006075 0.003963 0.010038 ( 0.007679)
+pure ruby split : 0.013985 0.000000 0.013985 ( 0.013990)
+opencl split : 0.029464 0.011999 0.041463 ( 0.030797)
+pure ruby add_n : 0.140984 0.000003 0.140987 ( 0.140959)
+opencl add_n : 0.003146 0.007934 0.011080 ( 0.007778)
+pure ruby ooo matmul : 1.416585 0.000000 1.416585 ( 1.416290)
+opencl ooo matmul : 0.011156 0.000000 0.011156 ( 0.008723)
+pure ruby softmax : 0.024724 0.000000 0.024724 ( 0.024731)
+opencl softmax : 0.006237 0.003945 0.010182 ( 0.009005)
+pure ruby matmul : 0.679538 0.000000 0.679538 ( 0.680048)
+opencl matmul : 0.003456 0.007965 0.011421 ( 0.008568)
+pure ruby : 2.437790 0.004031 2.441821 ( 2.443126)
+opencl : 0.133039 0.003996 0.137035 ( 0.130579)
+pure ruby single function: 0.332269 0.004003 0.336272 ( 0.336201)
+opencl singlefunction: 0.078607 0.004009 0.082616 ( 0.078640)
+pure ruby pow float: 0.081409 0.000000 0.081409 ( 0.081364)
+opencl pow float: 0.011501 0.000000 0.011501 ( 0.008471)
+pure ruby pow int: 0.016687 0.000000 0.016687 ( 0.016711)
+opencl pow int: 0.007061 0.003950 0.011011 ( 0.007819)
data/lib/tensor_stream/opencl/kernels/arg_axis.cl
ADDED
@@ -0,0 +1,42 @@
+% c_dtype = dtype_to_c_type(dtype)
+% out_c_dtype = dtype_to_c_type(out_dtype)
+% o_multipliers = o_shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+% i_multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+% out_ops = o_multipliers.map.with_index { |m, index| "id_#{index} * #{m}"}.join(' + ')
+% axis = axis[0]
+% in_axis_multipliers = i_multipliers.select.with_index { |m, index| axis == index }
+% in_axis_ops = in_axis_multipliers.map.with_index { |m| "i * #{m}"}.join(' + ')
+% in_output_multipliers = i_multipliers.reject.with_index { |m, index| axis == index }
+% in_output_ops = in_output_multipliers.map.with_index { |m, index| "id_#{index} * #{m}"}.join(' + ')
+__kernel void arg_axis_<%= dtype %>(__global const <%= c_dtype %> *value, __global <%= out_c_dtype %> *output) {
+    // Get the index of the current element to be processed
+    <% o_multipliers.size.times.each_with_index do |s, index| %>
+    const int id_<%= index %> = get_global_id(<%= index %>);
+    <% end %>
+
+    <%= c_dtype %> min_or_max_value = <%= f == :argmax ? min_value_for(dtype) : max_value_for(dtype) %>;
+    int min_or_max_index = 0;
+
+    for (int i = 0; i < <%= shape[axis] %>; i++) {
+
+      int index = <%= in_axis_ops %>;
+
+      <% unless in_output_ops.empty? %>
+      index += <%= in_output_ops %>;
+      <% end %>
+      <%= case(f)
+          when :argmax
+            "if (value[index] > min_or_max_value) {"
+          when :argmin
+            "if (value[index] < min_or_max_value) {"
+          else
+            raise "unkown redunction func #{f}"
+          end
+      %>
+        min_or_max_index = i;
+        min_or_max_value = value[index];
+      }
+    }
+
+    output[<%= out_ops %>] = (<%= out_c_dtype %>)min_or_max_index;
+}
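A note on the template machinery visible above: the gem's .cl kernel files are ERB templates, rendered per dtype/shape before the source is handed to the OpenCL compiler. Lines starting with % run as Ruby and <%= %> interpolates values into the kernel source. A minimal standalone sketch of that rendering style (illustrative only, not the gem's internal API):

    require "erb"

    # "%"-prefixed lines execute as Ruby; <%= %> splices values into the source.
    source = <<~CL
      % c_dtype = "float"
      __kernel void demo_<%= c_dtype %>(__global const <%= c_dtype %> *a) {
        const int id = get_global_id(0);
      }
    CL
    puts ERB.new(source, trim_mode: "%").result(binding)
    # => __kernel void demo_float(__global const float *a) { ... }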
data/lib/tensor_stream/opencl/kernels/argmax.cl
CHANGED
@@ -1,8 +1,14 @@
 % c_dtype = dtype_to_c_type(dtype)
-
-
-
-
-
-
+% out_c_dtype = dtype_to_c_type(out_dtype)
+__kernel void argmax_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+    <%= out_c_dtype %> max_index = 0;
+
+    for(int i = 0; i < <%= n %>; i++) {
+      if (A[i] > max) {
+        max = A[i];
+        max_index = i;
+      }
+    }
+    C[0] = max_index;
 }
data/lib/tensor_stream/opencl/kernels/argmin.cl
CHANGED
@@ -1,8 +1,14 @@
 % c_dtype = dtype_to_c_type(dtype)
-
-
-
-
-
-
+% out_c_dtype = dtype_to_c_type(out_dtype)
+__kernel void argmin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+    <%= c_dtype %> min = <%= max_value_for(dtype) %>;
+    <%= out_c_dtype %> min_index = 0;
+
+    for(int i = 0; i < <%= n %>; i++) {
+      if (A[i] < min) {
+        min = A[i];
+        min_index = i;
+      }
+    }
+    C[0] = min_index;
 }
data/lib/tensor_stream/opencl/kernels/bias_add.cl
ADDED
@@ -0,0 +1,9 @@
+% c_dtype = dtype_to_c_type(dtype)
+
+__kernel void bias_add_<%= dtype %>(__global const <%= c_dtype %> *value, __constant const <%= c_dtype %> *bias, __global <%= c_dtype %> *output) {
+    const int id = get_global_id(0);
+
+    for(int i = 0; i < <%= n %>; i++) {
+      output[ <%= n %> * id + i] = value[ <%= n %> * id + i] + bias[i];
+    }
+}
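For context, this kernel implements the semantics of tf.nn.bias_add as exercised in the benchmark above: a rank-1 bias is added along the last axis of the input, one row per work item. A plain-Ruby hand check of what it computes (made-up values; value shaped [rows, n], bias shaped [n]):

    value = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
    bias  = [0.5, 0.5, 0.5]
    # Each work item id handles one row; output[id][i] = value[id][i] + bias[i].
    output = value.map { |row| row.each_with_index.map { |v, i| v + bias[i] } }
    # => [[1.5, 2.5, 3.5], [4.5, 5.5, 6.5]]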
data/lib/tensor_stream/opencl/kernels/bias_add_grad.cl
ADDED
@@ -0,0 +1,10 @@
+% c_dtype = dtype_to_c_type(dtype)
+
+__kernel void bias_add_grad_<%= dtype %>(__global const <%= c_dtype %> *received_grad, __global <%= c_dtype %> *output) {
+    const int id = get_global_id(0);
+    <%= c_dtype %> sum = 0;
+    for(int i = 0; i < <%= rows %>; i++) {
+      sum += received_grad[<%= n %> * i + id];
+    }
+    output[id] = sum;
+}
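The gradient kernel is a column sum: each work item id accumulates received_grad across all rows at column id, which is the derivative of bias_add with respect to the bias. A quick hand check in plain Ruby (made-up values):

    received_grad = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]] # rows = 3, n = 2
    n = 2
    # output[id] = sum of column id over all rows, as in the kernel loop.
    output = (0...n).map { |id| received_grad.sum { |row| row[id] } }
    # => [9.0, 12.0]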
data/lib/tensor_stream/opencl/kernels/reduce_axis.cl
ADDED
@@ -0,0 +1,42 @@
+% c_dtype = dtype_to_c_type(dtype)
+% o_multipliers = o_shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+% i_multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+% out_ops = o_multipliers.map.with_index { |m, index| "id_#{index} * #{m}" }.join(' + ')
+% in_axis_multipliers = i_multipliers.select.with_index { |m, index| axis.include?(index) }
+% in_axis_ops = in_axis_multipliers.map.with_index { |m, index| "i_#{index} * #{m}"}.join(' + ')
+% in_output_multipliers = i_multipliers.reject.with_index { |m, index| axis.include?(index) }
+% in_output_ops = in_output_multipliers.map.with_index { |m, index| "id_#{index} * #{m}"}.join(' + ')
+__kernel void reduce_axis_<%= dtype %>(__global const <%= c_dtype %> *value, __global <%= c_dtype %> *output) {
+    // Get the index of the current element to be processed
+    <% o_multipliers.size.times.each_with_index do |s, index| %>
+    const int id_<%= index %> = get_global_id(<%= index %>);
+    <% end %>
+
+    <%= c_dtype %> sum = <%= f == :prod ? 1 : 0 %>;
+    <%= c_dtype %> item_size = 0;
+    <% axis.each_with_index do |axis, index| %>
+    for (int i_<%= index %> = 0; i_<%= index %> < <%= shape[axis] %>; i_<%= index %>++) {
+    <% end %>
+      int index = <%= in_axis_ops %>;
+      item_size += 1;
+      <% unless in_output_ops.empty? %>
+      index += <%= in_output_ops %>;
+      <% end %>
+      <%= case(f)
+          when :sum, :mean
+            "sum += value[index];"
+          when :prod
+            "sum *= value[index];"
+          else
+            raise "unkown redunction func #{f}"
+          end
+      %>
+    <% axis.each do |axis| %>
+    }
+    <% end %>
+    <% if f == :mean %>
+    output[<%= out_ops %>] = sum / item_size;
+    <% else %>
+    output[<%= out_ops %>] = sum;
+    <% end %>
+}
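The multiplier arrays computed in the % preamble of these kernels are ordinary row-major strides: drop the leading dimension, scan from the right, and each entry is the linear offset contributed by one index. The same inject expression, run standalone on an example shape:

    shape = [2, 3, 4]
    # Row-major strides: element [i, j, k] sits at offset i * 12 + j * 4 + k.
    multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
    # => [12, 4, 1]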
data/lib/tensor_stream/opencl/math_ops.rb
CHANGED
@@ -90,6 +90,36 @@ module TensorStream
       output_buffer
     end

+    register_op :bias_add do |context, tensor, inputs|
+      value, bias = inputs
+      output_buffer = _create_result_buffer(value.data_type, value.shape, tensor.name)
+      result_shape = value.shape.dup
+      bias_length = result_shape.pop
+      work_group = [result_shape.reduce(:*)]
+      event_wait_list = build_event_wait_list([value, bias])
+      dtype = tensor.data_type
+      output_buffer.op = _cl_program('bias_add', n: bias_length, dtype: dtype)
+                         .send(:"bias_add_#{dtype}", _opencl_queue, work_group, value.cl_buffer,
+                               bias.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+      output_buffer
+    end
+
+    register_op :bias_add_grad do |context, tensor, inputs|
+      received_grad = inputs[0]
+      bias_size = received_grad.shape.last
+      output_buffer = _create_result_buffer(received_grad.data_type, [bias_size], tensor.name)
+      work_group = [bias_size]
+
+      received_grad_shape = received_grad.shape.dup
+      received_grad_shape.pop
+      item_rows = received_grad_shape.reduce(:*)
+      dtype = tensor.data_type
+      output_buffer.op = _cl_program('bias_add_grad', n: bias_size, rows: item_rows, dtype: dtype)
+                         .send(:"bias_add_grad_#{dtype}", _opencl_queue, work_group, received_grad.cl_buffer,
+                               output_buffer.cl_buffer, event_wait_list: build_event_wait_list([received_grad]))
+      output_buffer
+    end
+
     %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil log].each do |op|
       register_op op, noop: true do |context, tensor, inputs|
         execute_func(op.to_s, tensor, inputs[0], context)
@@ -110,25 +140,18 @@ module TensorStream
       end
     end

-
-
-
-
-
-      # arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
-      # op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
-      # convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
-    # end
+    %i[argmin argmax].each do |op|
+      register_op op do |context, tensor, inputs|
+        value, axis = inputs
+        rank = value.shape.size
+        axis = 0 if axis.nil?

-
-
-      # rank = inputs[0].shape.size
-      # raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+        axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+        raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank

-
-
-
-    # end
+        reduce_multi_axis(context, tensor, value, axis, 'arg', op.to_sym)
+      end
+    end

     def reduction(child_context, tensor, value, axis, func)
       if axis.nil?
@@ -164,33 +187,34 @@ module TensorStream
       end
     end
     else
-
+      reduce_multi_axis(child_context, tensor, value, axis, 'reduce', func)
+      end
+    end

-
-
+    def reduce_multi_axis(child_context, tensor, value, axis, prog, func)
+      return value if value.shape.empty?

-
-      rank = input.shape.size - 1
+      rank = value.shape.size

-
-
-
-
-
-      value = value.send(func, rank - axis.abs)
-    end
+      axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+      axis = [axis] unless axis.is_a?(Array)
+      return value if axis.empty?
+      # remap negative values
+      axis.map! { |axis| axis < 0 ? rank - axis.abs : axis }

-
-      value.shape.reverse
-    else
-      value = [value]
-      []
-    end
+      new_shape = value.shape.collect.with_index { |v, index| axis.include?(index) ? nil : v }.compact

-
+      buffer_shape = tensor.options[:keepdims] ? _reduced_shape(value.shape.dup, axis) : new_shape
+      output_buffer = _create_result_buffer(tensor.options[:output_type] || tensor.data_type, buffer_shape, tensor.name)

-
-
+      work_group = new_shape.empty? ? [1] : new_shape
+      dtype = value.data_type
+
+      output_buffer.op = _cl_program("#{prog}_axis", f: func, axis: axis, shape: value.shape, o_shape: new_shape, dtype: dtype, out_dtype: tensor.options[:output_type])
+                         .send("#{prog}_axis_#{dtype}", _opencl_queue, work_group, value.cl_buffer,
+                               output_buffer.cl_buffer, event_wait_list: build_event_wait_list([value]))
+
+      output_buffer
     end
   end
 end
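One detail worth noting in reduce_multi_axis above: negative axes are remapped with rank - axis.abs, so -1 selects the last dimension. The same remapping pulled into a standalone snippet for illustration (the method name here is hypothetical, not part of the library):

    # Same remapping as reduce_multi_axis, extracted for clarity.
    def remap_axes(axes, rank)
      Array(axes).map { |a| a < 0 ? rank - a.abs : a }
    end

    remap_axes(-1, 3)      # => [2]  (last axis of a rank-3 tensor)
    remap_axes([0, -2], 3) # => [0, 1]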
data/lib/tensor_stream/opencl/opencl_buffer.rb
CHANGED
@@ -23,6 +23,10 @@ module TensorStream
       @shape == [0]
     end

+    def inspect
+      "CLBuffer(shape: #{shape || "?"} data_type: #{data_type}, cl_allocated: #{cl_buffer ? cl_buffer.size : 'unallocated'}) -> raw: #{buffer.to_a}"
+    end
+
     def to_ruby
       return [] if buffer.empty?

data/lib/tensor_stream/opencl/opencl_template_helper.rb
CHANGED
@@ -76,6 +76,27 @@ class OpenclTemplateHelper
     end
   end

+  def max_value_for(dtype)
+    case dtype.to_s
+    when 'float64'
+      'DBL_MAX'
+    when 'float32', 'float', 'float16'
+      'FLT_MAX'
+    when 'int32', 'int'
+      'INT_MAX'
+    when 'uint32', 'uint16'
+      '0'
+    when 'int16'
+      'SHRT_MAX'
+    when 'int8'
+      '256'
+    when 'boolean'
+      '1'
+    else
+      raise "unknown dtype #{dtype}"
+    end
+  end
+
   def operator_to_c(op)
     case op
     when 'less'
data/tensor_stream-opencl.gemspec
CHANGED
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "awesome_print"
   spec.add_development_dependency "mnist-learn"
   spec.add_development_dependency "simplecov"
-  spec.add_dependency "tensor_stream", "1.0.
+  spec.add_dependency "tensor_stream", "1.0.6"
   spec.add_dependency "opencl_ruby_ffi"
   spec.add_dependency "oily_png"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tensor_stream-opencl
 version: !ruby/object:Gem::Version
-  version: 0.2.10
+  version: 0.3.0
 platform: ruby
 authors:
 - Joseph Dayo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-03-
+date: 2019-03-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -114,14 +114,14 @@ dependencies:
   requirements:
   - - '='
     - !ruby/object:Gem::Version
-      version: 1.0.
+      version: 1.0.6
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
   requirements:
   - - '='
     - !ruby/object:Gem::Version
-      version: 1.0.
+      version: 1.0.6
 - !ruby/object:Gem::Dependency
   name: opencl_ruby_ffi
   requirement: !ruby/object:Gem::Requirement
@@ -169,6 +169,7 @@ files:
 - benchmark/benchmark.rb
 - benchmark_intel.txt
 - benchmark_ryzen.txt
+- benchmark_ryzen_nvidia.txt
 - bin/console
 - bin/setup
 - lib/tensor_stream/opencl.rb
@@ -186,9 +187,12 @@ files:
 - lib/tensor_stream/opencl/kernels/apply_gradient.cl
 - lib/tensor_stream/opencl/kernels/apply_momentum.cl
 - lib/tensor_stream/opencl/kernels/apply_rms_prop.cl
+- lib/tensor_stream/opencl/kernels/arg_axis.cl
 - lib/tensor_stream/opencl/kernels/argmax.cl
 - lib/tensor_stream/opencl/kernels/argmin.cl
 - lib/tensor_stream/opencl/kernels/asin.cl
+- lib/tensor_stream/opencl/kernels/bias_add.cl
+- lib/tensor_stream/opencl/kernels/bias_add_grad.cl
 - lib/tensor_stream/opencl/kernels/cast.cl
 - lib/tensor_stream/opencl/kernels/ceil.cl
 - lib/tensor_stream/opencl/kernels/concat.cl
@@ -217,6 +221,7 @@ files:
 - lib/tensor_stream/opencl/kernels/prod.cl
 - lib/tensor_stream/opencl/kernels/real_div.cl
 - lib/tensor_stream/opencl/kernels/reciprocal.cl
+- lib/tensor_stream/opencl/kernels/reduce_axis.cl
 - lib/tensor_stream/opencl/kernels/relu6.cl
 - lib/tensor_stream/opencl/kernels/round.cl
 - lib/tensor_stream/opencl/kernels/sigmoid.cl