tensor_stream-opencl 0.2.10 → 0.3.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: d590302525812d813924ea639202fa41db60a6b4d46b2f4eafaf01f77910a530
- data.tar.gz: 6413c7d9e5376844fd2da090e6f8e84d23cd0b5e47f2be2a0eef5f82652d6f78
+ metadata.gz: d2b7fa8bb0dcaa041e5ec390418ffac2bb40d777086e9ebe771f9f42d79ccabc
+ data.tar.gz: e21e0cf638e177da495e9748087808c51aee15ac10212c856ee789ae8c32d458
  SHA512:
- metadata.gz: af8845e919363d7d1cb06bf6899a9644e1cc7907908bc9c43a2efb0995c7696e6215445ee60fb802250f6032740cc7169f48404e9141c226bf2a3b0b1caf018d
- data.tar.gz: eb56a21e66f2624f19bc8d19e374dfb967f219806eac538009588db16cf2f2ed89b98f2f6fa35135e28a543db0e6b389d5047da3ce0bf6a26518ab8c1e5cd75b
+ metadata.gz: 31a1cc671716dee18d6841e39f3bf43c5326a423be59e118e5acf35777aaed9b29acdfdd3425d6ce48f5f5c5f8e5692fae687a592828be0a458f337498965008
+ data.tar.gz: ecef0d771882d662996aa31cfa4eaf9e8cef720c2b05eb48f872ce9c3fbe9e07dfc99427332a6bcbf5fb8054f7f00d27fc55184462a559be834635aaad358c30
benchmark/benchmark.rb CHANGED
@@ -43,6 +43,8 @@ a_int = tf.constant([
    [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
  ])

+ large_tensor_bias = tf.constant(sess.run(tf.random_uniform([256])))
+
  b = tf.constant(sess.run(tf.random_uniform(SHAPES)))

  c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
@@ -67,11 +69,15 @@ split = tf.split(a, 4)
  sum = tf.reduce_sum(large_tensor)
  sum_axis_1 = tf.reduce_sum(large_tensor, 1)
  min = tf.min(large_tensor, 1)
+ argmin = tf.argmin(large_tensor)
  index = large_tensor[0]

  conv2d = tf.nn.conv2d(sample_image, sample_filter, [1, 1, 1, 1], 'SAME')
  conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])

+ bias_add = tf.nn.bias_add(large_tensor, large_tensor_bias)
+ bias_add_grad = tf.gradients(bias_add, [large_tensor_bias])
+
  puts TensorStream::Evaluator.default_evaluators

  sess2 = tf.session
@@ -80,6 +86,12 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
  device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
  puts "OpenCL device #{device.platform.to_s} #{device.name}"
  Benchmark.bmbm do |x|
+   x.report("pure ruby argmin :") { 100.times do sess.run(argmin) end }
+   x.report("opencl argmin :") { 100.times do sess2.run(argmin) end }
+   x.report("pure ruby bias_add_grad :") { 100.times do sess.run(bias_add_grad) end }
+   x.report("opencl bias_add_grad :") { 100.times do sess2.run(bias_add_grad) end }
+   x.report("pure ruby bias_add :") { 100.times do sess.run(bias_add) end }
+   x.report("opencl bias_add :") { 100.times do sess2.run(bias_add) end }
    x.report("pure ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
    x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
    x.report("pure ruby conv2d :") { 100.times do sess.run(conv2d) end }

benchmark_ryzen_nvidia.txt ADDED
@@ -0,0 +1,80 @@
+ TensorStream::Evaluator::OpenclEvaluator
+ TensorStream::Evaluator::RubyEvaluator
+ model name : AMD Ryzen 3 1300X Quad-Core Processor
+ OpenCL device NVIDIA CUDA GeForce GTX 1060 6GB
+ Rehearsal ------------------------------------------------------------------------
+ pure ruby argmin : 0.708414 0.007882 0.716296 ( 0.717201)
+ opencl argmin : 0.204186 0.222389 0.426575 ( 0.447862)
+ pure ruby bias_add_grad : 2.048097 0.005187 2.053284 ( 2.057617)
+ opencl bias_add_grad : 0.012482 0.000426 0.012908 ( 0.013225)
+ pure ruby bias_add : 2.406516 0.000087 2.406603 ( 2.406307)
+ opencl bias_add : 0.136466 0.008025 0.144491 ( 0.134989)
+ pure ruby conv2d_backprop : 3.685220 0.000155 3.685375 ( 3.685049)
+ opencl conv2d_backprop : 0.028940 0.008031 0.036971 ( 0.029904)
+ pure ruby conv2d : 0.788991 0.000041 0.789032 ( 0.788881)
+ opencl conv2d : 0.020150 0.000138 0.020288 ( 0.016917)
+ pure ruby arr index : 0.003036 0.000000 0.003036 ( 0.003044)
+ opencl arr index : 0.009626 0.000023 0.009649 ( 0.006703)
+ pure ruby min : 3.767836 0.007871 3.775707 ( 3.773523)
+ opencl min : 0.141541 0.008039 0.149580 ( 0.139246)
+ pure ruby sum : 3.219801 0.000076 3.219877 ( 3.218388)
+ opencl sum : 0.007480 0.004074 0.011554 ( 0.008261)
+ pure ruby sum axis 1 : 3.203423 0.000000 3.203423 ( 3.201832)
+ opencl sum axis 1 : 0.011710 0.000033 0.011743 ( 0.008379)
+ pure ruby split : 0.016504 0.000008 0.016512 ( 0.016529)
+ opencl split : 0.041059 0.012026 0.053085 ( 0.043289)
+ pure ruby add_n : 0.141810 0.000000 0.141810 ( 0.141721)
+ opencl add_n : 0.013751 0.000000 0.013751 ( 0.012208)
+ pure ruby ooo matmul : 1.395286 0.000000 1.395286 ( 1.394697)
+ opencl ooo matmul : 0.013448 0.000000 0.013448 ( 0.009873)
+ pure ruby softmax : 0.025362 0.000018 0.025380 ( 0.025382)
+ opencl softmax : 0.014999 0.000051 0.015050 ( 0.011977)
+ pure ruby matmul : 0.666863 0.000000 0.666863 ( 0.666499)
+ opencl matmul : 0.008572 0.003920 0.012492 ( 0.009246)
+ pure ruby : 2.429792 0.000005 2.429797 ( 2.428788)
+ opencl : 0.140862 0.004014 0.144876 ( 0.137264)
+ pure ruby single function: 0.340247 0.000000 0.340247 ( 0.340184)
+ opencl singlefunction: 0.084871 0.007956 0.092827 ( 0.087899)
+ pure ruby pow float: 0.083372 0.000000 0.083372 ( 0.083339)
+ opencl pow float: 0.013498 0.000014 0.013512 ( 0.010353)
+ pure ruby pow int: 0.018739 0.000000 0.018739 ( 0.018753)
+ opencl pow int: 0.007737 0.004041 0.011778 ( 0.008817)
+ -------------------------------------------------------------- total: 26.165217sec
+
+ user system total real
+ pure ruby argmin : 0.677097 0.000009 0.677106 ( 0.676828)
+ opencl argmin : 0.005919 0.003950 0.009869 ( 0.006618)
+ pure ruby bias_add_grad : 2.027326 0.000000 2.027326 ( 2.026399)
+ opencl bias_add_grad : 0.011544 0.000050 0.011594 ( 0.008380)
+ pure ruby bias_add : 2.378283 0.000000 2.378283 ( 2.377411)
+ opencl bias_add : 0.130993 0.011994 0.142987 ( 0.132772)
+ pure ruby conv2d_backprop : 3.738167 0.000000 3.738167 ( 3.737946)
+ opencl conv2d_backprop : 0.031267 0.003958 0.035225 ( 0.030381)
+ pure ruby conv2d : 0.794182 0.000000 0.794182 ( 0.794100)
+ opencl conv2d : 0.015865 0.004020 0.019885 ( 0.016878)
+ pure ruby arr index : 0.003112 0.000000 0.003112 ( 0.003109)
+ opencl arr index : 0.012100 0.000000 0.012100 ( 0.009728)
+ pure ruby min : 3.669509 0.003944 3.673453 ( 3.671906)
+ opencl min : 0.137071 0.004055 0.141126 ( 0.131802)
+ pure ruby sum : 3.210619 0.000000 3.210619 ( 3.210064)
+ opencl sum : 0.002431 0.008030 0.010461 ( 0.007522)
+ pure ruby sum axis 1 : 3.208789 0.000000 3.208789 ( 3.208125)
+ opencl sum axis 1 : 0.006075 0.003963 0.010038 ( 0.007679)
+ pure ruby split : 0.013985 0.000000 0.013985 ( 0.013990)
+ opencl split : 0.029464 0.011999 0.041463 ( 0.030797)
+ pure ruby add_n : 0.140984 0.000003 0.140987 ( 0.140959)
+ opencl add_n : 0.003146 0.007934 0.011080 ( 0.007778)
+ pure ruby ooo matmul : 1.416585 0.000000 1.416585 ( 1.416290)
+ opencl ooo matmul : 0.011156 0.000000 0.011156 ( 0.008723)
+ pure ruby softmax : 0.024724 0.000000 0.024724 ( 0.024731)
+ opencl softmax : 0.006237 0.003945 0.010182 ( 0.009005)
+ pure ruby matmul : 0.679538 0.000000 0.679538 ( 0.680048)
+ opencl matmul : 0.003456 0.007965 0.011421 ( 0.008568)
+ pure ruby : 2.437790 0.004031 2.441821 ( 2.443126)
+ opencl : 0.133039 0.003996 0.137035 ( 0.130579)
+ pure ruby single function: 0.332269 0.004003 0.336272 ( 0.336201)
+ opencl singlefunction: 0.078607 0.004009 0.082616 ( 0.078640)
+ pure ruby pow float: 0.081409 0.000000 0.081409 ( 0.081364)
+ opencl pow float: 0.011501 0.000000 0.011501 ( 0.008471)
+ pure ruby pow int: 0.016687 0.000000 0.016687 ( 0.016711)
+ opencl pow int: 0.007061 0.003950 0.011011 ( 0.007819)

lib/tensor_stream/opencl/kernels/arg_axis.cl ADDED
@@ -0,0 +1,42 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % out_c_dtype = dtype_to_c_type(out_dtype)
+ % o_multipliers = o_shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % i_multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % out_ops = o_multipliers.map.with_index { |m, index| "id_#{index} * #{m}" }.join(' + ')
+ % axis = axis[0]
+ % in_axis_multipliers = i_multipliers.select.with_index { |m, index| axis == index }
+ % in_axis_ops = in_axis_multipliers.map.with_index { |m| "i * #{m}" }.join(' + ')
+ % in_output_multipliers = i_multipliers.reject.with_index { |m, index| axis == index }
+ % in_output_ops = in_output_multipliers.map.with_index { |m, index| "id_#{index} * #{m}" }.join(' + ')
+ __kernel void arg_axis_<%= dtype %>(__global const <%= c_dtype %> *value, __global <%= out_c_dtype %> *output) {
+   // Get the index of the current element to be processed
+   <% o_multipliers.size.times.each_with_index do |s, index| %>
+   const int id_<%= index %> = get_global_id(<%= index %>);
+   <% end %>
+
+   <%= c_dtype %> min_or_max_value = <%= f == :argmax ? min_value_for(dtype) : max_value_for(dtype) %>;
+   int min_or_max_index = 0;
+
+   for (int i = 0; i < <%= shape[axis] %>; i++) {
+     int index = <%= in_axis_ops %>;
+     <% unless in_output_ops.empty? %>
+     index += <%= in_output_ops %>;
+     <% end %>
+     <%= case f
+         when :argmax
+           "if (value[index] > min_or_max_value) {"
+         when :argmin
+           "if (value[index] < min_or_max_value) {"
+         else
+           raise "unknown reduction func #{f}"
+         end
+     %>
+       min_or_max_index = i;
+       min_or_max_value = value[index];
+     }
+   }
+
+   output[<%= out_ops %>] = (<%= out_c_dtype %>)min_or_max_index;
+ }
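
For orientation, a plain-Ruby illustration (not from the gem) of what the ERB preamble above computes: the stride ("multiplier") of each dimension is the product of the dimensions after it, and the kernel scans the reduced axis keeping the first winning index.

  # Strides for a [2, 3] input, as built by the inject line above
  shape = [2, 3]
  multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
  # => [3, 1]  (element [i, j] lives at flat offset i * 3 + j * 1)

  # Reference semantics of the generated kernel: argmin along axis 0
  value = [[9, 2, 5], [1, 8, 5]]
  argmin_axis0 = (0...shape[1]).map do |j|
    (0...shape[0]).min_by { |i| value[i][j] } # first minimum wins, matching the strict <
  end
  # => [1, 0, 0]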

lib/tensor_stream/opencl/kernels/argmax.cl CHANGED
@@ -1,8 +1,14 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void argmax_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-   // Get the index of the current element to be processed
-   const int globalRow = get_global_id(0); // Row ID of C (0..M)
-   const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ % out_c_dtype = dtype_to_c_type(out_dtype)
+ __kernel void argmax_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+   <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+   <%= out_c_dtype %> max_index = 0;
+
+   for (int i = 0; i < <%= n %>; i++) {
+     if (A[i] > max) {
+       max = A[i];
+       max_index = i;
+     }
+   }
+   C[0] = max_index;
  }

lib/tensor_stream/opencl/kernels/argmin.cl CHANGED
@@ -1,8 +1,14 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void argmin_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
-   // Get the index of the current element to be processed
-   const int globalRow = get_global_id(0); // Row ID of C (0..M)
-   const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-   C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ % out_c_dtype = dtype_to_c_type(out_dtype)
+ __kernel void argmin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+   <%= c_dtype %> min = <%= max_value_for(dtype) %>;
+   <%= out_c_dtype %> min_index = 0;
+
+   for (int i = 0; i < <%= n %>; i++) {
+     if (A[i] < min) {
+       min = A[i];
+       min_index = i;
+     }
+   }
+   C[0] = min_index;
  }
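
The rewritten kernels reduce the whole flat buffer with a single work item and write one index. An illustrative Ruby equivalent (not from the gem):

  a = [3.0, 1.0, 4.0, 1.0, 5.0]
  argmin = a.each_index.min_by { |i| a[i] } # => 1 (ties keep the first index, matching the strict < above)
  argmax = a.each_index.max_by { |i| a[i] } # => 4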

lib/tensor_stream/opencl/kernels/bias_add.cl ADDED
@@ -0,0 +1,9 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void bias_add_<%= dtype %>(__global const <%= c_dtype %> *value, __constant const <%= c_dtype %> *bias, __global <%= c_dtype %> *output) {
+   const int id = get_global_id(0);
+
+   for (int i = 0; i < <%= n %>; i++) {
+     output[<%= n %> * id + i] = value[<%= n %> * id + i] + bias[i];
+   }
+ }
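
Reference semantics of this kernel, sketched in plain Ruby (illustrative only): each work item id handles one row of n elements and adds the shared bias vector to it.

  n = 3
  value = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0] # flat buffer, two rows of n
  bias  = [1.0, 2.0, 3.0]
  output = value.each_slice(n).flat_map { |row| row.zip(bias).map { |v, b| v + b } }
  # => [11.0, 22.0, 33.0, 41.0, 52.0, 63.0]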

lib/tensor_stream/opencl/kernels/bias_add_grad.cl ADDED
@@ -0,0 +1,10 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void bias_add_grad_<%= dtype %>(__global const <%= c_dtype %> *received_grad, __global <%= c_dtype %> *output) {
+   const int id = get_global_id(0);
+   <%= c_dtype %> sum = 0;
+   for (int i = 0; i < <%= rows %>; i++) {
+     sum += received_grad[<%= n %> * i + id];
+   }
+   output[id] = sum;
+ }
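
The bias gradient is the incoming gradient summed over all rows, i.e. a per-column sum with one work item per column. Illustrative Ruby equivalent (not from the gem):

  n = 3
  received_grad = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] # flat buffer, rows = 2
  output = (0...n).map { |id| received_grad.each_slice(n).sum { |row| row[id] } }
  # => [5.0, 7.0, 9.0]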

lib/tensor_stream/opencl/kernels/reduce_axis.cl ADDED
@@ -0,0 +1,42 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % o_multipliers = o_shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % i_multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % out_ops = o_multipliers.map.with_index { |m, index| "id_#{index} * #{m}" }.join(' + ')
+ % in_axis_multipliers = i_multipliers.select.with_index { |m, index| axis.include?(index) }
+ % in_axis_ops = in_axis_multipliers.map.with_index { |m, index| "i_#{index} * #{m}" }.join(' + ')
+ % in_output_multipliers = i_multipliers.reject.with_index { |m, index| axis.include?(index) }
+ % in_output_ops = in_output_multipliers.map.with_index { |m, index| "id_#{index} * #{m}" }.join(' + ')
+ __kernel void reduce_axis_<%= dtype %>(__global const <%= c_dtype %> *value, __global <%= c_dtype %> *output) {
+   // Get the index of the current element to be processed
+   <% o_multipliers.size.times.each_with_index do |s, index| %>
+   const int id_<%= index %> = get_global_id(<%= index %>);
+   <% end %>
+
+   <%= c_dtype %> sum = <%= f == :prod ? 1 : 0 %>;
+   <%= c_dtype %> item_size = 0;
+   <% axis.each_with_index do |axis, index| %>
+   for (int i_<%= index %> = 0; i_<%= index %> < <%= shape[axis] %>; i_<%= index %>++) {
+   <% end %>
+     int index = <%= in_axis_ops %>;
+     item_size += 1;
+     <% unless in_output_ops.empty? %>
+     index += <%= in_output_ops %>;
+     <% end %>
+     <%= case f
+         when :sum, :mean
+           "sum += value[index];"
+         when :prod
+           "sum *= value[index];"
+         else
+           raise "unknown reduction func #{f}"
+         end
+     %>
+   <% axis.each do |axis| %>
+   }
+   <% end %>
+   <% if f == :mean %>
+   output[<%= out_ops %>] = sum / item_size;
+   <% else %>
+   output[<%= out_ops %>] = sum;
+   <% end %>
+ }
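
What the generated kernel computes, written out in plain Ruby for a [2, 3] input reduced along axis 1 (illustrative only; one work item runs per kept index while the loop walks the reduced axis):

  value = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
  sum_axis_1  = value.map(&:sum)                       # => [6.0, 15.0]
  mean_axis_1 = value.map { |row| row.sum / row.size } # => [2.0, 5.0]
  prod_axis_1 = value.map { |row| row.inject(:*) }     # => [6.0, 120.0]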

@@ -90,6 +90,36 @@ module TensorStream
    output_buffer
  end

+ register_op :bias_add do |context, tensor, inputs|
+   value, bias = inputs
+   output_buffer = _create_result_buffer(value.data_type, value.shape, tensor.name)
+   result_shape = value.shape.dup
+   bias_length = result_shape.pop
+   work_group = [result_shape.reduce(:*)]
+   event_wait_list = build_event_wait_list([value, bias])
+   dtype = tensor.data_type
+   output_buffer.op = _cl_program('bias_add', n: bias_length, dtype: dtype)
+                      .send(:"bias_add_#{dtype}", _opencl_queue, work_group, value.cl_buffer,
+                            bias.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+   output_buffer
+ end
+
+ register_op :bias_add_grad do |context, tensor, inputs|
+   received_grad = inputs[0]
+   bias_size = received_grad.shape.last
+   output_buffer = _create_result_buffer(received_grad.data_type, [bias_size], tensor.name)
+   work_group = [bias_size]
+
+   received_grad_shape = received_grad.shape.dup
+   received_grad_shape.pop
+   item_rows = received_grad_shape.reduce(:*)
+   dtype = tensor.data_type
+   output_buffer.op = _cl_program('bias_add_grad', n: bias_size, rows: item_rows, dtype: dtype)
+                      .send(:"bias_add_grad_#{dtype}", _opencl_queue, work_group, received_grad.cl_buffer,
+                            output_buffer.cl_buffer, event_wait_list: build_event_wait_list([received_grad]))
+   output_buffer
+ end
+
  %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil log].each do |op|
    register_op op, noop: true do |context, tensor, inputs|
      execute_func(op.to_s, tensor, inputs[0], context)
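
A worked example (illustrative, not from the gem) of how the bias_add dispatch above sizes its work group: the bias spans the last dimension, and one work item is launched per leading "row".

  shape = [32, 10, 256]
  bias_length = shape.pop          # => 256, passed to the kernel template as n:
  work_group  = [shape.reduce(:*)] # => [320] work items, each adding bias to 256 elements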

@@ -110,25 +140,18 @@ module TensorStream
    end
  end

- # register_op :argmin, buffer: true do |_context, tensor, inputs|
- #   axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
- #   rank = inputs[0].shape.size
- #   raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
-
- #   arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
- #   op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
- #   convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
- # end
+ %i[argmin argmax].each do |op|
+   register_op op do |context, tensor, inputs|
+     value, axis = inputs
+     rank = value.shape.size
+     axis = 0 if axis.nil?

- # register_op :argmax, buffer: true do |_context, tensor, inputs|
- #   axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
- #   rank = inputs[0].shape.size
- #   raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+     axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+     raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank

- #   arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
- #   op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
- #   convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
- # end
+     reduce_multi_axis(context, tensor, value, axis, 'arg', op.to_sym)
+   end
+ end

  def reduction(child_context, tensor, value, axis, func)
    if axis.nil?
@@ -164,33 +187,34 @@ module TensorStream
      end
    end
  else
-   return value if value.shape.empty?
+   reduce_multi_axis(child_context, tensor, value, axis, 'reduce', func)
+ end
+ end

-   axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
-   input = complete_eval(value, child_context)
+ def reduce_multi_axis(child_context, tensor, value, axis, prog, func)
+   return value if value.shape.empty?

-   value = value.buffer.reshape(*value.shape.reverse)
-   rank = input.shape.size - 1
+   rank = value.shape.size

-   if axis.is_a?(Array)
-     axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
-       value = value.send(func, x.to_i)
-     end
-   else
-     value = value.send(func, rank - axis.abs)
-   end
+   axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+   axis = [axis] unless axis.is_a?(Array)
+   return value if axis.empty?
+   # remap negative values
+   axis.map! { |axis| axis < 0 ? rank - axis.abs : axis }

-   new_shape = if value.is_a?(NArray)
-                 value.shape.reverse
-               else
-                 value = [value]
-                 []
-               end
+   new_shape = value.shape.collect.with_index { |v, index| axis.include?(index) ? nil : v }.compact

-   new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
+   buffer_shape = tensor.options[:keepdims] ? _reduced_shape(value.shape.dup, axis) : new_shape
+   output_buffer = _create_result_buffer(tensor.options[:output_type] || tensor.data_type, buffer_shape, tensor.name)

-   convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
- end
+   work_group = new_shape.empty? ? [1] : new_shape
+   dtype = value.data_type
+
+   output_buffer.op = _cl_program("#{prog}_axis", f: func, axis: axis, shape: value.shape, o_shape: new_shape, dtype: dtype, out_dtype: tensor.options[:output_type])
+                      .send("#{prog}_axis_#{dtype}", _opencl_queue, work_group, value.cl_buffer,
+                            output_buffer.cl_buffer, event_wait_list: build_event_wait_list([value]))
+
+   output_buffer
  end
  end
  end
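
The shape bookkeeping in reduce_multi_axis, traced by hand for a hypothetical [2, 3, 4] input reduced along axis -1 (illustrative only):

  rank  = 3
  shape = [2, 3, 4]
  axis  = [-1]                                # callers may pass negative axes
  axis.map! { |a| a < 0 ? rank - a.abs : a }  # => [2]
  new_shape = shape.each_with_index.map { |v, i| axis.include?(i) ? nil : v }.compact
  # => [2, 3]; with keepdims: true the buffer shape would be [2, 3, 1] instead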
@@ -23,6 +23,10 @@ module TensorStream
    @shape == [0]
  end

+ def inspect
+   "CLBuffer(shape: #{shape || "?"} data_type: #{data_type}, cl_allocated: #{cl_buffer ? cl_buffer.size : 'unallocated'}) -> raw: #{buffer.to_a}"
+ end
+
  def to_ruby
    return [] if buffer.empty?

@@ -76,6 +76,27 @@ class OpenclTemplateHelper
    end
  end

+ # Largest representable value per dtype; used as the initial sentinel when
+ # scanning for a minimum (original values for int8 and the unsigned types
+ # were '256' and '0', which cannot serve as maxima; standard limits macros
+ # substituted here).
+ def max_value_for(dtype)
+   case dtype.to_s
+   when 'float64'
+     'DBL_MAX'
+   when 'float32', 'float', 'float16'
+     'FLT_MAX'
+   when 'int32', 'int'
+     'INT_MAX'
+   when 'uint32'
+     'UINT_MAX'
+   when 'uint16'
+     'USHRT_MAX'
+   when 'int16'
+     'SHRT_MAX'
+   when 'int8'
+     'CHAR_MAX'
+   when 'boolean'
+     '1'
+   else
+     raise "unknown dtype #{dtype}"
+   end
+ end
+
  def operator_to_c(op)
    case op
    when 'less'
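
How the kernel templates consume this helper: an argmin scan starts from the type's maximum so that any real element replaces it. Illustrative expansion (the names are standard C/OpenCL limits macros, not gem-specific):

  # In argmin.cl the ERB line  <%= c_dtype %> min = <%= max_value_for(dtype) %>;
  # expands, for dtype float32, to the OpenCL C statement below
  sentinels = { 'float32' => 'FLT_MAX', 'int32' => 'INT_MAX', 'int16' => 'SHRT_MAX' }
  kernel_line = "float min = #{sentinels['float32']};"
  # => "float min = FLT_MAX;"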

lib/tensor_stream/opencl/version.rb CHANGED
@@ -1,5 +1,5 @@
  module TensorStream
    module Opencl
-     VERSION = "0.2.10"
+     VERSION = "0.3.0"
    end
  end

tensor_stream-opencl.gemspec CHANGED
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "awesome_print"
  spec.add_development_dependency "mnist-learn"
  spec.add_development_dependency "simplecov"
- spec.add_dependency "tensor_stream", "1.0.5"
+ spec.add_dependency "tensor_stream", "1.0.6"
  spec.add_dependency "opencl_ruby_ffi"
  spec.add_dependency "oily_png"
end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tensor_stream-opencl
  version: !ruby/object:Gem::Version
-   version: 0.2.10
+   version: 0.3.0
  platform: ruby
  authors:
  - Joseph Dayo
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-03-20 00:00:00.000000000 Z
+ date: 2019-03-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -114,14 +114,14 @@ dependencies:
    requirements:
    - - '='
      - !ruby/object:Gem::Version
-       version: 1.0.5
+       version: 1.0.6
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '='
      - !ruby/object:Gem::Version
-       version: 1.0.5
+       version: 1.0.6
  - !ruby/object:Gem::Dependency
    name: opencl_ruby_ffi
    requirement: !ruby/object:Gem::Requirement
@@ -169,6 +169,7 @@ files:
  - benchmark/benchmark.rb
  - benchmark_intel.txt
  - benchmark_ryzen.txt
+ - benchmark_ryzen_nvidia.txt
  - bin/console
  - bin/setup
  - lib/tensor_stream/opencl.rb
@@ -186,9 +187,12 @@ files:
  - lib/tensor_stream/opencl/kernels/apply_gradient.cl
  - lib/tensor_stream/opencl/kernels/apply_momentum.cl
  - lib/tensor_stream/opencl/kernels/apply_rms_prop.cl
+ - lib/tensor_stream/opencl/kernels/arg_axis.cl
  - lib/tensor_stream/opencl/kernels/argmax.cl
  - lib/tensor_stream/opencl/kernels/argmin.cl
  - lib/tensor_stream/opencl/kernels/asin.cl
+ - lib/tensor_stream/opencl/kernels/bias_add.cl
+ - lib/tensor_stream/opencl/kernels/bias_add_grad.cl
  - lib/tensor_stream/opencl/kernels/cast.cl
  - lib/tensor_stream/opencl/kernels/ceil.cl
  - lib/tensor_stream/opencl/kernels/concat.cl
@@ -217,6 +221,7 @@ files:
  - lib/tensor_stream/opencl/kernels/prod.cl
  - lib/tensor_stream/opencl/kernels/real_div.cl
  - lib/tensor_stream/opencl/kernels/reciprocal.cl
+ - lib/tensor_stream/opencl/kernels/reduce_axis.cl
  - lib/tensor_stream/opencl/kernels/relu6.cl
  - lib/tensor_stream/opencl/kernels/round.cl
  - lib/tensor_stream/opencl/kernels/sigmoid.cl