tensor_stream-opencl 0.2.10 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: d590302525812d813924ea639202fa41db60a6b4d46b2f4eafaf01f77910a530
- data.tar.gz: 6413c7d9e5376844fd2da090e6f8e84d23cd0b5e47f2be2a0eef5f82652d6f78
+ metadata.gz: d2b7fa8bb0dcaa041e5ec390418ffac2bb40d777086e9ebe771f9f42d79ccabc
+ data.tar.gz: e21e0cf638e177da495e9748087808c51aee15ac10212c856ee789ae8c32d458
  SHA512:
- metadata.gz: af8845e919363d7d1cb06bf6899a9644e1cc7907908bc9c43a2efb0995c7696e6215445ee60fb802250f6032740cc7169f48404e9141c226bf2a3b0b1caf018d
- data.tar.gz: eb56a21e66f2624f19bc8d19e374dfb967f219806eac538009588db16cf2f2ed89b98f2f6fa35135e28a543db0e6b389d5047da3ce0bf6a26518ab8c1e5cd75b
+ metadata.gz: 31a1cc671716dee18d6841e39f3bf43c5326a423be59e118e5acf35777aaed9b29acdfdd3425d6ce48f5f5c5f8e5692fae687a592828be0a458f337498965008
+ data.tar.gz: ecef0d771882d662996aa31cfa4eaf9e8cef720c2b05eb48f872ce9c3fbe9e07dfc99427332a6bcbf5fb8054f7f00d27fc55184462a559be834635aaad358c30
benchmark/benchmark.rb CHANGED
@@ -43,6 +43,8 @@ a_int = tf.constant([
  [4, 2, 3, 4, 0, 1, 1, 0, 0, 2, 1, 2],
  ])
 
+ large_tensor_bias = tf.constant(sess.run(tf.random_uniform([256])))
+
  b = tf.constant(sess.run(tf.random_uniform(SHAPES)))
 
  c = tf.constant(sess.run(tf.random_uniform(SHAPES)))
@@ -67,11 +69,15 @@ split = tf.split(a, 4)
  sum = tf.reduce_sum(large_tensor)
  sum_axis_1 = tf.reduce_sum(large_tensor, 1)
  min = tf.min(large_tensor, 1)
+ argmin = tf.argmin(large_tensor)
  index = large_tensor[0]
 
  conv2d = tf.nn.conv2d(sample_image, sample_filter, [1, 1, 1, 1], 'SAME')
  conv2d_grad = tf.gradients(conv2d, [sample_image, sample_filter])
 
+ bias_add = tf.nn.bias_add(large_tensor, large_tensor_bias)
+ bias_add_grad = tf.gradients(bias_add, [large_tensor_bias])
+
  puts TensorStream::Evaluator.default_evaluators
 
  sess2 = tf.session
@@ -80,6 +86,12 @@ puts `cat /proc/cpuinfo | grep "model name" | head -1`
  device = TensorStream::Evaluator::OpenclEvaluator.default_device.native_device
  puts "OpenCL device #{device.platform.to_s} #{device.name}"
  Benchmark.bmbm do |x|
+ x.report("pure ruby argmin :") { 100.times do sess.run(argmin) end }
+ x.report("opencl argmin :") { 100.times do sess2.run(argmin) end }
+ x.report("pure ruby bias_add_grad :") { 100.times do sess.run(bias_add_grad) end }
+ x.report("opencl bias_add_grad :") { 100.times do sess2.run(bias_add_grad) end }
+ x.report("pure ruby bias_add :") { 100.times do sess.run(bias_add) end }
+ x.report("opencl bias_add :") { 100.times do sess2.run(bias_add) end }
  x.report("pure ruby conv2d_backprop :") { 100.times do sess.run(conv2d_grad) end }
  x.report("opencl conv2d_backprop :") { 100.times do sess2.run(conv2d_grad) end }
  x.report("pure ruby conv2d :") { 100.times do sess.run(conv2d) end }
benchmark_ryzen_nvidia.txt ADDED
@@ -0,0 +1,80 @@
+ TensorStream::Evaluator::OpenclEvaluator
+ TensorStream::Evaluator::RubyEvaluator
+ model name : AMD Ryzen 3 1300X Quad-Core Processor
+ OpenCL device NVIDIA CUDA GeForce GTX 1060 6GB
+ Rehearsal ------------------------------------------------------------------------
+ pure ruby argmin : 0.708414 0.007882 0.716296 ( 0.717201)
+ opencl argmin : 0.204186 0.222389 0.426575 ( 0.447862)
+ pure ruby bias_add_grad : 2.048097 0.005187 2.053284 ( 2.057617)
+ opencl bias_add_grad : 0.012482 0.000426 0.012908 ( 0.013225)
+ pure ruby bias_add : 2.406516 0.000087 2.406603 ( 2.406307)
+ opencl bias_add : 0.136466 0.008025 0.144491 ( 0.134989)
+ pure ruby conv2d_backprop : 3.685220 0.000155 3.685375 ( 3.685049)
+ opencl conv2d_backprop : 0.028940 0.008031 0.036971 ( 0.029904)
+ pure ruby conv2d : 0.788991 0.000041 0.789032 ( 0.788881)
+ opencl conv2d : 0.020150 0.000138 0.020288 ( 0.016917)
+ pure ruby arr index : 0.003036 0.000000 0.003036 ( 0.003044)
+ opencl arr index : 0.009626 0.000023 0.009649 ( 0.006703)
+ pure ruby min : 3.767836 0.007871 3.775707 ( 3.773523)
+ opencl min : 0.141541 0.008039 0.149580 ( 0.139246)
+ pure ruby sum : 3.219801 0.000076 3.219877 ( 3.218388)
+ opencl sum : 0.007480 0.004074 0.011554 ( 0.008261)
+ pure ruby sum axis 1 : 3.203423 0.000000 3.203423 ( 3.201832)
+ opencl sum axis 1 : 0.011710 0.000033 0.011743 ( 0.008379)
+ pure ruby split : 0.016504 0.000008 0.016512 ( 0.016529)
+ opencl split : 0.041059 0.012026 0.053085 ( 0.043289)
+ pure ruby add_n : 0.141810 0.000000 0.141810 ( 0.141721)
+ opencl add_n : 0.013751 0.000000 0.013751 ( 0.012208)
+ pure ruby ooo matmul : 1.395286 0.000000 1.395286 ( 1.394697)
+ opencl ooo matmul : 0.013448 0.000000 0.013448 ( 0.009873)
+ pure ruby softmax : 0.025362 0.000018 0.025380 ( 0.025382)
+ opencl softmax : 0.014999 0.000051 0.015050 ( 0.011977)
+ pure ruby matmul : 0.666863 0.000000 0.666863 ( 0.666499)
+ opencl matmul : 0.008572 0.003920 0.012492 ( 0.009246)
+ pure ruby : 2.429792 0.000005 2.429797 ( 2.428788)
+ opencl : 0.140862 0.004014 0.144876 ( 0.137264)
+ pure ruby single function: 0.340247 0.000000 0.340247 ( 0.340184)
+ opencl singlefunction: 0.084871 0.007956 0.092827 ( 0.087899)
+ pure ruby pow float: 0.083372 0.000000 0.083372 ( 0.083339)
+ opencl pow float: 0.013498 0.000014 0.013512 ( 0.010353)
+ pure ruby pow int: 0.018739 0.000000 0.018739 ( 0.018753)
+ opencl pow int: 0.007737 0.004041 0.011778 ( 0.008817)
+ -------------------------------------------------------------- total: 26.165217sec
+
+ user system total real
+ pure ruby argmin : 0.677097 0.000009 0.677106 ( 0.676828)
+ opencl argmin : 0.005919 0.003950 0.009869 ( 0.006618)
+ pure ruby bias_add_grad : 2.027326 0.000000 2.027326 ( 2.026399)
+ opencl bias_add_grad : 0.011544 0.000050 0.011594 ( 0.008380)
+ pure ruby bias_add : 2.378283 0.000000 2.378283 ( 2.377411)
+ opencl bias_add : 0.130993 0.011994 0.142987 ( 0.132772)
+ pure ruby conv2d_backprop : 3.738167 0.000000 3.738167 ( 3.737946)
+ opencl conv2d_backprop : 0.031267 0.003958 0.035225 ( 0.030381)
+ pure ruby conv2d : 0.794182 0.000000 0.794182 ( 0.794100)
+ opencl conv2d : 0.015865 0.004020 0.019885 ( 0.016878)
+ pure ruby arr index : 0.003112 0.000000 0.003112 ( 0.003109)
+ opencl arr index : 0.012100 0.000000 0.012100 ( 0.009728)
+ pure ruby min : 3.669509 0.003944 3.673453 ( 3.671906)
+ opencl min : 0.137071 0.004055 0.141126 ( 0.131802)
+ pure ruby sum : 3.210619 0.000000 3.210619 ( 3.210064)
+ opencl sum : 0.002431 0.008030 0.010461 ( 0.007522)
+ pure ruby sum axis 1 : 3.208789 0.000000 3.208789 ( 3.208125)
+ opencl sum axis 1 : 0.006075 0.003963 0.010038 ( 0.007679)
+ pure ruby split : 0.013985 0.000000 0.013985 ( 0.013990)
+ opencl split : 0.029464 0.011999 0.041463 ( 0.030797)
+ pure ruby add_n : 0.140984 0.000003 0.140987 ( 0.140959)
+ opencl add_n : 0.003146 0.007934 0.011080 ( 0.007778)
+ pure ruby ooo matmul : 1.416585 0.000000 1.416585 ( 1.416290)
+ opencl ooo matmul : 0.011156 0.000000 0.011156 ( 0.008723)
+ pure ruby softmax : 0.024724 0.000000 0.024724 ( 0.024731)
+ opencl softmax : 0.006237 0.003945 0.010182 ( 0.009005)
+ pure ruby matmul : 0.679538 0.000000 0.679538 ( 0.680048)
+ opencl matmul : 0.003456 0.007965 0.011421 ( 0.008568)
+ pure ruby : 2.437790 0.004031 2.441821 ( 2.443126)
+ opencl : 0.133039 0.003996 0.137035 ( 0.130579)
+ pure ruby single function: 0.332269 0.004003 0.336272 ( 0.336201)
+ opencl singlefunction: 0.078607 0.004009 0.082616 ( 0.078640)
+ pure ruby pow float: 0.081409 0.000000 0.081409 ( 0.081364)
+ opencl pow float: 0.011501 0.000000 0.011501 ( 0.008471)
+ pure ruby pow int: 0.016687 0.000000 0.016687 ( 0.016711)
+ opencl pow int: 0.007061 0.003950 0.011011 ( 0.007819)
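Note on reading these numbers: Benchmark.bmbm runs every entry twice, once under "Rehearsal" (a warm-up pass, so allocation and GC noise lands there) and once in the measured pass below it; the columns are user CPU time, system CPU time, their total, and wall-clock time in parentheses. A schematic of the harness shape (sess2 and sum stand for the variables in benchmark.rb above):

    require 'benchmark'

    Benchmark.bmbm do |x|
      # each report body runs twice: rehearsal first, then the measured pass
      x.report("opencl sum :") { 100.times { sess2.run(sum) } }
    end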
lib/tensor_stream/opencl/kernels/arg_axis.cl ADDED
@@ -0,0 +1,42 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % out_c_dtype = dtype_to_c_type(out_dtype)
+ % o_multipliers = o_shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % i_multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % out_ops = o_multipliers.map.with_index { |m, index| "id_#{index} * #{m}"}.join(' + ')
+ % axis = axis[0]
+ % in_axis_multipliers = i_multipliers.select.with_index { |m, index| axis == index }
+ % in_axis_ops = in_axis_multipliers.map.with_index { |m| "i * #{m}"}.join(' + ')
+ % in_output_multipliers = i_multipliers.reject.with_index { |m, index| axis == index }
+ % in_output_ops = in_output_multipliers.map.with_index { |m, index| "id_#{index} * #{m}"}.join(' + ')
+ __kernel void arg_axis_<%= dtype %>(__global const <%= c_dtype %> *value, __global <%= out_c_dtype %> *output) {
+ // Get the index of the current element to be processed
+ <% o_multipliers.size.times.each_with_index do |s, index| %>
+ const int id_<%= index %> = get_global_id(<%= index %>);
+ <% end %>
+
+ <%= c_dtype %> min_or_max_value = <%= f == :argmax ? min_value_for(dtype) : max_value_for(dtype) %>;
+ int min_or_max_index = 0;
+
+ for (int i = 0; i < <%= shape[axis] %>; i++) {
+
+ int index = <%= in_axis_ops %>;
+
+ <% unless in_output_ops.empty? %>
+ index += <%= in_output_ops %>;
+ <% end %>
+ <%= case(f)
+ when :argmax
+ "if (value[index] > min_or_max_value) {"
+ when :argmin
+ "if (value[index] < min_or_max_value) {"
+ else
+ raise "unknown reduction func #{f}"
+ end
+ %>
+ min_or_max_index = i;
+ min_or_max_value = value[index];
+ }
+ }
+
+ output[<%= out_ops %>] = (<%= out_c_dtype %>)min_or_max_index;
+ }
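Note: the o_multipliers/i_multipliers lines compute row-major strides so that get_global_id values can be folded into a flat buffer index. The same inject in plain Ruby, with a hypothetical shape:

    shape = [2, 3, 4]
    strides = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
    # => [12, 4, 1]  (element [i, j, k] lives at i*12 + j*4 + k*1)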
lib/tensor_stream/opencl/kernels/argmax.cl CHANGED
@@ -1,8 +1,14 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void argmax_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ % out_c_dtype = dtype_to_c_type(out_dtype)
+ __kernel void argmax_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+ <%= out_c_dtype %> max_index = 0;
+
+ for(int i = 0; i < <%= n %>; i++) {
+ if (A[i] > max) {
+ max = A[i];
+ max_index = i;
+ }
+ }
+ C[0] = max_index;
  }
lib/tensor_stream/opencl/kernels/argmin.cl CHANGED
@@ -1,8 +1,14 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void argmin_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
- // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] + B[globalRow * N + globalCol];
+ % out_c_dtype = dtype_to_c_type(out_dtype)
+ __kernel void argmin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ <%= c_dtype %> min = <%= max_value_for(dtype) %>;
+ <%= out_c_dtype %> min_index = 0;
+
+ for(int i = 0; i < <%= n %>; i++) {
+ if (A[i] < min) {
+ min = A[i];
+ min_index = i;
+ }
+ }
+ C[0] = min_index;
  }
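Note: both rewritten kernels (the old bodies were placeholder add kernels) expand to a single-work-item linear scan over the flattened input. A Ruby rendering of the same logic, illustrative only; Float::MAX stands in for max_value_for(dtype):

    def argmin_scan(values)
      min_index = 0
      min = Float::MAX
      values.each_with_index do |v, i|
        if v < min
          min = v
          min_index = i
        end
      end
      min_index
    end

    argmin_scan([3.0, -1.0, 2.0])   # => 1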
lib/tensor_stream/opencl/kernels/bias_add.cl ADDED
@@ -0,0 +1,9 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void bias_add_<%= dtype %>(__global const <%= c_dtype %> *value, __constant const <%= c_dtype %> *bias, __global <%= c_dtype %> *output) {
+ const int id = get_global_id(0);
+
+ for(int i = 0; i < <%= n %>; i++) {
+ output[ <%= n %> * id + i] = value[ <%= n %> * id + i] + bias[i];
+ }
+ }
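Note: one work item is launched per row, and each adds the length-n bias vector to its n-element slice. In Ruby terms (row_count and n are illustrative):

    # value is a row_count x n tensor flattened row-major; bias has length n
    row_count.times do |id|              # id plays the role of get_global_id(0)
      n.times do |i|
        output[n * id + i] = value[n * id + i] + bias[i]
      end
    end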
lib/tensor_stream/opencl/kernels/bias_add_grad.cl ADDED
@@ -0,0 +1,10 @@
+ % c_dtype = dtype_to_c_type(dtype)
+
+ __kernel void bias_add_grad_<%= dtype %>(__global const <%= c_dtype %> *received_grad, __global <%= c_dtype %> *output) {
+ const int id = get_global_id(0);
+ <%= c_dtype %> sum = 0;
+ for(int i = 0; i < <%= rows %>; i++) {
+ sum += received_grad[<%= n %> * i + id];
+ }
+ output[id] = sum;
+ }
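Note: the gradient kernel is the transpose of the forward pass: one work item per bias element, summing that element's column across all rows of the incoming gradient. A Ruby sketch (rows and n illustrative):

    # received_grad is rows x n flattened row-major; output has length n
    n.times do |id|                      # one work item per bias element
      output[id] = (0...rows).sum { |i| received_grad[n * i + id] }
    end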
lib/tensor_stream/opencl/kernels/reduce_axis.cl ADDED
@@ -0,0 +1,42 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % o_multipliers = o_shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % i_multipliers = shape.dup.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
+ % out_ops = o_multipliers.map.with_index { |m, index| "id_#{index} * #{m}" }.join(' + ')
+ % in_axis_multipliers = i_multipliers.select.with_index { |m, index| axis.include?(index) }
+ % in_axis_ops = in_axis_multipliers.map.with_index { |m, index| "i_#{index} * #{m}"}.join(' + ')
+ % in_output_multipliers = i_multipliers.reject.with_index { |m, index| axis.include?(index) }
+ % in_output_ops = in_output_multipliers.map.with_index { |m, index| "id_#{index} * #{m}"}.join(' + ')
+ __kernel void reduce_axis_<%= dtype %>(__global const <%= c_dtype %> *value, __global <%= c_dtype %> *output) {
+ // Get the index of the current element to be processed
+ <% o_multipliers.size.times.each_with_index do |s, index| %>
+ const int id_<%= index %> = get_global_id(<%= index %>);
+ <% end %>
+
+ <%= c_dtype %> sum = <%= f == :prod ? 1 : 0 %>;
+ <%= c_dtype %> item_size = 0;
+ <% axis.each_with_index do |axis, index| %>
+ for (int i_<%= index %> = 0; i_<%= index %> < <%= shape[axis] %>; i_<%= index %>++) {
+ <% end %>
+ int index = <%= in_axis_ops %>;
+ item_size += 1;
+ <% unless in_output_ops.empty? %>
+ index += <%= in_output_ops %>;
+ <% end %>
+ <%= case(f)
+ when :sum, :mean
+ "sum += value[index];"
+ when :prod
+ "sum *= value[index];"
+ else
+ raise "unknown reduction func #{f}"
+ end
+ %>
+ <% axis.each do |axis| %>
+ }
+ <% end %>
+ <% if f == :mean %>
+ output[<%= out_ops %>] = sum / item_size;
+ <% else %>
+ output[<%= out_ops %>] = sum;
+ <% end %>
+ }
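Note: for a concrete feel of what this template generates, here is a Ruby rendering of the :mean case over a hypothetical shape [4, 3] reduced along axis [1] (so i_multipliers == [3, 1], in_axis_ops == "i_0 * 1", in_output_ops == "id_0 * 3", and the output shape is [4]):

    4.times do |id_0|                     # one work item per output element
      sum = 0.0
      item_size = 0
      3.times do |i_0|
        sum += value[i_0 * 1 + id_0 * 3]  # in_axis_ops + in_output_ops
        item_size += 1
      end
      output[id_0 * 1] = sum / item_size  # :mean divides by the item count
    end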
@@ -90,6 +90,36 @@ module TensorStream
  output_buffer
  end
 
+ register_op :bias_add do |context, tensor, inputs|
+ value, bias = inputs
+ output_buffer = _create_result_buffer(value.data_type, value.shape, tensor.name)
+ result_shape = value.shape.dup
+ bias_length = result_shape.pop
+ work_group = [result_shape.reduce(:*)]
+ event_wait_list = build_event_wait_list([value, bias])
+ dtype = tensor.data_type
+ output_buffer.op = _cl_program('bias_add', n: bias_length, dtype: dtype)
+ .send(:"bias_add_#{dtype}", _opencl_queue, work_group, value.cl_buffer,
+ bias.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer
+ end
+
+ register_op :bias_add_grad do |context, tensor, inputs|
+ received_grad = inputs[0]
+ bias_size = received_grad.shape.last
+ output_buffer = _create_result_buffer(received_grad.data_type, [bias_size], tensor.name)
+ work_group = [bias_size]
+
+ received_grad_shape = received_grad.shape.dup
+ received_grad_shape.pop
+ item_rows = received_grad_shape.reduce(:*)
+ dtype = tensor.data_type
+ output_buffer.op = _cl_program('bias_add_grad', n: bias_size, rows: item_rows, dtype: dtype)
+ .send(:"bias_add_grad_#{dtype}", _opencl_queue, work_group, received_grad.cl_buffer,
+ output_buffer.cl_buffer, event_wait_list: build_event_wait_list([received_grad]))
+ output_buffer
+ end
+
  %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil log].each do |op|
  register_op op, noop: true do |context, tensor, inputs|
  execute_func(op.to_s, tensor, inputs[0], context)
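Note: a quick check of the shape bookkeeping in :bias_add (shapes hypothetical): the last axis is the bias axis, and every axis before it is flattened into the work group:

    result_shape = [32, 8, 12]                # value.shape.dup
    bias_length  = result_shape.pop           # => 12; bias must have this length
    work_group   = [result_shape.reduce(:*)]  # => [256], one work item per row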
@@ -110,25 +140,18 @@ module TensorStream
  end
  end
 
- # register_op :argmin, buffer: true do |_context, tensor, inputs|
- # axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
- # rank = inputs[0].shape.size
- # raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
-
- # arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
- # op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
- # convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
- # end
+ %i[argmin argmax].each do |op|
+ register_op op do |context, tensor, inputs|
+ value, axis = inputs
+ rank = value.shape.size
+ axis = 0 if axis.nil?
 
- # register_op :argmax, buffer: true do |_context, tensor, inputs|
- # axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
- # rank = inputs[0].shape.size
- # raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+ axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+ raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
 
- # arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
- # op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
- # convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
- # end
+ reduce_multi_axis(context, tensor, value, axis, 'arg', op.to_sym)
+ end
+ end
 
  def reduction(child_context, tensor, value, axis, func)
  if axis.nil?
@@ -164,33 +187,34 @@ module TensorStream
  end
  end
  else
- return value if value.shape.empty?
+ reduce_multi_axis(child_context, tensor, value, axis, 'reduce', func)
+ end
+ end
 
- axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
- input = complete_eval(value, child_context)
+ def reduce_multi_axis(child_context, tensor, value, axis, prog, func)
+ return value if value.shape.empty?
 
- value = value.buffer.reshape(*value.shape.reverse)
- rank = input.shape.size - 1
+ rank = value.shape.size
 
- if axis.is_a?(Array)
- axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
- value = value.send(func, x.to_i)
- end
- else
- value = value.send(func, rank - axis.abs)
- end
+ axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+ axis = [axis] unless axis.is_a?(Array)
+ return value if axis.empty?
+ # remap negative values
+ axis.map! { |axis| axis < 0 ? rank - axis.abs : axis }
 
- new_shape = if value.is_a?(NArray)
- value.shape.reverse
- else
- value = [value]
- []
- end
+ new_shape = value.shape.collect.with_index { |v, index| axis.include?(index) ? nil : v }.compact
 
- new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
+ buffer_shape = tensor.options[:keepdims] ? _reduced_shape(value.shape.dup, axis) : new_shape
+ output_buffer = _create_result_buffer(tensor.options[:output_type] || tensor.data_type, buffer_shape, tensor.name)
 
- convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
- end
+ work_group = new_shape.empty? ? [1] : new_shape
+ dtype = value.data_type
+
+ output_buffer.op = _cl_program("#{prog}_axis", f: func, axis: axis, shape: value.shape, o_shape: new_shape, dtype: dtype, out_dtype: tensor.options[:output_type])
+ .send("#{prog}_axis_#{dtype}", _opencl_queue, work_group, value.cl_buffer,
+ output_buffer.cl_buffer, event_wait_list: build_event_wait_list([value]))
+
+ output_buffer
  end
  end
  end
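Note: the negative-axis remap in reduce_multi_axis mirrors Ruby/NumPy-style indexing, and the kept dimensions become the output shape. A small check (values hypothetical):

    rank = 3
    axis = [-1, 0]
    axis.map! { |a| a < 0 ? rank - a.abs : a }   # => [2, 0]

    shape = [4, 3, 2]
    shape.collect.with_index { |v, i| axis.include?(i) ? nil : v }.compact  # => [3]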
lib/tensor_stream/opencl/opencl_buffer.rb CHANGED
@@ -23,6 +23,10 @@ module TensorStream
  @shape == [0]
  end
 
+ def inspect
+ "CLBuffer(shape: #{shape || "?"} data_type: #{data_type}, cl_allocated: #{cl_buffer ? cl_buffer.size : 'unallocated'}) -> raw: #{buffer.to_a}"
+ end
+
  def to_ruby
  return [] if buffer.empty?
 
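Note: with the new inspect, calling p on a buffer prints something along these lines (values illustrative; cl_allocated shows the size reported by the CL buffer, in bytes):

    CLBuffer(shape: [2, 2] data_type: float32, cl_allocated: 16) -> raw: [1.0, 2.0, 3.0, 4.0]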
lib/tensor_stream/opencl/opencl_template_helper.rb CHANGED
@@ -76,6 +76,27 @@ class OpenclTemplateHelper
  end
  end
 
+ def max_value_for(dtype)
+ case dtype.to_s
+ when 'float64'
+ 'DBL_MAX'
+ when 'float32', 'float', 'float16'
+ 'FLT_MAX'
+ when 'int32', 'int'
+ 'INT_MAX'
+ when 'uint32', 'uint16'
+ 'UINT_MAX'
+ when 'int16'
+ 'SHRT_MAX'
+ when 'int8'
+ '127'
+ when 'boolean'
+ '1'
+ else
+ raise "unknown dtype #{dtype}"
+ end
+ end
+
  def operator_to_c(op)
  case op
  when 'less'
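Note: these values are emitted as C literals/macros into the kernels above; argmin seeds its running minimum with max_value_for(dtype), argmax its running maximum with min_value_for(dtype). For example (helper stands for an OpenclTemplateHelper instance):

    helper.max_value_for('float32')  # => 'FLT_MAX', rendered as: float min = FLT_MAX;
    helper.max_value_for('int32')    # => 'INT_MAX'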
lib/tensor_stream/opencl/version.rb CHANGED
@@ -1,5 +1,5 @@
  module TensorStream
  module Opencl
- VERSION = "0.2.10"
+ VERSION = "0.3.0"
  end
  end
tensor_stream-opencl.gemspec CHANGED
@@ -39,7 +39,7 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "awesome_print"
  spec.add_development_dependency "mnist-learn"
  spec.add_development_dependency "simplecov"
- spec.add_dependency "tensor_stream", "1.0.5"
+ spec.add_dependency "tensor_stream", "1.0.6"
  spec.add_dependency "opencl_ruby_ffi"
  spec.add_dependency "oily_png"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tensor_stream-opencl
  version: !ruby/object:Gem::Version
- version: 0.2.10
+ version: 0.3.0
  platform: ruby
  authors:
  - Joseph Dayo
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-03-20 00:00:00.000000000 Z
+ date: 2019-03-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -114,14 +114,14 @@ dependencies:
  requirements:
  - - '='
  - !ruby/object:Gem::Version
- version: 1.0.5
+ version: 1.0.6
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - '='
  - !ruby/object:Gem::Version
- version: 1.0.5
+ version: 1.0.6
  - !ruby/object:Gem::Dependency
  name: opencl_ruby_ffi
  requirement: !ruby/object:Gem::Requirement
@@ -169,6 +169,7 @@ files:
  - benchmark/benchmark.rb
  - benchmark_intel.txt
  - benchmark_ryzen.txt
+ - benchmark_ryzen_nvidia.txt
  - bin/console
  - bin/setup
  - lib/tensor_stream/opencl.rb
@@ -186,9 +187,12 @@ files:
  - lib/tensor_stream/opencl/kernels/apply_gradient.cl
  - lib/tensor_stream/opencl/kernels/apply_momentum.cl
  - lib/tensor_stream/opencl/kernels/apply_rms_prop.cl
+ - lib/tensor_stream/opencl/kernels/arg_axis.cl
  - lib/tensor_stream/opencl/kernels/argmax.cl
  - lib/tensor_stream/opencl/kernels/argmin.cl
  - lib/tensor_stream/opencl/kernels/asin.cl
+ - lib/tensor_stream/opencl/kernels/bias_add.cl
+ - lib/tensor_stream/opencl/kernels/bias_add_grad.cl
  - lib/tensor_stream/opencl/kernels/cast.cl
  - lib/tensor_stream/opencl/kernels/ceil.cl
  - lib/tensor_stream/opencl/kernels/concat.cl
@@ -217,6 +221,7 @@ files:
  - lib/tensor_stream/opencl/kernels/prod.cl
  - lib/tensor_stream/opencl/kernels/real_div.cl
  - lib/tensor_stream/opencl/kernels/reciprocal.cl
+ - lib/tensor_stream/opencl/kernels/reduce_axis.cl
  - lib/tensor_stream/opencl/kernels/relu6.cl
  - lib/tensor_stream/opencl/kernels/round.cl
  - lib/tensor_stream/opencl/kernels/sigmoid.cl