tensor_stream-opencl 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/benchmark/benchmark.rb +23 -1
- data/benchmark_ryzen.txt +56 -0
- data/lib/tensor_stream/opencl/array_ops.rb +3 -3
- data/lib/tensor_stream/opencl/images_ops.rb +30 -0
- data/lib/tensor_stream/opencl/kernels/conv2d.cl +27 -0
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl +32 -0
- data/lib/tensor_stream/opencl/kernels/gemm.cl +2 -10
- data/lib/tensor_stream/opencl/kernels/max.cl +5 -13
- data/lib/tensor_stream/opencl/kernels/mean.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/min.cl +3 -11
- data/lib/tensor_stream/opencl/kernels/prod.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/relu6.cl +7 -0
- data/lib/tensor_stream/opencl/kernels/round.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sum.cl +26 -0
- data/lib/tensor_stream/opencl/math_ops.rb +86 -29
- data/lib/tensor_stream/opencl/nn_ops.rb +89 -5
- data/lib/tensor_stream/opencl/opencl_buffer.rb +6 -2
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +97 -92
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/iris.rb +2 -2
- data/samples/logistic_regression.rb +84 -0
- data/samples/mnist_data_2.1.rb +9 -4
- data/samples/mnist_data_2.2.rb +12 -7
- data/samples/mnist_data_2.3.rb +111 -0
- data/samples/rnn.rb +1 -1
- data/tensor_stream-opencl.gemspec +2 -1
- metadata +28 -4
@@ -0,0 +1,7 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void relu6_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
|
+
// Get the index of the current element to be processed
|
4
|
+
const int id = get_global_id(0);
|
5
|
+
|
6
|
+
C[id] = min((<%= c_dtype %>)max((<%= c_dtype %>) A[id], (<%= c_dtype %>)0), (<%= c_dtype %>)6);
|
7
|
+
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void round_<%= dtype %>(
|
2
|
+
__kernel void round_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0); // Row ID of C (0..M)
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = round(A[id]);
|
8
7
|
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void sum_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
|
+
// Get the index of the current element to be processed
|
4
|
+
const int id = get_global_id(0);
|
5
|
+
int offset = (id + <%= index %>) * <%= w %>;
|
6
|
+
<%= c_dtype %> sum = 0;
|
7
|
+
<% if n > 4 %>
|
8
|
+
for(int i = 0; i < <%= n/4 %> ; i++) {
|
9
|
+
<% sums = 4.times.map do |i|
|
10
|
+
"A[offset + #{i}]"
|
11
|
+
end %>
|
12
|
+
sum += <%= sums.join(' + ') %>;
|
13
|
+
offset += 4;
|
14
|
+
}
|
15
|
+
<% if n%4!=0 %>
|
16
|
+
<% (n % 4).times do |i| %>
|
17
|
+
sum += A[offset + <%= i %>];
|
18
|
+
<% end %>
|
19
|
+
<% end %>
|
20
|
+
<% else %>
|
21
|
+
<% n.times do |i| %>
|
22
|
+
sum += A[offset + <%= i %>];
|
23
|
+
<% end %>
|
24
|
+
<% end %>
|
25
|
+
C[id] = sum;
|
26
|
+
}
|
@@ -5,8 +5,8 @@ module TensorStream
|
|
5
5
|
def MathOps.included(klass)
|
6
6
|
klass.class_eval do
|
7
7
|
%i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
|
8
|
-
register_op op
|
9
|
-
execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1]
|
8
|
+
register_op op do |context, tensor, inputs|
|
9
|
+
execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1])
|
10
10
|
end
|
11
11
|
end
|
12
12
|
|
@@ -38,11 +38,11 @@ module TensorStream
|
|
38
38
|
end
|
39
39
|
end
|
40
40
|
|
41
|
-
register_op :floor_div
|
41
|
+
register_op :floor_div do |context, tensor, inputs|
|
42
42
|
if fp_type?(tensor.data_type)
|
43
|
-
execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1]
|
43
|
+
execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1])
|
44
44
|
else
|
45
|
-
execute_2_operand_func('div', tensor, inputs[0], inputs[1]
|
45
|
+
execute_2_operand_func('div', tensor, inputs[0], inputs[1])
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
@@ -78,11 +78,8 @@ module TensorStream
|
|
78
78
|
cl_n = OpenCL::Int1.new(n)
|
79
79
|
cl_k = OpenCL::Int1.new(k)
|
80
80
|
|
81
|
-
transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
|
82
|
-
transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
|
83
81
|
event_wait_list = build_event_wait_list([a, b])
|
84
|
-
|
85
|
-
output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
82
|
+
output_buffer.op = _cl_program('gemm', ta: !!tensor.options[:transpose_a], tb: !!tensor.options[:transpose_b], dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
86
83
|
|
87
84
|
output_buffer
|
88
85
|
end
|
@@ -94,39 +91,99 @@ module TensorStream
|
|
94
91
|
end
|
95
92
|
|
96
93
|
%i[sum mean].each do |op|
|
97
|
-
register_op op
|
94
|
+
register_op op do |context, tensor, inputs|
|
98
95
|
reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
|
99
96
|
end
|
100
97
|
end
|
101
98
|
|
102
|
-
register_op :prod
|
103
|
-
|
104
|
-
|
105
|
-
if input_a.buffer.empty?
|
99
|
+
register_op :prod do |context, tensor, inputs|
|
100
|
+
if inputs[0].shape == [0]
|
106
101
|
convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
|
107
102
|
else
|
108
103
|
reduction(context, tensor, inputs[0], inputs[1], :prod)
|
109
104
|
end
|
110
105
|
end
|
111
106
|
|
112
|
-
register_op :argmin, buffer: true do |_context, tensor, inputs|
|
113
|
-
|
114
|
-
|
115
|
-
|
107
|
+
# register_op :argmin, buffer: true do |_context, tensor, inputs|
|
108
|
+
# axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
|
109
|
+
# rank = inputs[0].shape.size
|
110
|
+
# raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
|
111
|
+
|
112
|
+
# arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
|
113
|
+
# op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
|
114
|
+
# convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
|
115
|
+
# end
|
116
|
+
|
117
|
+
# register_op :argmax, buffer: true do |_context, tensor, inputs|
|
118
|
+
# axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
|
119
|
+
# rank = inputs[0].shape.size
|
120
|
+
# raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
|
121
|
+
|
122
|
+
# arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
|
123
|
+
# op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
|
124
|
+
# convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
|
125
|
+
# end
|
126
|
+
|
127
|
+
def reduction(child_context, tensor, value, axis, func)
|
128
|
+
if axis.nil?
|
129
|
+
value = _run(value, child_context)
|
130
|
+
size = value.shape.reduce(:*) || 1
|
131
|
+
if value.shape.empty? # for scalars, just return as is
|
132
|
+
value
|
133
|
+
else
|
134
|
+
reduction_threads = 32
|
135
|
+
items_per_thread_threshold = 4
|
136
|
+
|
137
|
+
output_buffer = _create_result_buffer(value.data_type, [], tensor.name)
|
138
|
+
event_wait_list = build_event_wait_list([value])
|
139
|
+
|
140
|
+
if (size > reduction_threads) && ((size / reduction_threads) > items_per_thread_threshold)
|
141
|
+
items_per_thread = size / reduction_threads
|
142
|
+
extra_items = size % reduction_threads
|
143
|
+
intermediate_output_buffer = _create_result_buffer(value.data_type, [reduction_threads], tensor.name)
|
144
|
+
|
145
|
+
temp_values = if extra_items.zero?
|
146
|
+
_cl_program(func, dtype: value.data_type, index: 0, n: items_per_thread, w: items_per_thread).
|
147
|
+
send(:"#{func}_#{value.data_type}", _opencl_queue, [reduction_threads], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
148
|
+
else
|
149
|
+
[_cl_program(func, dtype: value.data_type, index: 0, n: items_per_thread, w: items_per_thread).
|
150
|
+
send(:"#{func}_#{value.data_type}", _opencl_queue, [reduction_threads - 1], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list),
|
151
|
+
_cl_program(func, dtype: value.data_type, index: reduction_threads - 1, n: items_per_thread + extra_items, w: items_per_thread).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list)]
|
152
|
+
end
|
153
|
+
output_buffer.op = _cl_program(func, dtype: value.data_type, n: reduction_threads, index: 0, w: 0).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, output_buffer.cl_buffer, event_wait_list: temp_values)
|
154
|
+
output_buffer
|
155
|
+
else
|
156
|
+
output_buffer.op = _cl_program(func, dtype: value.data_type, n: size, index: 0, w: 0).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
157
|
+
output_buffer
|
158
|
+
end
|
159
|
+
end
|
160
|
+
else
|
161
|
+
return value if value.shape.empty?
|
162
|
+
|
163
|
+
axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
|
164
|
+
input = complete_eval(value, child_context)
|
165
|
+
value = value.buffer.reshape(*value.shape.reverse)
|
166
|
+
rank = input.shape.size - 1
|
167
|
+
|
168
|
+
if axis.is_a?(Array)
|
169
|
+
axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
|
170
|
+
value = value.send(func, x.to_i)
|
171
|
+
end
|
172
|
+
else
|
173
|
+
value = value.send(func, rank - axis.abs)
|
174
|
+
end
|
116
175
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
176
|
+
new_shape = if value.is_a?(NArray)
|
177
|
+
value.shape.reverse
|
178
|
+
else
|
179
|
+
value = [value]
|
180
|
+
[]
|
181
|
+
end
|
121
182
|
|
122
|
-
|
123
|
-
axis = tensor.options[:axis] || 0
|
124
|
-
rank = inputs[0].shape.size
|
125
|
-
raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
|
183
|
+
new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
|
126
184
|
|
127
|
-
|
128
|
-
|
129
|
-
convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
|
185
|
+
convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
|
186
|
+
end
|
130
187
|
end
|
131
188
|
end
|
132
189
|
end
|
@@ -121,11 +121,11 @@ module TensorStream
|
|
121
121
|
end
|
122
122
|
|
123
123
|
register_op :apply_adagrad do |context, tensor, inputs|
|
124
|
-
|
124
|
+
_target_var, _accum, lr, grad = inputs
|
125
125
|
|
126
126
|
assign = tensor.inputs[0] || tensor
|
127
127
|
assign_acc = tensor.inputs[1]
|
128
|
-
|
128
|
+
|
129
129
|
assign.buffer.dirty = true
|
130
130
|
assign_acc.buffer.dirty = true
|
131
131
|
output_buffer = assign.buffer
|
@@ -133,7 +133,7 @@ module TensorStream
|
|
133
133
|
work_group = [output_buffer.total_elements]
|
134
134
|
|
135
135
|
event_wait_list = build_event_wait_list(inputs)
|
136
|
-
event = call_program('apply_adagrad',
|
136
|
+
event = call_program('apply_adagrad',
|
137
137
|
output_buffer.data_type,
|
138
138
|
work_group,
|
139
139
|
lr.cl_buffer,
|
@@ -195,7 +195,7 @@ module TensorStream
|
|
195
195
|
event_wait_list = build_event_wait_list(inputs)
|
196
196
|
work_group = [output_buffer.total_elements]
|
197
197
|
|
198
|
-
event = call_program('apply_rms_prop', output_buffer.data_type,
|
198
|
+
event = call_program('apply_rms_prop', output_buffer.data_type,
|
199
199
|
work_group,
|
200
200
|
lr.cl_buffer,
|
201
201
|
rho.cl_buffer,
|
@@ -298,7 +298,7 @@ module TensorStream
|
|
298
298
|
end
|
299
299
|
|
300
300
|
b = wrap_opencl(labels, data_type: inputs[0].data_type, name: "#{tensor.name}_label")
|
301
|
-
|
301
|
+
|
302
302
|
event_wait_list = build_event_wait_list(inputs)
|
303
303
|
dtype = tensor.data_type
|
304
304
|
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
@@ -335,6 +335,90 @@ module TensorStream
|
|
335
335
|
output_buffer.op = event
|
336
336
|
output_buffer
|
337
337
|
end
|
338
|
+
|
339
|
+
%i[relu6].each do |op|
|
340
|
+
register_op op, noop: true do |context, tensor, inputs|
|
341
|
+
execute_func(op.to_s, tensor, inputs[0], context)
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
# Fast per pixel parallel convolution operation
|
346
|
+
register_op :conv2d do |_context, tensor, inputs|
|
347
|
+
filter = inputs[1]
|
348
|
+
batch, height, width, channel = inputs[0].shape
|
349
|
+
filter_shape = filter.shape
|
350
|
+
strides = tensor.options[:strides]
|
351
|
+
height_stride = strides[1]
|
352
|
+
width_stride = strides[2]
|
353
|
+
|
354
|
+
raise TensorStream::ValueError, " Current implementation does not yet support strides in the batch and depth dimensions." if strides[0] != 1 || strides[3] != 1
|
355
|
+
|
356
|
+
event_wait_list = build_event_wait_list(inputs)
|
357
|
+
|
358
|
+
f_height, f_width, in_channels, out_channels = filter_shape
|
359
|
+
out_shape = [batch, height / height_stride, width / width_stride, out_channels]
|
360
|
+
output_buffer = _create_result_buffer(tensor.data_type, out_shape, tensor.name)
|
361
|
+
|
362
|
+
cl_image_height = OpenCL::Int1.new(height)
|
363
|
+
cl_image_width = OpenCL::Int1.new(width)
|
364
|
+
|
365
|
+
work_dimen = [batch, height / height_stride, width / width_stride]
|
366
|
+
|
367
|
+
output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride] ).send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width, inputs[0].cl_buffer,
|
368
|
+
inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
369
|
+
output_buffer
|
370
|
+
end
|
371
|
+
|
372
|
+
register_op :conv2d_backprop_input do |context, tensor, inputs|
|
373
|
+
image_shape, filter, grad = inputs
|
374
|
+
filter_shape = filter.shape
|
375
|
+
|
376
|
+
strides = tensor.options[:strides]
|
377
|
+
height_stride = strides[1]
|
378
|
+
width_stride = strides[2]
|
379
|
+
|
380
|
+
image_shape = read_final_result(complete_eval(image_shape, context))
|
381
|
+
|
382
|
+
event_wait_list = build_event_wait_list(inputs)
|
383
|
+
output_buffer = _create_result_buffer(tensor.data_type, image_shape, tensor.name)
|
384
|
+
|
385
|
+
batch, height, width, channels = image_shape
|
386
|
+
f_height, f_width, in_channels, out_channels = filter_shape
|
387
|
+
|
388
|
+
work_dimen = [batch, height, width]
|
389
|
+
|
390
|
+
cl_image_height = OpenCL::Int1.new(height)
|
391
|
+
cl_image_width = OpenCL::Int1.new(width)
|
392
|
+
|
393
|
+
output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride] ).send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
|
394
|
+
filter.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
395
|
+
output_buffer
|
396
|
+
end
|
397
|
+
|
398
|
+
register_op :conv2d_backprop_filter do |context, tensor, inputs|
|
399
|
+
images, filter_shape, grad = inputs
|
400
|
+
|
401
|
+
event_wait_list = build_event_wait_list(inputs)
|
402
|
+
|
403
|
+
strides = tensor.options[:strides]
|
404
|
+
height_stride = strides[1]
|
405
|
+
width_stride = strides[2]
|
406
|
+
|
407
|
+
filter_shape = read_final_result(complete_eval(filter_shape, context))
|
408
|
+
output_buffer = _create_result_buffer(tensor.data_type, filter_shape, tensor.name)
|
409
|
+
|
410
|
+
batch_size, height, width, channels = images.shape
|
411
|
+
f_height, f_width, input_channels, output_channels = filter_shape
|
412
|
+
work_dimen = [f_height, f_width, output_channels]
|
413
|
+
|
414
|
+
cl_batch_size = OpenCL::Int1.new(batch_size)
|
415
|
+
cl_image_height = OpenCL::Int1.new(height)
|
416
|
+
cl_image_width = OpenCL::Int1.new(width)
|
417
|
+
|
418
|
+
output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride] ).send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
|
419
|
+
images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
420
|
+
output_buffer
|
421
|
+
end
|
338
422
|
end
|
339
423
|
end
|
340
424
|
end
|
@@ -37,9 +37,13 @@ module TensorStream
|
|
37
37
|
return buffer[0] != 0 if data_type == :boolean
|
38
38
|
return buffer[0]
|
39
39
|
end
|
40
|
-
|
41
|
-
result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
|
40
|
+
|
41
|
+
result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
|
42
42
|
data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
|
43
43
|
end
|
44
|
+
|
45
|
+
def self.nil_buffer(owner, name, data_type)
|
46
|
+
OpenCLBuffer.new(owner, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
|
47
|
+
end
|
44
48
|
end
|
45
49
|
end
|
@@ -35,7 +35,8 @@ module TensorStream
|
|
35
35
|
end
|
36
36
|
|
37
37
|
##
|
38
|
-
#
|
38
|
+
# OpenCL hardware accelerated evaluator
|
39
|
+
#
|
39
40
|
class OpenclEvaluator < BaseEvaluator
|
40
41
|
attr_accessor :retain
|
41
42
|
attr_reader :opencl_device, :opencl_context
|
@@ -53,41 +54,57 @@ module TensorStream
|
|
53
54
|
super
|
54
55
|
_create_opencl_context
|
55
56
|
@opencl_device = device.native_device
|
57
|
+
|
58
|
+
@max_work_item_dimensions = @opencl_device.max_work_item_dimensions
|
59
|
+
@max_work_item_sizes = @opencl_device.max_work_item_sizes
|
60
|
+
@max_work_group_size = @opencl_device.max_work_group_size
|
61
|
+
|
62
|
+
@local_mem_size = @opencl_device.local_mem_size
|
63
|
+
@device_type = @opencl_device.type.to_s.downcase
|
64
|
+
|
56
65
|
create_command_queue
|
57
66
|
end
|
58
67
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
68
|
+
class << self
|
69
|
+
def query_supported_devices
|
70
|
+
devices = query_devices_with_score
|
71
|
+
devices.sort_by { |a| a[1] }.map do |d|
|
72
|
+
opencl_to_device(d)
|
73
|
+
end
|
63
74
|
end
|
64
|
-
end
|
65
75
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
76
|
+
def fetch_device(query = [])
|
77
|
+
devices = query_devices_with_score
|
78
|
+
platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
|
79
|
+
opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
|
80
|
+
end
|
71
81
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
82
|
+
def opencl_to_device(dev)
|
83
|
+
device = dev[0]
|
84
|
+
index = dev[3]
|
85
|
+
platform_name = device.platform.name.tr(' ', '_').downcase
|
86
|
+
uri = [platform_name, index].join(':')
|
77
87
|
|
78
|
-
|
88
|
+
device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
|
79
89
|
|
80
|
-
|
81
|
-
|
90
|
+
OpenclDevice.new(uri, device_type, self).tap do |d|
|
91
|
+
d.native_device = device
|
92
|
+
end
|
82
93
|
end
|
83
|
-
end
|
84
94
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
95
|
+
##
|
96
|
+
# Select the best device available in the system for this evaluator
|
97
|
+
def default_device
|
98
|
+
devices = OpenclEvaluator.query_devices_with_score
|
99
|
+
device = devices.max { |a, b| a[1] <=> b[1] }
|
100
|
+
opencl_to_device(device)
|
101
|
+
end
|
102
|
+
|
103
|
+
def getset_global_opencl_context(platform)
|
104
|
+
@global_opencl_context ||= {}
|
105
|
+
@global_opencl_context[platform] ||= yield
|
106
|
+
@global_opencl_context[platform]
|
107
|
+
end
|
91
108
|
end
|
92
109
|
|
93
110
|
# opencl evaluator main entrypoint
|
@@ -228,16 +245,22 @@ module TensorStream
|
|
228
245
|
|
229
246
|
def _create_opencl_context(device = nil)
|
230
247
|
if device.nil?
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
248
|
+
all_devices_by_platform = {}
|
249
|
+
TensorStream::Evaluator::OpenclEvaluator.query_supported_devices.map(&:native_device).each do |d|
|
250
|
+
all_devices_by_platform[d.platform.name] ||= []
|
251
|
+
all_devices_by_platform[d.platform.name] << d
|
235
252
|
end
|
236
253
|
|
237
|
-
|
254
|
+
all_devices_by_platform.each do |platform, devices|
|
255
|
+
@opencl_context = TensorStream::Evaluator::OpenclEvaluator.getset_global_opencl_context(platform) do
|
256
|
+
OpenCL.create_context(devices)
|
257
|
+
end
|
258
|
+
end
|
238
259
|
else
|
239
260
|
puts "context created for #{device.native_device}"
|
240
|
-
@opencl_context =
|
261
|
+
@opencl_context = TensorStream::Evaluator::OpenclEvaluator.getset_global_opencl_context(device.native_device.platform) do
|
262
|
+
OpenCL.create_context(device.native_device)
|
263
|
+
end
|
241
264
|
end
|
242
265
|
end
|
243
266
|
|
@@ -269,11 +292,12 @@ module TensorStream
|
|
269
292
|
@context[:_cache][kernel_cache_key] ||=
|
270
293
|
begin
|
271
294
|
# puts "building #{kernel_cache_key}"
|
272
|
-
file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
|
295
|
+
file_path = File.join(ENV['TS_OPENCL_FILE_CACHE_PATH'] || '/tmp', "#{kernel}.#{suffix}.cl")
|
273
296
|
source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
|
274
297
|
File.read(file_path)
|
275
298
|
else
|
276
|
-
|
299
|
+
filenames = ['', ".#{@device_type}"].map { |type| %w[cl.erb cl].map { |ext| cl_template_path("#{kernel}#{type}", ext) } }.flatten
|
300
|
+
filename = filenames.find { |n| File.exist?(n) }
|
277
301
|
raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
|
278
302
|
|
279
303
|
source = File.read(filename)
|
@@ -322,6 +346,7 @@ module TensorStream
|
|
322
346
|
|
323
347
|
def eval_variable(tensor, _child_context)
|
324
348
|
raise "variable #{tensor.name} not initalized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
|
349
|
+
|
325
350
|
tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
|
326
351
|
tensor.buffer
|
327
352
|
end
|
@@ -339,7 +364,7 @@ module TensorStream
|
|
339
364
|
end
|
340
365
|
end
|
341
366
|
|
342
|
-
register_op :identity do |
|
367
|
+
register_op :identity do |_context, tensor, inputs|
|
343
368
|
value = inputs[0]
|
344
369
|
buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
|
345
370
|
buffer.op = build_event_wait_list(inputs)
|
@@ -351,25 +376,26 @@ module TensorStream
|
|
351
376
|
end
|
352
377
|
|
353
378
|
register_op :assign_add do |context, tensor, inputs|
|
354
|
-
value = execute_2_operand_func('add', tensor, inputs[0], inputs[1]
|
379
|
+
value = execute_2_operand_func('add', tensor, inputs[0], inputs[1])
|
355
380
|
assign_var(tensor, value, context)
|
356
381
|
end
|
357
382
|
|
358
383
|
register_op :assign_sub do |context, tensor, inputs|
|
359
|
-
value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1]
|
384
|
+
value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1])
|
360
385
|
assign_var(tensor, value, context)
|
361
386
|
end
|
362
387
|
|
363
|
-
register_op :variable, noop: true do |
|
388
|
+
register_op :variable, noop: true do |_context, tensor, _inputs|
|
364
389
|
variable = tensor.inputs[0]
|
365
390
|
raise "variable #{tensor.name} not initalized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
|
391
|
+
|
366
392
|
variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
|
367
393
|
variable.buffer
|
368
394
|
end
|
369
395
|
|
370
396
|
%i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
|
371
|
-
register_op op
|
372
|
-
execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1],
|
397
|
+
register_op op do |context, tensor, inputs|
|
398
|
+
execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], 'cond')
|
373
399
|
end
|
374
400
|
end
|
375
401
|
|
@@ -463,11 +489,11 @@ module TensorStream
|
|
463
489
|
rescue EvaluatorExcecutionException => e
|
464
490
|
_opencl_queue.finish # dump queue
|
465
491
|
puts e.message
|
466
|
-
raise e, "error #{e.message} while evaluating #{tensor.name} :
|
492
|
+
raise e, "error #{e.message} while evaluating #{tensor.name} : defined at #{tensor.source}"
|
467
493
|
rescue TensorStreamError => e
|
468
494
|
_opencl_queue.finish # dump queue
|
469
495
|
puts e.message
|
470
|
-
raise e, "error #{e.message} while evaluating #{tensor.name} :
|
496
|
+
raise e, "error #{e.message} while evaluating #{tensor.name} : defined at #{tensor.source}"
|
471
497
|
rescue StandardError => e
|
472
498
|
_opencl_queue.finish # dump queue
|
473
499
|
puts e.message
|
@@ -496,6 +522,7 @@ module TensorStream
|
|
496
522
|
cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
|
497
523
|
return @context[cache_key] if @context.key?(cache_key)
|
498
524
|
return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]
|
525
|
+
|
499
526
|
@context[cache_key] = if tensor.value.is_a?(Tensor)
|
500
527
|
_run(tensor.value, child_context)
|
501
528
|
else
|
@@ -512,7 +539,6 @@ module TensorStream
|
|
512
539
|
buffer = complete_eval(b, child_context)
|
513
540
|
|
514
541
|
if assign.buffer
|
515
|
-
# buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
|
516
542
|
event_wait_list = build_event_wait_list([buffer, assign.buffer])
|
517
543
|
assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
|
518
544
|
_opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: event_wait_list)
|
@@ -524,23 +550,32 @@ module TensorStream
|
|
524
550
|
assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
|
525
551
|
assign.value = value
|
526
552
|
end
|
553
|
+
|
527
554
|
assign.buffer.dirty = true
|
528
555
|
assign.buffer
|
529
556
|
end
|
530
557
|
|
531
|
-
def execute_2_operand_func(op_name, tensor,
|
532
|
-
a = _run(input_a, child_context)
|
533
|
-
b = _run(input_b, child_context)
|
558
|
+
def execute_2_operand_func(op_name, tensor, a, b, prog_name = nil)
|
534
559
|
a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
|
535
560
|
dtype = tensor.data_type
|
536
561
|
result_shape = TensorShape.infer_shape(a.shape, b.shape)
|
537
|
-
return
|
562
|
+
return OpenCLBuffer.nil_buffer(self, "out_#{tensor.name}", dtype) if result_shape == [0]
|
563
|
+
|
538
564
|
output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
|
539
565
|
a, b, prog, switch_operands = select_program(a, b, op_name)
|
540
566
|
m, n = result_shape
|
541
|
-
|
542
|
-
|
543
|
-
|
567
|
+
|
568
|
+
work_group = if result_shape.size > 2 && (b.shape.size.zero? || (a.shape == b.shape))
|
569
|
+
[m, result_shape.reduce(:*) / m]
|
570
|
+
elsif result_shape.size <= 2
|
571
|
+
[m || 1, n || 1]
|
572
|
+
else
|
573
|
+
raise "rank > 2 not supported for now"
|
574
|
+
end
|
575
|
+
|
576
|
+
cl_m = OpenCL::Int1.new(work_group[0])
|
577
|
+
cl_n = OpenCL::Int1.new(work_group[1])
|
578
|
+
|
544
579
|
cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition
|
545
580
|
|
546
581
|
event_wait_list = build_event_wait_list([a, b]) # add dependency wait list
|
@@ -610,6 +645,7 @@ module TensorStream
|
|
610
645
|
|
611
646
|
def auto_type_cast(a, b, name: nil)
|
612
647
|
return [a, b] if a.data_type == b.data_type
|
648
|
+
|
613
649
|
m, n = b.shape
|
614
650
|
work_group = [m || 1, n || 1]
|
615
651
|
event_wait_list = build_event_wait_list([b])
|
@@ -624,6 +660,7 @@ module TensorStream
|
|
624
660
|
|
625
661
|
def type_cast(source, data_type, name: nil)
|
626
662
|
return source if source.data_type == data_type
|
663
|
+
|
627
664
|
m, n = source.shape
|
628
665
|
work_group = [m || 1, n || 1]
|
629
666
|
event_wait_list = [source.op].compact
|
@@ -673,8 +710,6 @@ module TensorStream
|
|
673
710
|
|
674
711
|
return nil if buffer.nil?
|
675
712
|
|
676
|
-
|
677
|
-
|
678
713
|
cl_buffer = unless value.flatten.empty?
|
679
714
|
cl_buffer_size = 1 if cl_buffer_size.zero?
|
680
715
|
_opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
|
@@ -682,6 +717,7 @@ module TensorStream
|
|
682
717
|
|
683
718
|
@context[:_cache][cache_key] = OpenCLBuffer.new(self, name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
|
684
719
|
end
|
720
|
+
|
685
721
|
if data_type == :string
|
686
722
|
value[0].each_byte.with_index do |c, index|
|
687
723
|
cl_object.buffer[index] = c
|
@@ -704,11 +740,11 @@ module TensorStream
|
|
704
740
|
cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
|
705
741
|
end
|
706
742
|
|
707
|
-
if
|
708
|
-
|
743
|
+
# if OpenCL buffer is valid enqueue a write
|
744
|
+
if cl_object.cl_buffer && value && (!value.is_a?(Array) || !value.empty?)
|
745
|
+
cl_object.op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
|
709
746
|
end
|
710
747
|
|
711
|
-
cl_object.op = write_op
|
712
748
|
cl_object
|
713
749
|
end
|
714
750
|
|
@@ -718,7 +754,7 @@ module TensorStream
|
|
718
754
|
NArray.sfloat(narray_size)
|
719
755
|
when :float64
|
720
756
|
NArray.float(narray_size)
|
721
|
-
when :int, :int32, :int64, :uint64, :uint32 #NArray does not have 64 bit int types
|
757
|
+
when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
|
722
758
|
NArray.int(narray_size)
|
723
759
|
when :int16, :uint16
|
724
760
|
NArray.sint(narray_size)
|
@@ -736,7 +772,8 @@ module TensorStream
|
|
736
772
|
end
|
737
773
|
|
738
774
|
def _create_result_buffer(data_type, shape, name)
|
739
|
-
return OpenCLBuffer.
|
775
|
+
return OpenCLBuffer.nil_buffer(self, name, data_type) if shape == [0]
|
776
|
+
|
740
777
|
cache_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
|
741
778
|
@context[:_cache][:_cl_buffers][cache_key] ||= begin
|
742
779
|
# puts "create result buffer #{cache_key}"
|
@@ -759,7 +796,7 @@ module TensorStream
|
|
759
796
|
region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
|
760
797
|
cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
|
761
798
|
OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
|
762
|
-
else
|
799
|
+
else # source buffer already a sub buffer, OpenCL does not allow sub buffers from sub buffers
|
763
800
|
_create_result_buffer(tensor.data_type, shape, name)
|
764
801
|
end
|
765
802
|
end
|
@@ -768,7 +805,7 @@ module TensorStream
|
|
768
805
|
|
769
806
|
if buffer.cl_buffer.associated_memobject
|
770
807
|
buffer.op = parent_buffer.op
|
771
|
-
else
|
808
|
+
else # source buffer already a sub buffer, so we need to do a copy instead
|
772
809
|
region_size_in_bytes = buffer.buffer.size * buffer.buffer.element_size
|
773
810
|
start = index * region_size_in_bytes
|
774
811
|
region = [region_size_in_bytes, 1, 1]
|
@@ -841,6 +878,7 @@ module TensorStream
|
|
841
878
|
|
842
879
|
def _reduced_shape(input_shape, axes)
|
843
880
|
return [] if axes.nil? # reduce to scalar
|
881
|
+
|
844
882
|
axes = [axes] unless axes.is_a?(Array)
|
845
883
|
return input_shape if axes.empty?
|
846
884
|
|
@@ -850,39 +888,6 @@ module TensorStream
|
|
850
888
|
input_shape
|
851
889
|
end
|
852
890
|
|
853
|
-
def reduction(child_context, tensor, a, b, func)
|
854
|
-
input = complete_eval(a, child_context)
|
855
|
-
axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b
|
856
|
-
if axis.nil?
|
857
|
-
red = input.buffer.send(func)
|
858
|
-
convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
|
859
|
-
else
|
860
|
-
return input if input.shape.empty?
|
861
|
-
|
862
|
-
value = input.buffer.reshape(*input.shape.reverse)
|
863
|
-
rank = input.shape.size - 1
|
864
|
-
|
865
|
-
if axis.is_a?(Array)
|
866
|
-
axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
|
867
|
-
value = value.send(func, x.to_i)
|
868
|
-
end
|
869
|
-
else
|
870
|
-
value = value.send(func, rank - axis.abs)
|
871
|
-
end
|
872
|
-
|
873
|
-
new_shape = if value.is_a?(NArray)
|
874
|
-
value.shape.reverse
|
875
|
-
else
|
876
|
-
value = [value]
|
877
|
-
[]
|
878
|
-
end
|
879
|
-
|
880
|
-
new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
|
881
|
-
|
882
|
-
convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
|
883
|
-
end
|
884
|
-
end
|
885
|
-
|
886
891
|
# selects variants of cl programs depending on input
|
887
892
|
def select_program(input_a, input_b, op)
|
888
893
|
return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape
|