tensor_stream-opencl 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/benchmark/benchmark.rb +23 -1
- data/benchmark_ryzen.txt +56 -0
- data/lib/tensor_stream/opencl/array_ops.rb +3 -3
- data/lib/tensor_stream/opencl/images_ops.rb +30 -0
- data/lib/tensor_stream/opencl/kernels/conv2d.cl +27 -0
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_filter.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/conv2d_backprop_input.cl +32 -0
- data/lib/tensor_stream/opencl/kernels/gemm.cl +2 -10
- data/lib/tensor_stream/opencl/kernels/max.cl +5 -13
- data/lib/tensor_stream/opencl/kernels/mean.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/min.cl +3 -11
- data/lib/tensor_stream/opencl/kernels/prod.cl +26 -0
- data/lib/tensor_stream/opencl/kernels/relu6.cl +7 -0
- data/lib/tensor_stream/opencl/kernels/round.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sum.cl +26 -0
- data/lib/tensor_stream/opencl/math_ops.rb +86 -29
- data/lib/tensor_stream/opencl/nn_ops.rb +89 -5
- data/lib/tensor_stream/opencl/opencl_buffer.rb +6 -2
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +97 -92
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/iris.rb +2 -2
- data/samples/logistic_regression.rb +84 -0
- data/samples/mnist_data_2.1.rb +9 -4
- data/samples/mnist_data_2.2.rb +12 -7
- data/samples/mnist_data_2.3.rb +111 -0
- data/samples/rnn.rb +1 -1
- data/tensor_stream-opencl.gemspec +2 -1
- metadata +28 -4
@@ -0,0 +1,7 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void relu6_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
|
+
// Get the index of the current element to be processed
|
4
|
+
const int id = get_global_id(0);
|
5
|
+
|
6
|
+
C[id] = min((<%= c_dtype %>)max((<%= c_dtype %>) A[id], (<%= c_dtype %>)0), (<%= c_dtype %>)6);
|
7
|
+
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void round_<%= dtype %>(
|
2
|
+
__kernel void round_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0); // Row ID of C (0..M)
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = round(A[id]);
|
8
7
|
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void sum_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
|
+
// Get the index of the current element to be processed
|
4
|
+
const int id = get_global_id(0);
|
5
|
+
int offset = (id + <%= index %>) * <%= w %>;
|
6
|
+
<%= c_dtype %> sum = 0;
|
7
|
+
<% if n > 4 %>
|
8
|
+
for(int i = 0; i < <%= n/4 %> ; i++) {
|
9
|
+
<% sums = 4.times.map do |i|
|
10
|
+
"A[offset + #{i}]"
|
11
|
+
end %>
|
12
|
+
sum += <%= sums.join(' + ') %>;
|
13
|
+
offset += 4;
|
14
|
+
}
|
15
|
+
<% if n%4!=0 %>
|
16
|
+
<% (n % 4).times do |i| %>
|
17
|
+
sum += A[offset + <%= i %>];
|
18
|
+
<% end %>
|
19
|
+
<% end %>
|
20
|
+
<% else %>
|
21
|
+
<% n.times do |i| %>
|
22
|
+
sum += A[offset + <%= i %>];
|
23
|
+
<% end %>
|
24
|
+
<% end %>
|
25
|
+
C[id] = sum;
|
26
|
+
}
|
@@ -5,8 +5,8 @@ module TensorStream
|
|
5
5
|
def MathOps.included(klass)
|
6
6
|
klass.class_eval do
|
7
7
|
%i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
|
8
|
-
register_op op
|
9
|
-
execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1]
|
8
|
+
register_op op do |context, tensor, inputs|
|
9
|
+
execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1])
|
10
10
|
end
|
11
11
|
end
|
12
12
|
|
@@ -38,11 +38,11 @@ module TensorStream
|
|
38
38
|
end
|
39
39
|
end
|
40
40
|
|
41
|
-
register_op :floor_div
|
41
|
+
register_op :floor_div do |context, tensor, inputs|
|
42
42
|
if fp_type?(tensor.data_type)
|
43
|
-
execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1]
|
43
|
+
execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1])
|
44
44
|
else
|
45
|
-
execute_2_operand_func('div', tensor, inputs[0], inputs[1]
|
45
|
+
execute_2_operand_func('div', tensor, inputs[0], inputs[1])
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
@@ -78,11 +78,8 @@ module TensorStream
|
|
78
78
|
cl_n = OpenCL::Int1.new(n)
|
79
79
|
cl_k = OpenCL::Int1.new(k)
|
80
80
|
|
81
|
-
transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
|
82
|
-
transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
|
83
81
|
event_wait_list = build_event_wait_list([a, b])
|
84
|
-
|
85
|
-
output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
82
|
+
output_buffer.op = _cl_program('gemm', ta: !!tensor.options[:transpose_a], tb: !!tensor.options[:transpose_b], dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
86
83
|
|
87
84
|
output_buffer
|
88
85
|
end
|
@@ -94,39 +91,99 @@ module TensorStream
|
|
94
91
|
end
|
95
92
|
|
96
93
|
%i[sum mean].each do |op|
|
97
|
-
register_op op
|
94
|
+
register_op op do |context, tensor, inputs|
|
98
95
|
reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
|
99
96
|
end
|
100
97
|
end
|
101
98
|
|
102
|
-
register_op :prod
|
103
|
-
|
104
|
-
|
105
|
-
if input_a.buffer.empty?
|
99
|
+
register_op :prod do |context, tensor, inputs|
|
100
|
+
if inputs[0].shape == [0]
|
106
101
|
convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
|
107
102
|
else
|
108
103
|
reduction(context, tensor, inputs[0], inputs[1], :prod)
|
109
104
|
end
|
110
105
|
end
|
111
106
|
|
112
|
-
register_op :argmin, buffer: true do |_context, tensor, inputs|
|
113
|
-
|
114
|
-
|
115
|
-
|
107
|
+
# register_op :argmin, buffer: true do |_context, tensor, inputs|
|
108
|
+
# axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
|
109
|
+
# rank = inputs[0].shape.size
|
110
|
+
# raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
|
111
|
+
|
112
|
+
# arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
|
113
|
+
# op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
|
114
|
+
# convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
|
115
|
+
# end
|
116
|
+
|
117
|
+
# register_op :argmax, buffer: true do |_context, tensor, inputs|
|
118
|
+
# axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
|
119
|
+
# rank = inputs[0].shape.size
|
120
|
+
# raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
|
121
|
+
|
122
|
+
# arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
|
123
|
+
# op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
|
124
|
+
# convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
|
125
|
+
# end
|
126
|
+
|
127
|
+
def reduction(child_context, tensor, value, axis, func)
|
128
|
+
if axis.nil?
|
129
|
+
value = _run(value, child_context)
|
130
|
+
size = value.shape.reduce(:*) || 1
|
131
|
+
if value.shape.empty? # for scalars, just return as is
|
132
|
+
value
|
133
|
+
else
|
134
|
+
reduction_threads = 32
|
135
|
+
items_per_thread_threshold = 4
|
136
|
+
|
137
|
+
output_buffer = _create_result_buffer(value.data_type, [], tensor.name)
|
138
|
+
event_wait_list = build_event_wait_list([value])
|
139
|
+
|
140
|
+
if (size > reduction_threads) && ((size / reduction_threads) > items_per_thread_threshold)
|
141
|
+
items_per_thread = size / reduction_threads
|
142
|
+
extra_items = size % reduction_threads
|
143
|
+
intermediate_output_buffer = _create_result_buffer(value.data_type, [reduction_threads], tensor.name)
|
144
|
+
|
145
|
+
temp_values = if extra_items.zero?
|
146
|
+
_cl_program(func, dtype: value.data_type, index: 0, n: items_per_thread, w: items_per_thread).
|
147
|
+
send(:"#{func}_#{value.data_type}", _opencl_queue, [reduction_threads], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
148
|
+
else
|
149
|
+
[_cl_program(func, dtype: value.data_type, index: 0, n: items_per_thread, w: items_per_thread).
|
150
|
+
send(:"#{func}_#{value.data_type}", _opencl_queue, [reduction_threads - 1], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list),
|
151
|
+
_cl_program(func, dtype: value.data_type, index: reduction_threads - 1, n: items_per_thread + extra_items, w: items_per_thread).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list)]
|
152
|
+
end
|
153
|
+
output_buffer.op = _cl_program(func, dtype: value.data_type, n: reduction_threads, index: 0, w: 0).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, output_buffer.cl_buffer, event_wait_list: temp_values)
|
154
|
+
output_buffer
|
155
|
+
else
|
156
|
+
output_buffer.op = _cl_program(func, dtype: value.data_type, n: size, index: 0, w: 0).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
157
|
+
output_buffer
|
158
|
+
end
|
159
|
+
end
|
160
|
+
else
|
161
|
+
return value if value.shape.empty?
|
162
|
+
|
163
|
+
axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
|
164
|
+
input = complete_eval(value, child_context)
|
165
|
+
value = value.buffer.reshape(*value.shape.reverse)
|
166
|
+
rank = input.shape.size - 1
|
167
|
+
|
168
|
+
if axis.is_a?(Array)
|
169
|
+
axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
|
170
|
+
value = value.send(func, x.to_i)
|
171
|
+
end
|
172
|
+
else
|
173
|
+
value = value.send(func, rank - axis.abs)
|
174
|
+
end
|
116
175
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
176
|
+
new_shape = if value.is_a?(NArray)
|
177
|
+
value.shape.reverse
|
178
|
+
else
|
179
|
+
value = [value]
|
180
|
+
[]
|
181
|
+
end
|
121
182
|
|
122
|
-
|
123
|
-
axis = tensor.options[:axis] || 0
|
124
|
-
rank = inputs[0].shape.size
|
125
|
-
raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
|
183
|
+
new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
|
126
184
|
|
127
|
-
|
128
|
-
|
129
|
-
convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
|
185
|
+
convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
|
186
|
+
end
|
130
187
|
end
|
131
188
|
end
|
132
189
|
end
|
@@ -121,11 +121,11 @@ module TensorStream
|
|
121
121
|
end
|
122
122
|
|
123
123
|
register_op :apply_adagrad do |context, tensor, inputs|
|
124
|
-
|
124
|
+
_target_var, _accum, lr, grad = inputs
|
125
125
|
|
126
126
|
assign = tensor.inputs[0] || tensor
|
127
127
|
assign_acc = tensor.inputs[1]
|
128
|
-
|
128
|
+
|
129
129
|
assign.buffer.dirty = true
|
130
130
|
assign_acc.buffer.dirty = true
|
131
131
|
output_buffer = assign.buffer
|
@@ -133,7 +133,7 @@ module TensorStream
|
|
133
133
|
work_group = [output_buffer.total_elements]
|
134
134
|
|
135
135
|
event_wait_list = build_event_wait_list(inputs)
|
136
|
-
event = call_program('apply_adagrad',
|
136
|
+
event = call_program('apply_adagrad',
|
137
137
|
output_buffer.data_type,
|
138
138
|
work_group,
|
139
139
|
lr.cl_buffer,
|
@@ -195,7 +195,7 @@ module TensorStream
|
|
195
195
|
event_wait_list = build_event_wait_list(inputs)
|
196
196
|
work_group = [output_buffer.total_elements]
|
197
197
|
|
198
|
-
event = call_program('apply_rms_prop', output_buffer.data_type,
|
198
|
+
event = call_program('apply_rms_prop', output_buffer.data_type,
|
199
199
|
work_group,
|
200
200
|
lr.cl_buffer,
|
201
201
|
rho.cl_buffer,
|
@@ -298,7 +298,7 @@ module TensorStream
|
|
298
298
|
end
|
299
299
|
|
300
300
|
b = wrap_opencl(labels, data_type: inputs[0].data_type, name: "#{tensor.name}_label")
|
301
|
-
|
301
|
+
|
302
302
|
event_wait_list = build_event_wait_list(inputs)
|
303
303
|
dtype = tensor.data_type
|
304
304
|
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
@@ -335,6 +335,90 @@ module TensorStream
|
|
335
335
|
output_buffer.op = event
|
336
336
|
output_buffer
|
337
337
|
end
|
338
|
+
|
339
|
+
%i[relu6].each do |op|
|
340
|
+
register_op op, noop: true do |context, tensor, inputs|
|
341
|
+
execute_func(op.to_s, tensor, inputs[0], context)
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
# Fast per pixel parallel convolution operation
|
346
|
+
register_op :conv2d do |_context, tensor, inputs|
|
347
|
+
filter = inputs[1]
|
348
|
+
batch, height, width, channel = inputs[0].shape
|
349
|
+
filter_shape = filter.shape
|
350
|
+
strides = tensor.options[:strides]
|
351
|
+
height_stride = strides[1]
|
352
|
+
width_stride = strides[2]
|
353
|
+
|
354
|
+
raise TensorStream::ValueError, " Current implementation does not yet support strides in the batch and depth dimensions." if strides[0] != 1 || strides[3] != 1
|
355
|
+
|
356
|
+
event_wait_list = build_event_wait_list(inputs)
|
357
|
+
|
358
|
+
f_height, f_width, in_channels, out_channels = filter_shape
|
359
|
+
out_shape = [batch, height / height_stride, width / width_stride, out_channels]
|
360
|
+
output_buffer = _create_result_buffer(tensor.data_type, out_shape, tensor.name)
|
361
|
+
|
362
|
+
cl_image_height = OpenCL::Int1.new(height)
|
363
|
+
cl_image_width = OpenCL::Int1.new(width)
|
364
|
+
|
365
|
+
work_dimen = [batch, height / height_stride, width / width_stride]
|
366
|
+
|
367
|
+
output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride] ).send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width, inputs[0].cl_buffer,
|
368
|
+
inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
369
|
+
output_buffer
|
370
|
+
end
|
371
|
+
|
372
|
+
register_op :conv2d_backprop_input do |context, tensor, inputs|
|
373
|
+
image_shape, filter, grad = inputs
|
374
|
+
filter_shape = filter.shape
|
375
|
+
|
376
|
+
strides = tensor.options[:strides]
|
377
|
+
height_stride = strides[1]
|
378
|
+
width_stride = strides[2]
|
379
|
+
|
380
|
+
image_shape = read_final_result(complete_eval(image_shape, context))
|
381
|
+
|
382
|
+
event_wait_list = build_event_wait_list(inputs)
|
383
|
+
output_buffer = _create_result_buffer(tensor.data_type, image_shape, tensor.name)
|
384
|
+
|
385
|
+
batch, height, width, channels = image_shape
|
386
|
+
f_height, f_width, in_channels, out_channels = filter_shape
|
387
|
+
|
388
|
+
work_dimen = [batch, height, width]
|
389
|
+
|
390
|
+
cl_image_height = OpenCL::Int1.new(height)
|
391
|
+
cl_image_width = OpenCL::Int1.new(width)
|
392
|
+
|
393
|
+
output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride] ).send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
|
394
|
+
filter.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
395
|
+
output_buffer
|
396
|
+
end
|
397
|
+
|
398
|
+
register_op :conv2d_backprop_filter do |context, tensor, inputs|
|
399
|
+
images, filter_shape, grad = inputs
|
400
|
+
|
401
|
+
event_wait_list = build_event_wait_list(inputs)
|
402
|
+
|
403
|
+
strides = tensor.options[:strides]
|
404
|
+
height_stride = strides[1]
|
405
|
+
width_stride = strides[2]
|
406
|
+
|
407
|
+
filter_shape = read_final_result(complete_eval(filter_shape, context))
|
408
|
+
output_buffer = _create_result_buffer(tensor.data_type, filter_shape, tensor.name)
|
409
|
+
|
410
|
+
batch_size, height, width, channels = images.shape
|
411
|
+
f_height, f_width, input_channels, output_channels = filter_shape
|
412
|
+
work_dimen = [f_height, f_width, output_channels]
|
413
|
+
|
414
|
+
cl_batch_size = OpenCL::Int1.new(batch_size)
|
415
|
+
cl_image_height = OpenCL::Int1.new(height)
|
416
|
+
cl_image_width = OpenCL::Int1.new(width)
|
417
|
+
|
418
|
+
output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride] ).send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
|
419
|
+
images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
420
|
+
output_buffer
|
421
|
+
end
|
338
422
|
end
|
339
423
|
end
|
340
424
|
end
|
@@ -37,9 +37,13 @@ module TensorStream
|
|
37
37
|
return buffer[0] != 0 if data_type == :boolean
|
38
38
|
return buffer[0]
|
39
39
|
end
|
40
|
-
|
41
|
-
result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
|
40
|
+
|
41
|
+
result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
|
42
42
|
data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
|
43
43
|
end
|
44
|
+
|
45
|
+
def self.nil_buffer(owner, name, data_type)
|
46
|
+
OpenCLBuffer.new(owner, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
|
47
|
+
end
|
44
48
|
end
|
45
49
|
end
|
@@ -35,7 +35,8 @@ module TensorStream
|
|
35
35
|
end
|
36
36
|
|
37
37
|
##
|
38
|
-
#
|
38
|
+
# OpenCL hardware accelerated evaluator
|
39
|
+
#
|
39
40
|
class OpenclEvaluator < BaseEvaluator
|
40
41
|
attr_accessor :retain
|
41
42
|
attr_reader :opencl_device, :opencl_context
|
@@ -53,41 +54,57 @@ module TensorStream
|
|
53
54
|
super
|
54
55
|
_create_opencl_context
|
55
56
|
@opencl_device = device.native_device
|
57
|
+
|
58
|
+
@max_work_item_dimensions = @opencl_device.max_work_item_dimensions
|
59
|
+
@max_work_item_sizes = @opencl_device.max_work_item_sizes
|
60
|
+
@max_work_group_size = @opencl_device.max_work_group_size
|
61
|
+
|
62
|
+
@local_mem_size = @opencl_device.local_mem_size
|
63
|
+
@device_type = @opencl_device.type.to_s.downcase
|
64
|
+
|
56
65
|
create_command_queue
|
57
66
|
end
|
58
67
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
68
|
+
class << self
|
69
|
+
def query_supported_devices
|
70
|
+
devices = query_devices_with_score
|
71
|
+
devices.sort_by { |a| a[1] }.map do |d|
|
72
|
+
opencl_to_device(d)
|
73
|
+
end
|
63
74
|
end
|
64
|
-
end
|
65
75
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
76
|
+
def fetch_device(query = [])
|
77
|
+
devices = query_devices_with_score
|
78
|
+
platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
|
79
|
+
opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
|
80
|
+
end
|
71
81
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
82
|
+
def opencl_to_device(dev)
|
83
|
+
device = dev[0]
|
84
|
+
index = dev[3]
|
85
|
+
platform_name = device.platform.name.tr(' ', '_').downcase
|
86
|
+
uri = [platform_name, index].join(':')
|
77
87
|
|
78
|
-
|
88
|
+
device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
|
79
89
|
|
80
|
-
|
81
|
-
|
90
|
+
OpenclDevice.new(uri, device_type, self).tap do |d|
|
91
|
+
d.native_device = device
|
92
|
+
end
|
82
93
|
end
|
83
|
-
end
|
84
94
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
95
|
+
##
|
96
|
+
# Select the best device available in the system for this evaluator
|
97
|
+
def default_device
|
98
|
+
devices = OpenclEvaluator.query_devices_with_score
|
99
|
+
device = devices.max { |a, b| a[1] <=> b[1] }
|
100
|
+
opencl_to_device(device)
|
101
|
+
end
|
102
|
+
|
103
|
+
def getset_global_opencl_context(platform)
|
104
|
+
@global_opencl_context ||= {}
|
105
|
+
@global_opencl_context[platform] ||= yield
|
106
|
+
@global_opencl_context[platform]
|
107
|
+
end
|
91
108
|
end
|
92
109
|
|
93
110
|
# opencl evaluator main entrypoint
|
@@ -228,16 +245,22 @@ module TensorStream
|
|
228
245
|
|
229
246
|
def _create_opencl_context(device = nil)
|
230
247
|
if device.nil?
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
248
|
+
all_devices_by_platform = {}
|
249
|
+
TensorStream::Evaluator::OpenclEvaluator.query_supported_devices.map(&:native_device).each do |d|
|
250
|
+
all_devices_by_platform[d.platform.name] ||= []
|
251
|
+
all_devices_by_platform[d.platform.name] << d
|
235
252
|
end
|
236
253
|
|
237
|
-
|
254
|
+
all_devices_by_platform.each do |platform, devices|
|
255
|
+
@opencl_context = TensorStream::Evaluator::OpenclEvaluator.getset_global_opencl_context(platform) do
|
256
|
+
OpenCL.create_context(devices)
|
257
|
+
end
|
258
|
+
end
|
238
259
|
else
|
239
260
|
puts "context created for #{device.native_device}"
|
240
|
-
@opencl_context =
|
261
|
+
@opencl_context = TensorStream::Evaluator::OpenclEvaluator.getset_global_opencl_context(device.native_device.platform) do
|
262
|
+
OpenCL.create_context(device.native_device)
|
263
|
+
end
|
241
264
|
end
|
242
265
|
end
|
243
266
|
|
@@ -269,11 +292,12 @@ module TensorStream
|
|
269
292
|
@context[:_cache][kernel_cache_key] ||=
|
270
293
|
begin
|
271
294
|
# puts "building #{kernel_cache_key}"
|
272
|
-
file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
|
295
|
+
file_path = File.join(ENV['TS_OPENCL_FILE_CACHE_PATH'] || '/tmp', "#{kernel}.#{suffix}.cl")
|
273
296
|
source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
|
274
297
|
File.read(file_path)
|
275
298
|
else
|
276
|
-
|
299
|
+
filenames = ['', ".#{@device_type}"].map { |type| %w[cl.erb cl].map { |ext| cl_template_path("#{kernel}#{type}", ext) } }.flatten
|
300
|
+
filename = filenames.find { |n| File.exist?(n) }
|
277
301
|
raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
|
278
302
|
|
279
303
|
source = File.read(filename)
|
@@ -322,6 +346,7 @@ module TensorStream
|
|
322
346
|
|
323
347
|
def eval_variable(tensor, _child_context)
|
324
348
|
raise "variable #{tensor.name} not initalized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
|
349
|
+
|
325
350
|
tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
|
326
351
|
tensor.buffer
|
327
352
|
end
|
@@ -339,7 +364,7 @@ module TensorStream
|
|
339
364
|
end
|
340
365
|
end
|
341
366
|
|
342
|
-
register_op :identity do |
|
367
|
+
register_op :identity do |_context, tensor, inputs|
|
343
368
|
value = inputs[0]
|
344
369
|
buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
|
345
370
|
buffer.op = build_event_wait_list(inputs)
|
@@ -351,25 +376,26 @@ module TensorStream
|
|
351
376
|
end
|
352
377
|
|
353
378
|
register_op :assign_add do |context, tensor, inputs|
|
354
|
-
value = execute_2_operand_func('add', tensor, inputs[0], inputs[1]
|
379
|
+
value = execute_2_operand_func('add', tensor, inputs[0], inputs[1])
|
355
380
|
assign_var(tensor, value, context)
|
356
381
|
end
|
357
382
|
|
358
383
|
register_op :assign_sub do |context, tensor, inputs|
|
359
|
-
value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1]
|
384
|
+
value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1])
|
360
385
|
assign_var(tensor, value, context)
|
361
386
|
end
|
362
387
|
|
363
|
-
register_op :variable, noop: true do |
|
388
|
+
register_op :variable, noop: true do |_context, tensor, _inputs|
|
364
389
|
variable = tensor.inputs[0]
|
365
390
|
raise "variable #{tensor.name} not initalized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
|
391
|
+
|
366
392
|
variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
|
367
393
|
variable.buffer
|
368
394
|
end
|
369
395
|
|
370
396
|
%i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
|
371
|
-
register_op op
|
372
|
-
execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1],
|
397
|
+
register_op op do |context, tensor, inputs|
|
398
|
+
execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], 'cond')
|
373
399
|
end
|
374
400
|
end
|
375
401
|
|
@@ -463,11 +489,11 @@ module TensorStream
|
|
463
489
|
rescue EvaluatorExcecutionException => e
|
464
490
|
_opencl_queue.finish # dump queue
|
465
491
|
puts e.message
|
466
|
-
raise e, "error #{e.message} while evaluating #{tensor.name} :
|
492
|
+
raise e, "error #{e.message} while evaluating #{tensor.name} : defined at #{tensor.source}"
|
467
493
|
rescue TensorStreamError => e
|
468
494
|
_opencl_queue.finish # dump queue
|
469
495
|
puts e.message
|
470
|
-
raise e, "error #{e.message} while evaluating #{tensor.name} :
|
496
|
+
raise e, "error #{e.message} while evaluating #{tensor.name} : defined at #{tensor.source}"
|
471
497
|
rescue StandardError => e
|
472
498
|
_opencl_queue.finish # dump queue
|
473
499
|
puts e.message
|
@@ -496,6 +522,7 @@ module TensorStream
|
|
496
522
|
cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
|
497
523
|
return @context[cache_key] if @context.key?(cache_key)
|
498
524
|
return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]
|
525
|
+
|
499
526
|
@context[cache_key] = if tensor.value.is_a?(Tensor)
|
500
527
|
_run(tensor.value, child_context)
|
501
528
|
else
|
@@ -512,7 +539,6 @@ module TensorStream
|
|
512
539
|
buffer = complete_eval(b, child_context)
|
513
540
|
|
514
541
|
if assign.buffer
|
515
|
-
# buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
|
516
542
|
event_wait_list = build_event_wait_list([buffer, assign.buffer])
|
517
543
|
assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
|
518
544
|
_opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: event_wait_list)
|
@@ -524,23 +550,32 @@ module TensorStream
|
|
524
550
|
assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
|
525
551
|
assign.value = value
|
526
552
|
end
|
553
|
+
|
527
554
|
assign.buffer.dirty = true
|
528
555
|
assign.buffer
|
529
556
|
end
|
530
557
|
|
531
|
-
def execute_2_operand_func(op_name, tensor,
|
532
|
-
a = _run(input_a, child_context)
|
533
|
-
b = _run(input_b, child_context)
|
558
|
+
def execute_2_operand_func(op_name, tensor, a, b, prog_name = nil)
|
534
559
|
a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
|
535
560
|
dtype = tensor.data_type
|
536
561
|
result_shape = TensorShape.infer_shape(a.shape, b.shape)
|
537
|
-
return
|
562
|
+
return OpenCLBuffer.nil_buffer(self, "out_#{tensor.name}", dtype) if result_shape == [0]
|
563
|
+
|
538
564
|
output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
|
539
565
|
a, b, prog, switch_operands = select_program(a, b, op_name)
|
540
566
|
m, n = result_shape
|
541
|
-
|
542
|
-
|
543
|
-
|
567
|
+
|
568
|
+
work_group = if result_shape.size > 2 && (b.shape.size.zero? || (a.shape == b.shape))
|
569
|
+
[m, result_shape.reduce(:*) / m]
|
570
|
+
elsif result_shape.size <= 2
|
571
|
+
[m || 1, n || 1]
|
572
|
+
else
|
573
|
+
raise "rank > 2 not supported for now"
|
574
|
+
end
|
575
|
+
|
576
|
+
cl_m = OpenCL::Int1.new(work_group[0])
|
577
|
+
cl_n = OpenCL::Int1.new(work_group[1])
|
578
|
+
|
544
579
|
cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition
|
545
580
|
|
546
581
|
event_wait_list = build_event_wait_list([a, b]) # add dependency wait list
|
@@ -610,6 +645,7 @@ module TensorStream
|
|
610
645
|
|
611
646
|
def auto_type_cast(a, b, name: nil)
|
612
647
|
return [a, b] if a.data_type == b.data_type
|
648
|
+
|
613
649
|
m, n = b.shape
|
614
650
|
work_group = [m || 1, n || 1]
|
615
651
|
event_wait_list = build_event_wait_list([b])
|
@@ -624,6 +660,7 @@ module TensorStream
|
|
624
660
|
|
625
661
|
def type_cast(source, data_type, name: nil)
|
626
662
|
return source if source.data_type == data_type
|
663
|
+
|
627
664
|
m, n = source.shape
|
628
665
|
work_group = [m || 1, n || 1]
|
629
666
|
event_wait_list = [source.op].compact
|
@@ -673,8 +710,6 @@ module TensorStream
|
|
673
710
|
|
674
711
|
return nil if buffer.nil?
|
675
712
|
|
676
|
-
|
677
|
-
|
678
713
|
cl_buffer = unless value.flatten.empty?
|
679
714
|
cl_buffer_size = 1 if cl_buffer_size.zero?
|
680
715
|
_opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
|
@@ -682,6 +717,7 @@ module TensorStream
|
|
682
717
|
|
683
718
|
@context[:_cache][cache_key] = OpenCLBuffer.new(self, name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
|
684
719
|
end
|
720
|
+
|
685
721
|
if data_type == :string
|
686
722
|
value[0].each_byte.with_index do |c, index|
|
687
723
|
cl_object.buffer[index] = c
|
@@ -704,11 +740,11 @@ module TensorStream
|
|
704
740
|
cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
|
705
741
|
end
|
706
742
|
|
707
|
-
if
|
708
|
-
|
743
|
+
# if OpenCL buffer is valid enqueue a write
|
744
|
+
if cl_object.cl_buffer && value && (!value.is_a?(Array) || !value.empty?)
|
745
|
+
cl_object.op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
|
709
746
|
end
|
710
747
|
|
711
|
-
cl_object.op = write_op
|
712
748
|
cl_object
|
713
749
|
end
|
714
750
|
|
@@ -718,7 +754,7 @@ module TensorStream
|
|
718
754
|
NArray.sfloat(narray_size)
|
719
755
|
when :float64
|
720
756
|
NArray.float(narray_size)
|
721
|
-
when :int, :int32, :int64, :uint64, :uint32 #NArray does not have 64 bit int types
|
757
|
+
when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
|
722
758
|
NArray.int(narray_size)
|
723
759
|
when :int16, :uint16
|
724
760
|
NArray.sint(narray_size)
|
@@ -736,7 +772,8 @@ module TensorStream
|
|
736
772
|
end
|
737
773
|
|
738
774
|
def _create_result_buffer(data_type, shape, name)
|
739
|
-
return OpenCLBuffer.
|
775
|
+
return OpenCLBuffer.nil_buffer(self, name, data_type) if shape == [0]
|
776
|
+
|
740
777
|
cache_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
|
741
778
|
@context[:_cache][:_cl_buffers][cache_key] ||= begin
|
742
779
|
# puts "create result buffer #{cache_key}"
|
@@ -759,7 +796,7 @@ module TensorStream
|
|
759
796
|
region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
|
760
797
|
cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
|
761
798
|
OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
|
762
|
-
else
|
799
|
+
else # source buffer already a sub buffer, OpenCL does not allow sub buffers from sub buffers
|
763
800
|
_create_result_buffer(tensor.data_type, shape, name)
|
764
801
|
end
|
765
802
|
end
|
@@ -768,7 +805,7 @@ module TensorStream
|
|
768
805
|
|
769
806
|
if buffer.cl_buffer.associated_memobject
|
770
807
|
buffer.op = parent_buffer.op
|
771
|
-
else
|
808
|
+
else # source buffer alreay a sub buffer, so we need to do a copy instead
|
772
809
|
region_size_in_bytes = buffer.buffer.size * buffer.buffer.element_size
|
773
810
|
start = index * region_size_in_bytes
|
774
811
|
region = [region_size_in_bytes, 1, 1]
|
@@ -841,6 +878,7 @@ module TensorStream
|
|
841
878
|
|
842
879
|
def _reduced_shape(input_shape, axes)
|
843
880
|
return [] if axes.nil? # reduce to scalar
|
881
|
+
|
844
882
|
axes = [axes] unless axes.is_a?(Array)
|
845
883
|
return input_shape if axes.empty?
|
846
884
|
|
@@ -850,39 +888,6 @@ module TensorStream
|
|
850
888
|
input_shape
|
851
889
|
end
|
852
890
|
|
853
|
-
def reduction(child_context, tensor, a, b, func)
|
854
|
-
input = complete_eval(a, child_context)
|
855
|
-
axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b
|
856
|
-
if axis.nil?
|
857
|
-
red = input.buffer.send(func)
|
858
|
-
convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
|
859
|
-
else
|
860
|
-
return input if input.shape.empty?
|
861
|
-
|
862
|
-
value = input.buffer.reshape(*input.shape.reverse)
|
863
|
-
rank = input.shape.size - 1
|
864
|
-
|
865
|
-
if axis.is_a?(Array)
|
866
|
-
axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
|
867
|
-
value = value.send(func, x.to_i)
|
868
|
-
end
|
869
|
-
else
|
870
|
-
value = value.send(func, rank - axis.abs)
|
871
|
-
end
|
872
|
-
|
873
|
-
new_shape = if value.is_a?(NArray)
|
874
|
-
value.shape.reverse
|
875
|
-
else
|
876
|
-
value = [value]
|
877
|
-
[]
|
878
|
-
end
|
879
|
-
|
880
|
-
new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
|
881
|
-
|
882
|
-
convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
|
883
|
-
end
|
884
|
-
end
|
885
|
-
|
886
891
|
# selects variants of cl programs depending on input
|
887
892
|
def select_program(input_a, input_b, op)
|
888
893
|
return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape
|