tensor_stream-opencl 0.1.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +11 -4
- data/benchmark/benchmark.rb +91 -0
- data/benchmark_intel.txt +36 -0
- data/lib/tensor_stream/opencl/array_ops.rb +395 -0
- data/lib/tensor_stream/opencl/images_ops.rb +62 -0
- data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
- data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
- data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
- data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
- data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
- data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
- data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
- data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
- data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
- data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
- data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
- data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
- data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
- data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
- data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
- data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/iris.data +150 -0
- data/samples/iris.rb +110 -0
- data/samples/mnist_data.rb +65 -0
- data/samples/multigpu.rb +73 -0
- data/samples/nearest_neighbor.rb +56 -0
- data/samples/rnn.rb +108 -0
- data/tensor_stream-opencl.gemspec +4 -1
- metadata +62 -3
@@ -0,0 +1,18 @@
|
|
1
|
+
% ctype = dtype_to_c_type(data_type)
% mul_str = mul.each_with_index.collect { |multiplier, index| "#{multiplier} * index_map_#{index}" }
// split: every work item copies one element from A into C.
// The flat work-item id is decomposed into one index per dimension using the
// `div` divisors; the index on `axis` is shifted by `step`; the source
// position in A is rebuilt from those indices with the `mul` multipliers.
// Template variables read here: data_type, mul, div, axis, step.
__kernel void split(const int offset, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
    // Get the index of the current element to be processed
    const int globalCol = get_global_id(0); // flat id of this work item

    // compute effective coordinates
    // Integer division is exact for any non-negative int; going through float
    // would lose precision once ptr exceeds 2^24.
    int ptr = globalCol;
    <% div.each_with_index do |divisor, index| %>
    <% if index == axis %>
    int index_map_<%= index %> = ptr / <%= divisor %> + <%= step %>;
    <% else %>
    int index_map_<%= index %> = ptr / <%= divisor %>;
    <% end %>
    <%# `div` must stay the divisor array here: the remainder is only carried into the next dimension %>
    <% if index < div.size - 1 %>ptr = ptr % <%= divisor %>;<% end %><% end %>
    C[offset + globalCol] = A[<%= mul_str.join(" + ") %>];

}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
|
3
|
-
__kernel void sqrt_<%= dtype %>(
|
3
|
+
__kernel void sqrt_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
|
-
const int
|
6
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
+
const int id = get_global_id(0);
|
7
6
|
|
8
|
-
C[
|
7
|
+
C[id] = sqrt(A[id]);
|
9
8
|
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
|
3
|
-
__kernel void square_<%= dtype %>(
|
3
|
+
__kernel void square_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
|
-
const int
|
6
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
+
const int id = get_global_id(0);
|
7
6
|
|
8
|
-
C[
|
7
|
+
C[id] = A[id] * A[id];
|
9
8
|
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void tan_<%= dtype %>(
|
2
|
+
__kernel void tan_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0); // Row ID of C (0..M)
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = tan(A[id]);
|
8
7
|
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void tanh_<%= dtype %>(
|
2
|
+
__kernel void tanh_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0); // Row ID of C (0..M)
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = tanh(A[id]);
|
8
7
|
}
|
@@ -1,7 +1,6 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void tanh_grad_<%= dtype %>(
|
2
|
+
__kernel void tanh_grad_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
|
6
|
-
C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
|
4
|
+
const int id = get_global_id(0);
|
5
|
+
C[id] = 1 - tanh(A[id]) * tanh(A[id]);
|
7
6
|
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
% ctype = dtype_to_c_type(data_type)

// unpack: every work item copies A[globalCol] into C.
// The destination position is computed by decomposing (index * divisors[0] +
// globalCol) into per-dimension indices with `divisors`, rotating the first
// `axis` + 1 indices when axis > 0, and recombining them with `multipliers`.
// Template variables read here: data_type, divisors, multipliers, axis.
// NOTE(review): the N argument is not read by the kernel body; it is kept so
// the argument list callers pass stays unchanged.
__kernel void unpack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
    // Get the index of the current element to be processed
    const int globalCol = get_global_id(0); // flat id of this work item

    int start = index * <%= divisors[0] %>;
    int ptr = start + globalCol;
    int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };

    // compute effective coordinates
    // Integer division is exact for any non-negative int; going through float
    // would lose precision once ptr exceeds 2^24.
    <% divisors.each_with_index do |div, dim| %>
    index_map[<%= dim %>] = ptr / <%= div %>;<% if dim < divisors.size - 1 %>ptr = ptr % <%= div %>;<% end %><% end %>

    // Apply axis translation if needed
    <% if axis > 0 %>
    int last = index_map[<%= axis %>];
    <% axis.downto(1) do |i| %> index_map[<%= i %>] = index_map[<%= (i - 1) %>];<% end %>
    index_map[0] = last;
    <% end %>

    C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
}
|
@@ -14,14 +14,15 @@ module TensorStream
|
|
14
14
|
assign.buffer.dirty = true # force buffer copy when variable is read externally
|
15
15
|
output_buffer = assign.buffer
|
16
16
|
|
17
|
-
|
18
|
-
work_group = [m || 1, n || 1]
|
19
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
20
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
17
|
+
work_group = [output_buffer.total_elements]
|
21
18
|
|
22
19
|
event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
|
23
|
-
|
24
|
-
event =
|
20
|
+
|
21
|
+
event = call_program("apply_gradient", output_buffer.data_type,
|
22
|
+
work_group,
|
23
|
+
delta.cl_buffer,
|
24
|
+
learning_rate.cl_buffer,
|
25
|
+
output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
25
26
|
output_buffer.op = event
|
26
27
|
output_buffer
|
27
28
|
end
|
@@ -37,15 +38,12 @@ module TensorStream
|
|
37
38
|
|
38
39
|
output_buffer = assign.buffer
|
39
40
|
|
40
|
-
|
41
|
-
work_group = [m || 1, n || 1]
|
42
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
43
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
41
|
+
work_group = [output_buffer.total_elements]
|
44
42
|
|
45
43
|
event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
|
46
44
|
method_call = :"apply_momentum_#{output_buffer.data_type}"
|
47
45
|
event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
|
48
|
-
send(method_call, _opencl_queue, work_group,
|
46
|
+
send(method_call, _opencl_queue, work_group, grad.cl_buffer,
|
49
47
|
learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
|
50
48
|
assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
|
51
49
|
output_buffer.op = event
|
@@ -66,15 +64,11 @@ module TensorStream
|
|
66
64
|
|
67
65
|
output_buffer = assign.buffer
|
68
66
|
|
69
|
-
|
70
|
-
work_group = [m || 1, n || 1]
|
71
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
72
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
67
|
+
work_group = [output_buffer.total_elements]
|
73
68
|
|
74
69
|
event_wait_list = build_event_wait_list(inputs)
|
75
|
-
|
76
|
-
|
77
|
-
.send(method_call, _opencl_queue, work_group, cl_m, cl_n,
|
70
|
+
event = call_program('apply_adadelta', output_buffer.data_type,
|
71
|
+
work_group,
|
78
72
|
lr.cl_buffer,
|
79
73
|
rho.cl_buffer,
|
80
74
|
epsilon.cl_buffer,
|
@@ -104,15 +98,11 @@ module TensorStream
|
|
104
98
|
|
105
99
|
output_buffer = assign.buffer
|
106
100
|
|
107
|
-
|
108
|
-
work_group = [m || 1, n || 1]
|
109
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
110
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
101
|
+
work_group = [output_buffer.total_elements]
|
111
102
|
|
112
103
|
event_wait_list = build_event_wait_list(inputs)
|
113
|
-
|
114
|
-
|
115
|
-
.send(method_call, _opencl_queue, work_group, cl_m, cl_n,
|
104
|
+
event = call_program("apply_adam", output_buffer.data_type,
|
105
|
+
work_group,
|
116
106
|
grad.cl_buffer,
|
117
107
|
lr_t.cl_buffer,
|
118
108
|
beta1_power.cl_buffer,
|
@@ -130,6 +120,99 @@ module TensorStream
|
|
130
120
|
output_buffer
|
131
121
|
end
|
132
122
|
|
123
|
+
# Adagrad update op.
#
# Marks the variable buffer and the accumulator buffer as dirty, enqueues the
# 'apply_adagrad' kernel with one work item per element of the variable
# buffer, stores the resulting event on both buffers and returns the variable
# buffer.
register_op :apply_adagrad do |_context, tensor, inputs|
  _target_var, _accum, lr, grad = inputs

  # Fall back to the op tensor itself when it has no first input.
  var_tensor = tensor.inputs[0] || tensor
  accum_tensor = tensor.inputs[1]

  var_tensor.buffer.dirty = true
  accum_tensor.buffer.dirty = true
  output_buffer = var_tensor.buffer

  global_size = [output_buffer.total_elements]
  pending = build_event_wait_list(inputs)

  kernel_args = [
    lr.cl_buffer,
    grad.cl_buffer,
    var_tensor.buffer.cl_buffer,
    accum_tensor.buffer.cl_buffer
  ]
  event = call_program('apply_adagrad', output_buffer.data_type, global_size,
                       *kernel_args, event_wait_list: pending)

  # Later readers of either buffer wait on this kernel.
  output_buffer.op = event
  accum_tensor.buffer.op = event
  output_buffer
end
|
148
|
+
|
149
|
+
# Centered RMSProp update op.
#
# Marks the four state buffers taken from tensor.inputs[0..3] as dirty,
# enqueues the 'apply_centered_rms_prop' kernel with one work item per element
# of the variable buffer, stores the resulting event on all four buffers and
# returns the variable buffer.
register_op :apply_centered_rms_prop do |_context, tensor, inputs|
  _var, _mg, _ms, _mom, lr, rho, momentum, epsilon, grad = inputs

  var_tensor, mg_tensor, ms_tensor, mom_tensor = [0, 1, 2, 3].map { |slot| tensor.inputs[slot] }
  state_tensors = [var_tensor, mg_tensor, ms_tensor, mom_tensor]

  state_tensors.each { |state| state.buffer.dirty = true }

  output_buffer = var_tensor.buffer
  pending = build_event_wait_list(inputs)
  global_size = [output_buffer.total_elements]

  # The kernel receives the ms buffer before the mg buffer.
  kernel_args = [
    lr.cl_buffer,
    rho.cl_buffer,
    momentum.cl_buffer,
    epsilon.cl_buffer,
    grad.cl_buffer,
    var_tensor.buffer.cl_buffer,
    ms_tensor.buffer.cl_buffer,
    mg_tensor.buffer.cl_buffer,
    mom_tensor.buffer.cl_buffer
  ]
  event = call_program('apply_centered_rms_prop', output_buffer.data_type, global_size,
                       *kernel_args, event_wait_list: pending)

  # Later readers of any of the state buffers wait on this kernel.
  output_buffer.op = event
  [mg_tensor, ms_tensor, mom_tensor].each { |state| state.buffer.op = event }
  output_buffer
end
|
183
|
+
|
184
|
+
# RMSProp update op.
#
# Marks the three state buffers taken from tensor.inputs[0..2] as dirty,
# enqueues the 'apply_rms_prop' kernel with one work item per element of the
# variable buffer, stores the resulting event on all three buffers and returns
# the variable buffer.
register_op :apply_rms_prop do |_context, tensor, inputs|
  _var, _ms, _mom, lr, rho, momentum, epsilon, grad = inputs

  var_tensor, ms_tensor, mom_tensor = [0, 1, 2].map { |slot| tensor.inputs[slot] }
  state_tensors = [var_tensor, ms_tensor, mom_tensor]

  state_tensors.each { |state| state.buffer.dirty = true }

  output_buffer = var_tensor.buffer
  pending = build_event_wait_list(inputs)
  global_size = [output_buffer.total_elements]

  kernel_args = [
    lr.cl_buffer,
    rho.cl_buffer,
    momentum.cl_buffer,
    epsilon.cl_buffer,
    grad.cl_buffer,
    var_tensor.buffer.cl_buffer,
    ms_tensor.buffer.cl_buffer,
    mom_tensor.buffer.cl_buffer
  ]
  event = call_program('apply_rms_prop', output_buffer.data_type, global_size,
                       *kernel_args, event_wait_list: pending)

  # Later readers of any of the state buffers wait on this kernel.
  output_buffer.op = event
  [ms_tensor, mom_tensor].each { |state| state.buffer.op = event }
  output_buffer
end
|
215
|
+
|
133
216
|
register_op :softmax do |_context, tensor, inputs|
|
134
217
|
a = inputs[0]
|
135
218
|
event_wait_list = build_event_wait_list(inputs)
|
@@ -213,7 +296,9 @@ module TensorStream
|
|
213
296
|
work_group = [m]
|
214
297
|
n = m if n.nil?
|
215
298
|
cl_n = OpenCL::Int1.new(n || 1)
|
216
|
-
event = _cl_program('softmax_grad', dtype: dtype, size: n).
|
299
|
+
event = _cl_program('softmax_grad', dtype: dtype, size: n).
|
300
|
+
send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer,
|
301
|
+
grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
217
302
|
output_buffer.op = event
|
218
303
|
output_buffer
|
219
304
|
end
|
@@ -14,6 +14,14 @@ module TensorStream
|
|
14
14
|
@op = op
|
15
15
|
end
|
16
16
|
|
17
|
+
# Number of elements described by +shape+: the product of all dimensions,
# or 1 when the shape has no dimensions.
def total_elements
  product = shape.inject { |acc, dim| acc * dim }
  return 1 unless product

  product
end
|
20
|
+
|
21
|
+
# True when the stored shape is exactly [0].
def empty_value?
  zero_length_shape = [0]
  @shape == zero_length_shape
end
|
24
|
+
|
17
25
|
def to_ruby
|
18
26
|
return [] if buffer.empty?
|
19
27
|
|
@@ -24,6 +32,7 @@ module TensorStream
|
|
24
32
|
end
|
25
33
|
|
26
34
|
if shape.empty?
|
35
|
+
return buffer.to_s if data_type == :string
|
27
36
|
return buffer[0] != 0 if data_type == :boolean
|
28
37
|
return buffer[0]
|
29
38
|
end
|
@@ -11,6 +11,8 @@ require 'narray_ffi'
|
|
11
11
|
require 'tensor_stream/evaluator/base_evaluator'
|
12
12
|
require 'tensor_stream/opencl/math_ops'
|
13
13
|
require 'tensor_stream/opencl/nn_ops'
|
14
|
+
require 'tensor_stream/opencl/images_ops'
|
15
|
+
require 'tensor_stream/opencl/array_ops'
|
14
16
|
require 'tensor_stream/helpers/op_helper'
|
15
17
|
|
16
18
|
module TensorStream
|
@@ -32,7 +34,8 @@ module TensorStream
|
|
32
34
|
end
|
33
35
|
end
|
34
36
|
|
35
|
-
##
|
37
|
+
##
|
38
|
+
# OpenCL-backed evaluator that runs tensor operations on an OpenCL device
|
36
39
|
class OpenclEvaluator < BaseEvaluator
|
37
40
|
attr_accessor :retain
|
38
41
|
attr_reader :opencl_device
|
@@ -42,6 +45,8 @@ module TensorStream
|
|
42
45
|
include TensorStream::MathHelper
|
43
46
|
include TensorStream::OpenCLHelpers::MathOps
|
44
47
|
include TensorStream::OpenCLHelpers::NNOps
|
48
|
+
include TensorStream::OpenCLHelpers::ImagesOps
|
49
|
+
include TensorStream::OpenCLHelpers::ArrayOps
|
45
50
|
|
46
51
|
def initialize(session, device, thread_pool: nil, log_intermediates: false)
|
47
52
|
super
|
@@ -86,7 +91,10 @@ module TensorStream
|
|
86
91
|
|
87
92
|
# opencl evaluator main entrypoint
|
88
93
|
def run(tensor, execution_context)
|
89
|
-
|
94
|
+
result = complete_eval(tensor, execution_context)
|
95
|
+
# puts "wait finish"
|
96
|
+
_opencl_queue.finish
|
97
|
+
read_final_result(result)
|
90
98
|
end
|
91
99
|
|
92
100
|
def run_with_buffer(tensor, context, execution_context)
|
@@ -117,9 +125,9 @@ module TensorStream
|
|
117
125
|
def enqueue_buffer_read(tensor, context)
|
118
126
|
buffer = _run(tensor, context)
|
119
127
|
if buffer.is_a?(Array)
|
120
|
-
buffer
|
128
|
+
buffer.collect do |b|
|
121
129
|
next b if b.buffer.size.zero?
|
122
|
-
_opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
|
130
|
+
b.op = _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
|
123
131
|
b
|
124
132
|
end
|
125
133
|
else
|
@@ -127,14 +135,18 @@ module TensorStream
|
|
127
135
|
return buffer if buffer.nil?
|
128
136
|
return [] if buffer.buffer.nil?
|
129
137
|
return buffer if buffer.buffer.size.zero?
|
130
|
-
_opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
|
138
|
+
buffer.op = _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
|
131
139
|
buffer
|
132
140
|
end
|
133
141
|
end
|
134
142
|
|
135
143
|
def complete_eval(tensor, context)
|
144
|
+
return nil if tensor.nil?
|
145
|
+
|
136
146
|
buffer = enqueue_buffer_read(tensor, context)
|
137
|
-
|
147
|
+
events = build_event_wait_list([buffer])
|
148
|
+
# puts "wait #{tensor.name}"
|
149
|
+
OpenCL.wait_for_events(events) unless events.empty?
|
138
150
|
buffer
|
139
151
|
end
|
140
152
|
|
@@ -162,6 +174,7 @@ module TensorStream
|
|
162
174
|
|
163
175
|
def prepare_input(tensor, context, options = {})
|
164
176
|
return nil unless tensor
|
177
|
+
|
165
178
|
tensor = resolve_placeholder(tensor)
|
166
179
|
if options[:noop]
|
167
180
|
tensor
|
@@ -210,11 +223,17 @@ module TensorStream
|
|
210
223
|
def _cl_program(kernel, args = {})
|
211
224
|
suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
|
212
225
|
@context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
226
|
+
file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
|
227
|
+
source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
|
228
|
+
File.read(file_path)
|
229
|
+
else
|
230
|
+
filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
|
231
|
+
raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
|
232
|
+
source = File.read(filename)
|
233
|
+
source = OpenclTemplateHelper.new(source).generate(args)
|
234
|
+
File.write(file_path, source) if ENV['TS_OPENCL_FILE_CACHE']
|
235
|
+
source
|
236
|
+
end
|
218
237
|
program = _opencl_context.create_program_with_source(source)
|
219
238
|
program.build
|
220
239
|
rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
|
@@ -275,10 +294,10 @@ module TensorStream
|
|
275
294
|
end
|
276
295
|
|
277
296
|
register_op :identity do |context, tensor, inputs|
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
297
|
+
value = inputs[0]
|
298
|
+
buffer = OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
|
299
|
+
buffer.op = build_event_wait_list(inputs)
|
300
|
+
buffer
|
282
301
|
end
|
283
302
|
|
284
303
|
register_op :assign, noop: true do |context, tensor, inputs|
|
@@ -308,86 +327,11 @@ module TensorStream
|
|
308
327
|
end
|
309
328
|
end
|
310
329
|
|
311
|
-
register_op :expand_dims, buffer: true do |_context, tensor, inputs|
|
312
|
-
axis = inputs[1].buffer[0]
|
313
|
-
shape = inputs[0].shape.dup
|
314
|
-
axis = -axis if axis == shape.size
|
315
|
-
new_shape = shape.insert(axis, 1).compact
|
316
|
-
new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
|
317
|
-
convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
|
318
|
-
end
|
319
|
-
|
320
|
-
register_op :fill, buffer: true do |_context, tensor, inputs|
|
321
|
-
shape = inputs[0]
|
322
|
-
value = inputs[1]
|
323
|
-
|
324
|
-
narray_size = shape.buffer.to_a.reduce(:*) || 1
|
325
|
-
cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
|
326
|
-
|
327
|
-
buffer = if cl_buffer
|
328
|
-
cl_buffer.buffer
|
329
|
-
else
|
330
|
-
allocate_narray_for_type(tensor.data_type, narray_size)
|
331
|
-
end
|
332
|
-
|
333
|
-
buffer.fill!(value.buffer[0])
|
334
|
-
convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
|
335
|
-
end
|
336
|
-
|
337
330
|
# :where op. Delegates to execute_cond_func with the predicate stored in the
# tensor options and the first two inputs.
register_op :where, noop: true do |context, tensor, inputs|
  predicate = tensor.options[:pred]
  first_input = inputs[0]
  second_input = inputs[1]

  execute_cond_func('where', tensor, predicate, first_input, second_input, context)
end
|
341
334
|
|
342
|
-
register_op :cast do |_context, tensor, inputs|
|
343
|
-
a = inputs[0]
|
344
|
-
if a.data_type != tensor.data_type
|
345
|
-
buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
346
|
-
m, n = a.shape
|
347
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
348
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
349
|
-
work_group = [m || 1, n || 1]
|
350
|
-
event_wait_list = build_event_wait_list(inputs)
|
351
|
-
buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
|
352
|
-
buffer
|
353
|
-
else
|
354
|
-
a
|
355
|
-
end
|
356
|
-
end
|
357
|
-
|
358
|
-
register_op :stack do |_context, tensor, inputs|
|
359
|
-
axis = tensor.options[:axis] || 0
|
360
|
-
shape = inputs[0].shape
|
361
|
-
rank = shape.size + 1
|
362
|
-
elem_size = shape.empty? ? 1 : shape.reduce(:*)
|
363
|
-
|
364
|
-
new_shape = [inputs.size]
|
365
|
-
shape.inject(new_shape) { |ns, s| ns << s }
|
366
|
-
|
367
|
-
divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
|
368
|
-
a << s * a.last
|
369
|
-
end.reverse
|
370
|
-
|
371
|
-
axis = rank + axis if axis < 0
|
372
|
-
rotated_shape = Array.new(axis + 1) { new_shape.shift }
|
373
|
-
new_shape = rotated_shape.rotate! + new_shape
|
374
|
-
|
375
|
-
output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
|
376
|
-
multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
|
377
|
-
a << s * a.last
|
378
|
-
end.reverse
|
379
|
-
|
380
|
-
cl_n = OpenCL::Int1.new(elem_size)
|
381
|
-
work_group = [elem_size]
|
382
|
-
event_wait_list = build_event_wait_list(inputs)
|
383
|
-
ops = inputs.each_with_index.map do |input, index|
|
384
|
-
cl_index = OpenCL::Int1.new(index)
|
385
|
-
_cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
386
|
-
end
|
387
|
-
output_buffer.op = ops
|
388
|
-
output_buffer
|
389
|
-
end
|
390
|
-
|
391
335
|
register_op :check_numerics, noop: true do |context, tensor, inputs|
|
392
336
|
a = complete_eval(inputs[0], context)
|
393
337
|
name = tensor.options[:name]
|
@@ -420,86 +364,18 @@ module TensorStream
|
|
420
364
|
a
|
421
365
|
end
|
422
366
|
|
423
|
-
register_op :rank do |_context, tensor, inputs|
|
424
|
-
wrap_opencl(inputs[0].shape.size, data_type: tensor.data_type, name: tensor.name)
|
425
|
-
end
|
426
|
-
|
427
367
|
# :stop_gradient op. Passes its first input through unchanged.
register_op :stop_gradient do |_context, _tensor, inputs|
  passthrough = inputs[0]
  passthrough
end
|
430
370
|
|
431
|
-
register_op :slice, noop: true do |context, tensor, inputs|
|
432
|
-
input_a = complete_eval(inputs[0], context)
|
433
|
-
input_b = read_final_result(complete_eval(inputs[1], context))
|
434
|
-
size = tensor.options[:size]
|
435
|
-
|
436
|
-
slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse
|
437
|
-
|
438
|
-
new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
|
439
|
-
sliced = new_buf.slice[*slice_param]
|
440
|
-
convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
|
441
|
-
end
|
442
|
-
|
443
|
-
register_op :transpose, buffer: true do |_context, tensor, inputs|
|
444
|
-
t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
|
445
|
-
|
446
|
-
if inputs[0].shape.size == 2 && inputs[1].nil?
|
447
|
-
transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
|
448
|
-
res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
|
449
|
-
res
|
450
|
-
else
|
451
|
-
rank = inputs[0].shape.size
|
452
|
-
perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
|
453
|
-
new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
|
454
|
-
output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
|
455
|
-
transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
|
456
|
-
|
457
|
-
write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
|
458
|
-
output_buffer.op = write_op
|
459
|
-
output_buffer
|
460
|
-
end
|
461
|
-
end
|
462
|
-
|
463
|
-
register_op :index, noop: true do |context, tensor, inputs|
|
464
|
-
a = _run(inputs[0], context)
|
465
|
-
index = read_final_result(_run(inputs[1], context))
|
466
|
-
|
467
|
-
if a.is_a?(OutputGroup)
|
468
|
-
a.outputs[index]
|
469
|
-
elsif a.is_a?(Array)
|
470
|
-
a[index]
|
471
|
-
else
|
472
|
-
new_shape = a.shape.dup
|
473
|
-
new_shape.shift
|
474
|
-
input_a = read_final_result(a)
|
475
|
-
convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
|
476
|
-
end
|
477
|
-
end
|
478
|
-
|
479
371
|
# :broadcast_gradient_args op.
#
# Feeds the contents of the first two input buffers (as Ruby arrays) to
# get_broadcast_gradient_args and wraps the two results as int32 OpenCL
# buffers inside an OutputGroup. The second buffer is named "<tensor name>:1".
register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
  lhs = inputs[0].buffer.to_a
  rhs = inputs[1].buffer.to_a
  rx, ry = get_broadcast_gradient_args(lhs, rhs)

  wrapped_rx = wrap_opencl(rx, data_type: :int32, name: tensor.name)
  wrapped_ry = wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")

  OutputGroup.new([wrapped_rx, wrapped_ry], tensor.inputs.map(&:data_type))
end
|
483
375
|
|
484
|
-
register_op :
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
register_op :reshape, buffer: true do |_context, tensor, inputs|
|
489
|
-
arr = inputs[0]
|
490
|
-
new_shape = read_final_result(inputs[1])
|
491
|
-
|
492
|
-
shape = if new_shape.size.zero? && arr.buffer.size == 1
|
493
|
-
new_shape
|
494
|
-
else
|
495
|
-
TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
|
496
|
-
end
|
497
|
-
|
498
|
-
convert_to_opencl(arr.buffer, shape, data_type: arr.data_type, name: tensor.name)
|
499
|
-
end
|
500
|
-
|
501
|
-
register_op :flow_group do |context, _tensor, inputs|
|
502
|
-
_opencl_queue.finish
|
376
|
+
# :flow_group op. Blocks until every event attached to the inputs has
# completed; the op itself yields nil.
register_op :flow_group do |_context, _tensor, inputs|
  pending_events = build_event_wait_list(inputs)
  unless pending_events.empty?
    OpenCL.wait_for_events(pending_events)
  end

  nil
end
|
505
381
|
|
@@ -657,7 +533,10 @@ module TensorStream
|
|
657
533
|
cl_n = OpenCL::Int1.new(n || 1)
|
658
534
|
|
659
535
|
event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
|
660
|
-
output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
|
536
|
+
output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
|
537
|
+
send(:"#{op_name}_#{dtype}", _opencl_queue, work_group,
|
538
|
+
cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer,
|
539
|
+
output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
661
540
|
output_buffer
|
662
541
|
end
|
663
542
|
|
@@ -667,16 +546,17 @@ module TensorStream
|
|
667
546
|
dtype = tensor.data_type
|
668
547
|
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
669
548
|
|
670
|
-
|
671
|
-
work_group = [m || 1, n || 1]
|
672
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
673
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
549
|
+
work_group = [a.total_elements]
|
674
550
|
|
675
|
-
event =
|
551
|
+
event = call_program(op_name, dtype, work_group, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
676
552
|
output_buffer.op = event
|
677
553
|
output_buffer
|
678
554
|
end
|
679
555
|
|
556
|
+
# Looks up the compiled program for +name+ / +dtype+ and invokes its
# "<name>_<dtype>" kernel on the evaluator's command queue.
#
# @param name [String, Symbol] kernel base name
# @param dtype [Symbol] data type used to pick the kernel variant
# @param work_group [Array<Integer>] work size handed to the kernel call
# @param args remaining kernel arguments (and trailing options), forwarded as is
# @return whatever the kernel invocation returns
def call_program(name, dtype, work_group, *args)
  program = _cl_program(name.to_s, dtype: dtype)
  kernel_method = :"#{name}_#{dtype}"
  program.send(kernel_method, _opencl_queue, work_group, *args)
end
|
559
|
+
|
680
560
|
def auto_type_cast(a, b, name: nil)
|
681
561
|
return [a, b] if a.data_type == b.data_type
|
682
562
|
m, n = b.shape
|
@@ -728,16 +608,20 @@ module TensorStream
|
|
728
608
|
@context[:_cache][cache_key]
|
729
609
|
else
|
730
610
|
narray_size = shape.reduce(:*) || 1
|
611
|
+
cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
|
731
612
|
|
732
613
|
buffer = if value.is_a?(NArray)
|
733
614
|
value
|
615
|
+
elsif data_type == :string && shape.empty?
|
616
|
+
cl_buffer_size = value[0].bytesize
|
617
|
+
allocate_narray_for_type(data_type, value[0].bytesize)
|
734
618
|
else
|
735
619
|
allocate_narray_for_type(data_type, narray_size)
|
736
620
|
end
|
737
621
|
|
738
622
|
return nil if buffer.nil?
|
739
623
|
|
740
|
-
|
624
|
+
|
741
625
|
|
742
626
|
cl_buffer = unless value.flatten.empty?
|
743
627
|
cl_buffer_size = 1 if cl_buffer_size.zero?
|
@@ -746,8 +630,11 @@ module TensorStream
|
|
746
630
|
|
747
631
|
@context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
|
748
632
|
end
|
749
|
-
|
750
|
-
|
633
|
+
if data_type == :string
|
634
|
+
value[0].each_byte.with_index do |c, index|
|
635
|
+
cl_object.buffer[index] = c
|
636
|
+
end
|
637
|
+
elsif value.is_a?(Array)
|
751
638
|
value.flatten.each_with_index do |element, index|
|
752
639
|
cl_object.buffer[index] = if element.is_a?(Tensor)
|
753
640
|
read_final_result(complete_eval(element, {}))
|
@@ -765,7 +652,10 @@ module TensorStream
|
|
765
652
|
cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
|
766
653
|
end
|
767
654
|
|
768
|
-
|
655
|
+
if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
|
656
|
+
write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
|
657
|
+
end
|
658
|
+
|
769
659
|
cl_object.op = write_op
|
770
660
|
cl_object
|
771
661
|
end
|
@@ -780,8 +670,12 @@ module TensorStream
|
|
780
670
|
NArray.int(narray_size)
|
781
671
|
when :int16
|
782
672
|
NArray.sint(narray_size)
|
673
|
+
when :uint8
|
674
|
+
NArray.byte(narray_size)
|
783
675
|
when :boolean
|
784
676
|
NArray.byte(narray_size)
|
677
|
+
when :string
|
678
|
+
NArray.byte(narray_size)
|
785
679
|
when :unknown
|
786
680
|
nil
|
787
681
|
else
|
@@ -799,6 +693,65 @@ module TensorStream
|
|
799
693
|
end
|
800
694
|
end
|
801
695
|
|
696
|
+
# Returns the cached output buffer for slice number +index+ of +parent_buffer+,
# creating it on first use. All slices are assumed to have the same size.
#
# When the parent is not itself a sub buffer, an OpenCL sub buffer covering the
# slice's byte range is created. Otherwise an independent result buffer is
# created and the byte range is copied into it on every call.
#
# @param parent_buffer [OpenCLBuffer] buffer the slice is taken from
# @param index [Integer] slice number; the byte offset is index * slice bytes
# @param data_type [Symbol] element type of the slice
# @param shape [Array<Integer>] shape of the slice
# @param name [String] name given to the created buffer
# @return [OpenCLBuffer] the slice buffer, with +op+ set to the event to wait on
def _create_result_sub_buffer(parent_buffer, index, data_type, shape, name)
  cache_key = "_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
    if parent_buffer.cl_buffer.associated_memobject.nil?
      scalar_like = shape.empty? || shape == [0]
      size = scalar_like ? 1 : shape.reduce(:*)
      buffer = allocate_narray_for_type(data_type, size)

      slice_bytes = buffer.size * buffer.element_size
      region = OpenCL::BufferRegion.new(index * slice_bytes, slice_bytes)
      cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
      OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
    else
      # The parent already is a sub buffer, so fall back to a standalone
      # buffer. Use the data_type argument: no `tensor` exists in this method.
      _create_result_buffer(data_type, shape, name)
    end
  end

  buffer = @context[:_cache][:_cl_buffers][cache_key]

  if buffer.cl_buffer.associated_memobject
    # Sub buffer: nothing to copy, only inherit the parent's pending event.
    buffer.op = parent_buffer.op
  else
    # Standalone buffer: copy the slice's bytes out of the parent.
    region_size_in_bytes = buffer.buffer.size * buffer.buffer.element_size
    start = index * region_size_in_bytes
    region = [region_size_in_bytes, 1, 1]
    buffer.op = _opencl_queue.enqueue_copy_buffer_rect(parent_buffer.cl_buffer, buffer.cl_buffer, region,
                                                       src_origin: [start, 0, 0],
                                                       event_wait_list: parent_buffer.op)
  end

  buffer
end
|
726
|
+
|
727
|
+
# Returns the cached output buffer for a slice of +parent_buffer+ whose byte
# offset and byte length are given explicitly, creating it on first use.
# Use this when slices have different sizes.
#
# When the parent is not itself a sub buffer, an OpenCL sub buffer covering the
# byte range is created (named "<name>/sub"). Otherwise an independent result
# buffer is created and the byte range is copied into it on every call.
#
# @param parent_buffer [OpenCLBuffer] buffer the slice is taken from
# @param index [Integer] slice number, used only for the cache key
# @param start [Integer] byte offset of the slice inside the parent
# @param region_size_in_bytes [Integer] byte length of the slice
# @param data_type [Symbol] element type of the slice
# @param shape [Array<Integer>] shape of the slice
# @param name [String] base name for the created buffer
# @return [OpenCLBuffer] the slice buffer, with +op+ set to the event to wait on
def _create_variable_result_sub_buffer(parent_buffer, index, start, region_size_in_bytes, data_type, shape, name)
  cache_key = "_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
    if parent_buffer.cl_buffer.associated_memobject.nil?
      scalar_like = shape.empty? || shape == [0]
      size = scalar_like ? 1 : shape.reduce(:*)
      buffer = allocate_narray_for_type(data_type, size)

      region = OpenCL::BufferRegion.new(start, region_size_in_bytes)
      cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
      OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: "#{name}/sub")
    else
      # The parent already is a sub buffer, so fall back to a standalone
      # buffer. Use the data_type argument: no `tensor` exists in this method.
      _create_result_buffer(data_type, shape, name)
    end
  end

  buffer = @context[:_cache][:_cl_buffers][cache_key]

  if buffer.cl_buffer.associated_memobject
    # Sub buffer: nothing to copy, only inherit the parent's pending event.
    buffer.op = parent_buffer.op
  else
    # Standalone buffer: copy the requested byte range out of the parent.
    region = [region_size_in_bytes, 1, 1]
    buffer.op = _opencl_queue.enqueue_copy_buffer_rect(parent_buffer.cl_buffer, buffer.cl_buffer, region,
                                                       src_origin: [start, 0, 0],
                                                       event_wait_list: parent_buffer.op)
  end

  buffer
end
|
754
|
+
|
802
755
|
def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
|
803
756
|
if target_axis == current_axis
|
804
757
|
if a[0].is_a?(Array)
|
@@ -898,7 +851,11 @@ module TensorStream
|
|
898
851
|
end
|
899
852
|
|
900
853
|
def build_event_wait_list(inputs)
|
901
|
-
inputs.
|
854
|
+
if inputs.is_a?(Array)
|
855
|
+
inputs.flatten.compact.map(&:op).compact.uniq
|
856
|
+
else
|
857
|
+
inputs.op ? [inputs.op] : []
|
858
|
+
end
|
902
859
|
end
|
903
860
|
|
904
861
|
def resolve_placeholder(placeholder, _execution_context = {})
|