tensor_stream-opencl 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +11 -4
- data/benchmark/benchmark.rb +91 -0
- data/benchmark_intel.txt +36 -0
- data/lib/tensor_stream/opencl/array_ops.rb +395 -0
- data/lib/tensor_stream/opencl/images_ops.rb +62 -0
- data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
- data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
- data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
- data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
- data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
- data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
- data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
- data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
- data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
- data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
- data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
- data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
- data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
- data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
- data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
- data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/iris.data +150 -0
- data/samples/iris.rb +110 -0
- data/samples/mnist_data.rb +65 -0
- data/samples/multigpu.rb +73 -0
- data/samples/nearest_neighbor.rb +56 -0
- data/samples/rnn.rb +108 -0
- data/tensor_stream-opencl.gemspec +4 -1
- metadata +62 -3
@@ -0,0 +1,18 @@
|
|
1
|
+
% ctype = dtype_to_c_type(data_type)
|
2
|
+
% mul_str = mul.each_with_index.collect { |mul, index| "#{mul} * index_map_#{index}" }
|
3
|
+
__kernel void split(const int offset, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
const int globalCol = get_global_id(0); // Col ID of C (0..N)
|
6
|
+
|
7
|
+
// compute effective coordinates
|
8
|
+
int ptr = globalCol;
|
9
|
+
<% div.each_with_index do |div, index| %>
|
10
|
+
<% if index == axis %>
|
11
|
+
int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>) + <%= step %>;
|
12
|
+
<% else %>
|
13
|
+
int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>);
|
14
|
+
<% end %>
|
15
|
+
<% if index < div.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
|
16
|
+
C[offset + globalCol] = A[<%= mul_str.join(" + ") %>];
|
17
|
+
|
18
|
+
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
|
3
|
-
__kernel void sqrt_<%= dtype %>(
|
3
|
+
__kernel void sqrt_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
|
-
const int
|
6
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
+
const int id = get_global_id(0);
|
7
6
|
|
8
|
-
C[
|
7
|
+
C[id] = sqrt(A[id]);
|
9
8
|
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
|
3
|
-
__kernel void square_<%= dtype %>(
|
3
|
+
__kernel void square_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
|
-
const int
|
6
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
+
const int id = get_global_id(0);
|
7
6
|
|
8
|
-
C[
|
7
|
+
C[id] = A[id] * A[id];
|
9
8
|
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void tan_<%= dtype %>(
|
2
|
+
__kernel void tan_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0); // Row ID of C (0..M)
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = tan(A[id]);
|
8
7
|
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void tanh_<%= dtype %>(
|
2
|
+
__kernel void tanh_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0); // Row ID of C (0..M)
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = tanh(A[id]);
|
8
7
|
}
|
@@ -1,7 +1,6 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void tanh_grad_<%= dtype %>(
|
2
|
+
__kernel void tanh_grad_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
|
6
|
-
C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
|
4
|
+
const int id = get_global_id(0);
|
5
|
+
C[id] = 1 - tanh(A[id]) * tanh(A[id]);
|
7
6
|
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
% ctype = dtype_to_c_type(data_type)
|
2
|
+
|
3
|
+
__kernel void unpack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
const int globalCol = get_global_id(0); // Col ID of C (0..N)
|
6
|
+
|
7
|
+
int start = index * <%= divisors[0] %>;
|
8
|
+
int ptr = start + globalCol;
|
9
|
+
int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
|
10
|
+
|
11
|
+
// compute effective coordinates
|
12
|
+
<% divisors.each_with_index do |div, index| %>
|
13
|
+
index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
|
14
|
+
|
15
|
+
// Apply axis translation if needed
|
16
|
+
<% if axis > 0 %>
|
17
|
+
int last = index_map[<%= axis %>];
|
18
|
+
<% axis.downto(1) do |i| %> index_map[<%= i %>] = index_map[<%= (i - 1) %>];<% end %>
|
19
|
+
index_map[0] = last;
|
20
|
+
<% end%>
|
21
|
+
|
22
|
+
C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
|
23
|
+
}
|
@@ -14,14 +14,15 @@ module TensorStream
|
|
14
14
|
assign.buffer.dirty = true # force buffer copy when variable is read externally
|
15
15
|
output_buffer = assign.buffer
|
16
16
|
|
17
|
-
|
18
|
-
work_group = [m || 1, n || 1]
|
19
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
20
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
17
|
+
work_group = [output_buffer.total_elements]
|
21
18
|
|
22
19
|
event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
|
23
|
-
|
24
|
-
event =
|
20
|
+
|
21
|
+
event = call_program("apply_gradient", output_buffer.data_type,
|
22
|
+
work_group,
|
23
|
+
delta.cl_buffer,
|
24
|
+
learning_rate.cl_buffer,
|
25
|
+
output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
25
26
|
output_buffer.op = event
|
26
27
|
output_buffer
|
27
28
|
end
|
@@ -37,15 +38,12 @@ module TensorStream
|
|
37
38
|
|
38
39
|
output_buffer = assign.buffer
|
39
40
|
|
40
|
-
|
41
|
-
work_group = [m || 1, n || 1]
|
42
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
43
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
41
|
+
work_group = [output_buffer.total_elements]
|
44
42
|
|
45
43
|
event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
|
46
44
|
method_call = :"apply_momentum_#{output_buffer.data_type}"
|
47
45
|
event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
|
48
|
-
send(method_call, _opencl_queue, work_group,
|
46
|
+
send(method_call, _opencl_queue, work_group, grad.cl_buffer,
|
49
47
|
learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
|
50
48
|
assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
|
51
49
|
output_buffer.op = event
|
@@ -66,15 +64,11 @@ module TensorStream
|
|
66
64
|
|
67
65
|
output_buffer = assign.buffer
|
68
66
|
|
69
|
-
|
70
|
-
work_group = [m || 1, n || 1]
|
71
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
72
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
67
|
+
work_group = [output_buffer.total_elements]
|
73
68
|
|
74
69
|
event_wait_list = build_event_wait_list(inputs)
|
75
|
-
|
76
|
-
|
77
|
-
.send(method_call, _opencl_queue, work_group, cl_m, cl_n,
|
70
|
+
event = call_program('apply_adadelta', output_buffer.data_type,
|
71
|
+
work_group,
|
78
72
|
lr.cl_buffer,
|
79
73
|
rho.cl_buffer,
|
80
74
|
epsilon.cl_buffer,
|
@@ -104,15 +98,11 @@ module TensorStream
|
|
104
98
|
|
105
99
|
output_buffer = assign.buffer
|
106
100
|
|
107
|
-
|
108
|
-
work_group = [m || 1, n || 1]
|
109
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
110
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
101
|
+
work_group = [output_buffer.total_elements]
|
111
102
|
|
112
103
|
event_wait_list = build_event_wait_list(inputs)
|
113
|
-
|
114
|
-
|
115
|
-
.send(method_call, _opencl_queue, work_group, cl_m, cl_n,
|
104
|
+
event = call_program("apply_adam", output_buffer.data_type,
|
105
|
+
work_group,
|
116
106
|
grad.cl_buffer,
|
117
107
|
lr_t.cl_buffer,
|
118
108
|
beta1_power.cl_buffer,
|
@@ -130,6 +120,99 @@ module TensorStream
|
|
130
120
|
output_buffer
|
131
121
|
end
|
132
122
|
|
123
|
+
register_op :apply_adagrad do |context, tensor, inputs|
|
124
|
+
target_var, accum, lr, grad = inputs
|
125
|
+
|
126
|
+
assign = tensor.inputs[0] || tensor
|
127
|
+
assign_acc = tensor.inputs[1]
|
128
|
+
|
129
|
+
assign.buffer.dirty = true
|
130
|
+
assign_acc.buffer.dirty = true
|
131
|
+
output_buffer = assign.buffer
|
132
|
+
|
133
|
+
work_group = [output_buffer.total_elements]
|
134
|
+
|
135
|
+
event_wait_list = build_event_wait_list(inputs)
|
136
|
+
event = call_program('apply_adagrad',
|
137
|
+
output_buffer.data_type,
|
138
|
+
work_group,
|
139
|
+
lr.cl_buffer,
|
140
|
+
grad.cl_buffer,
|
141
|
+
assign.buffer.cl_buffer,
|
142
|
+
assign_acc.buffer.cl_buffer,
|
143
|
+
event_wait_list: event_wait_list)
|
144
|
+
output_buffer.op = event
|
145
|
+
assign_acc.buffer.op = event
|
146
|
+
output_buffer
|
147
|
+
end
|
148
|
+
|
149
|
+
register_op :apply_centered_rms_prop do |context, tensor, inputs|
|
150
|
+
var, mg, ms, mom, lr, rho, momentum, epsilon, grad = inputs
|
151
|
+
|
152
|
+
assign = tensor.inputs[0]
|
153
|
+
assign_mg = tensor.inputs[1]
|
154
|
+
assign_ms = tensor.inputs[2]
|
155
|
+
assign_mom = tensor.inputs[3]
|
156
|
+
|
157
|
+
assign.buffer.dirty = true
|
158
|
+
assign_mg.buffer.dirty = true
|
159
|
+
assign_ms.buffer.dirty = true
|
160
|
+
assign_mom.buffer.dirty = true
|
161
|
+
output_buffer = assign.buffer
|
162
|
+
event_wait_list = build_event_wait_list(inputs)
|
163
|
+
work_group = [output_buffer.total_elements]
|
164
|
+
|
165
|
+
event = call_program('apply_centered_rms_prop', output_buffer.data_type, work_group,
|
166
|
+
lr.cl_buffer,
|
167
|
+
rho.cl_buffer,
|
168
|
+
momentum.cl_buffer,
|
169
|
+
epsilon.cl_buffer,
|
170
|
+
grad.cl_buffer,
|
171
|
+
assign.buffer.cl_buffer,
|
172
|
+
assign_ms.buffer.cl_buffer,
|
173
|
+
assign_mg.buffer.cl_buffer,
|
174
|
+
assign_mom.buffer.cl_buffer,
|
175
|
+
event_wait_list: event_wait_list)
|
176
|
+
|
177
|
+
output_buffer.op = event
|
178
|
+
assign_mg.buffer.op = event
|
179
|
+
assign_ms.buffer.op = event
|
180
|
+
assign_mom.buffer.op = event
|
181
|
+
output_buffer
|
182
|
+
end
|
183
|
+
|
184
|
+
register_op :apply_rms_prop do |context, tensor, inputs|
|
185
|
+
var, ms, mom, lr, rho, momentum, epsilon, grad = inputs
|
186
|
+
|
187
|
+
assign = tensor.inputs[0]
|
188
|
+
assign_ms = tensor.inputs[1]
|
189
|
+
assign_mom = tensor.inputs[2]
|
190
|
+
|
191
|
+
assign.buffer.dirty = true
|
192
|
+
assign_ms.buffer.dirty = true
|
193
|
+
assign_mom.buffer.dirty = true
|
194
|
+
output_buffer = assign.buffer
|
195
|
+
event_wait_list = build_event_wait_list(inputs)
|
196
|
+
work_group = [output_buffer.total_elements]
|
197
|
+
|
198
|
+
event = call_program('apply_rms_prop', output_buffer.data_type,
|
199
|
+
work_group,
|
200
|
+
lr.cl_buffer,
|
201
|
+
rho.cl_buffer,
|
202
|
+
momentum.cl_buffer,
|
203
|
+
epsilon.cl_buffer,
|
204
|
+
grad.cl_buffer,
|
205
|
+
assign.buffer.cl_buffer,
|
206
|
+
assign_ms.buffer.cl_buffer,
|
207
|
+
assign_mom.buffer.cl_buffer,
|
208
|
+
event_wait_list: event_wait_list)
|
209
|
+
|
210
|
+
output_buffer.op = event
|
211
|
+
assign_ms.buffer.op = event
|
212
|
+
assign_mom.buffer.op = event
|
213
|
+
output_buffer
|
214
|
+
end
|
215
|
+
|
133
216
|
register_op :softmax do |_context, tensor, inputs|
|
134
217
|
a = inputs[0]
|
135
218
|
event_wait_list = build_event_wait_list(inputs)
|
@@ -213,7 +296,9 @@ module TensorStream
|
|
213
296
|
work_group = [m]
|
214
297
|
n = m if n.nil?
|
215
298
|
cl_n = OpenCL::Int1.new(n || 1)
|
216
|
-
event = _cl_program('softmax_grad', dtype: dtype, size: n).
|
299
|
+
event = _cl_program('softmax_grad', dtype: dtype, size: n).
|
300
|
+
send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer,
|
301
|
+
grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
217
302
|
output_buffer.op = event
|
218
303
|
output_buffer
|
219
304
|
end
|
@@ -14,6 +14,14 @@ module TensorStream
|
|
14
14
|
@op = op
|
15
15
|
end
|
16
16
|
|
17
|
+
def total_elements
|
18
|
+
shape.reduce(:*) || 1
|
19
|
+
end
|
20
|
+
|
21
|
+
def empty_value?
|
22
|
+
@shape == [0]
|
23
|
+
end
|
24
|
+
|
17
25
|
def to_ruby
|
18
26
|
return [] if buffer.empty?
|
19
27
|
|
@@ -24,6 +32,7 @@ module TensorStream
|
|
24
32
|
end
|
25
33
|
|
26
34
|
if shape.empty?
|
35
|
+
return buffer.to_s if data_type == :string
|
27
36
|
return buffer[0] != 0 if data_type == :boolean
|
28
37
|
return buffer[0]
|
29
38
|
end
|
@@ -11,6 +11,8 @@ require 'narray_ffi'
|
|
11
11
|
require 'tensor_stream/evaluator/base_evaluator'
|
12
12
|
require 'tensor_stream/opencl/math_ops'
|
13
13
|
require 'tensor_stream/opencl/nn_ops'
|
14
|
+
require 'tensor_stream/opencl/images_ops'
|
15
|
+
require 'tensor_stream/opencl/array_ops'
|
14
16
|
require 'tensor_stream/helpers/op_helper'
|
15
17
|
|
16
18
|
module TensorStream
|
@@ -32,7 +34,8 @@ module TensorStream
|
|
32
34
|
end
|
33
35
|
end
|
34
36
|
|
35
|
-
##
|
37
|
+
##
|
38
|
+
# PURE ruby evaluator used for testing and development
|
36
39
|
class OpenclEvaluator < BaseEvaluator
|
37
40
|
attr_accessor :retain
|
38
41
|
attr_reader :opencl_device
|
@@ -42,6 +45,8 @@ module TensorStream
|
|
42
45
|
include TensorStream::MathHelper
|
43
46
|
include TensorStream::OpenCLHelpers::MathOps
|
44
47
|
include TensorStream::OpenCLHelpers::NNOps
|
48
|
+
include TensorStream::OpenCLHelpers::ImagesOps
|
49
|
+
include TensorStream::OpenCLHelpers::ArrayOps
|
45
50
|
|
46
51
|
def initialize(session, device, thread_pool: nil, log_intermediates: false)
|
47
52
|
super
|
@@ -86,7 +91,10 @@ module TensorStream
|
|
86
91
|
|
87
92
|
# opencl evaluator main entrypoint
|
88
93
|
def run(tensor, execution_context)
|
89
|
-
|
94
|
+
result = complete_eval(tensor, execution_context)
|
95
|
+
# puts "wait finish"
|
96
|
+
_opencl_queue.finish
|
97
|
+
read_final_result(result)
|
90
98
|
end
|
91
99
|
|
92
100
|
def run_with_buffer(tensor, context, execution_context)
|
@@ -117,9 +125,9 @@ module TensorStream
|
|
117
125
|
def enqueue_buffer_read(tensor, context)
|
118
126
|
buffer = _run(tensor, context)
|
119
127
|
if buffer.is_a?(Array)
|
120
|
-
buffer
|
128
|
+
buffer.collect do |b|
|
121
129
|
next b if b.buffer.size.zero?
|
122
|
-
_opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
|
130
|
+
b.op = _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
|
123
131
|
b
|
124
132
|
end
|
125
133
|
else
|
@@ -127,14 +135,18 @@ module TensorStream
|
|
127
135
|
return buffer if buffer.nil?
|
128
136
|
return [] if buffer.buffer.nil?
|
129
137
|
return buffer if buffer.buffer.size.zero?
|
130
|
-
_opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
|
138
|
+
buffer.op = _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
|
131
139
|
buffer
|
132
140
|
end
|
133
141
|
end
|
134
142
|
|
135
143
|
def complete_eval(tensor, context)
|
144
|
+
return nil if tensor.nil?
|
145
|
+
|
136
146
|
buffer = enqueue_buffer_read(tensor, context)
|
137
|
-
|
147
|
+
events = build_event_wait_list([buffer])
|
148
|
+
# puts "wait #{tensor.name}"
|
149
|
+
OpenCL.wait_for_events(events) unless events.empty?
|
138
150
|
buffer
|
139
151
|
end
|
140
152
|
|
@@ -162,6 +174,7 @@ module TensorStream
|
|
162
174
|
|
163
175
|
def prepare_input(tensor, context, options = {})
|
164
176
|
return nil unless tensor
|
177
|
+
|
165
178
|
tensor = resolve_placeholder(tensor)
|
166
179
|
if options[:noop]
|
167
180
|
tensor
|
@@ -210,11 +223,17 @@ module TensorStream
|
|
210
223
|
def _cl_program(kernel, args = {})
|
211
224
|
suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
|
212
225
|
@context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
226
|
+
file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
|
227
|
+
source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
|
228
|
+
File.read(file_path)
|
229
|
+
else
|
230
|
+
filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
|
231
|
+
raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
|
232
|
+
source = File.read(filename)
|
233
|
+
source = OpenclTemplateHelper.new(source).generate(args)
|
234
|
+
File.write(file_path, source) if ENV['TS_OPENCL_FILE_CACHE']
|
235
|
+
source
|
236
|
+
end
|
218
237
|
program = _opencl_context.create_program_with_source(source)
|
219
238
|
program.build
|
220
239
|
rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
|
@@ -275,10 +294,10 @@ module TensorStream
|
|
275
294
|
end
|
276
295
|
|
277
296
|
register_op :identity do |context, tensor, inputs|
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
297
|
+
value = inputs[0]
|
298
|
+
buffer = OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
|
299
|
+
buffer.op = build_event_wait_list(inputs)
|
300
|
+
buffer
|
282
301
|
end
|
283
302
|
|
284
303
|
register_op :assign, noop: true do |context, tensor, inputs|
|
@@ -308,86 +327,11 @@ module TensorStream
|
|
308
327
|
end
|
309
328
|
end
|
310
329
|
|
311
|
-
register_op :expand_dims, buffer: true do |_context, tensor, inputs|
|
312
|
-
axis = inputs[1].buffer[0]
|
313
|
-
shape = inputs[0].shape.dup
|
314
|
-
axis = -axis if axis == shape.size
|
315
|
-
new_shape = shape.insert(axis, 1).compact
|
316
|
-
new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
|
317
|
-
convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
|
318
|
-
end
|
319
|
-
|
320
|
-
register_op :fill, buffer: true do |_context, tensor, inputs|
|
321
|
-
shape = inputs[0]
|
322
|
-
value = inputs[1]
|
323
|
-
|
324
|
-
narray_size = shape.buffer.to_a.reduce(:*) || 1
|
325
|
-
cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
|
326
|
-
|
327
|
-
buffer = if cl_buffer
|
328
|
-
cl_buffer.buffer
|
329
|
-
else
|
330
|
-
allocate_narray_for_type(tensor.data_type, narray_size)
|
331
|
-
end
|
332
|
-
|
333
|
-
buffer.fill!(value.buffer[0])
|
334
|
-
convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
|
335
|
-
end
|
336
|
-
|
337
330
|
register_op :where, noop: true do |context, tensor, inputs|
|
338
331
|
pred = tensor.options[:pred]
|
339
332
|
execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
|
340
333
|
end
|
341
334
|
|
342
|
-
register_op :cast do |_context, tensor, inputs|
|
343
|
-
a = inputs[0]
|
344
|
-
if a.data_type != tensor.data_type
|
345
|
-
buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
346
|
-
m, n = a.shape
|
347
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
348
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
349
|
-
work_group = [m || 1, n || 1]
|
350
|
-
event_wait_list = build_event_wait_list(inputs)
|
351
|
-
buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
|
352
|
-
buffer
|
353
|
-
else
|
354
|
-
a
|
355
|
-
end
|
356
|
-
end
|
357
|
-
|
358
|
-
register_op :stack do |_context, tensor, inputs|
|
359
|
-
axis = tensor.options[:axis] || 0
|
360
|
-
shape = inputs[0].shape
|
361
|
-
rank = shape.size + 1
|
362
|
-
elem_size = shape.empty? ? 1 : shape.reduce(:*)
|
363
|
-
|
364
|
-
new_shape = [inputs.size]
|
365
|
-
shape.inject(new_shape) { |ns, s| ns << s }
|
366
|
-
|
367
|
-
divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
|
368
|
-
a << s * a.last
|
369
|
-
end.reverse
|
370
|
-
|
371
|
-
axis = rank + axis if axis < 0
|
372
|
-
rotated_shape = Array.new(axis + 1) { new_shape.shift }
|
373
|
-
new_shape = rotated_shape.rotate! + new_shape
|
374
|
-
|
375
|
-
output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
|
376
|
-
multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
|
377
|
-
a << s * a.last
|
378
|
-
end.reverse
|
379
|
-
|
380
|
-
cl_n = OpenCL::Int1.new(elem_size)
|
381
|
-
work_group = [elem_size]
|
382
|
-
event_wait_list = build_event_wait_list(inputs)
|
383
|
-
ops = inputs.each_with_index.map do |input, index|
|
384
|
-
cl_index = OpenCL::Int1.new(index)
|
385
|
-
_cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
386
|
-
end
|
387
|
-
output_buffer.op = ops
|
388
|
-
output_buffer
|
389
|
-
end
|
390
|
-
|
391
335
|
register_op :check_numerics, noop: true do |context, tensor, inputs|
|
392
336
|
a = complete_eval(inputs[0], context)
|
393
337
|
name = tensor.options[:name]
|
@@ -420,86 +364,18 @@ module TensorStream
|
|
420
364
|
a
|
421
365
|
end
|
422
366
|
|
423
|
-
register_op :rank do |_context, tensor, inputs|
|
424
|
-
wrap_opencl(inputs[0].shape.size, data_type: tensor.data_type, name: tensor.name)
|
425
|
-
end
|
426
|
-
|
427
367
|
register_op :stop_gradient do |_context, _tensor, inputs|
|
428
368
|
inputs[0]
|
429
369
|
end
|
430
370
|
|
431
|
-
register_op :slice, noop: true do |context, tensor, inputs|
|
432
|
-
input_a = complete_eval(inputs[0], context)
|
433
|
-
input_b = read_final_result(complete_eval(inputs[1], context))
|
434
|
-
size = tensor.options[:size]
|
435
|
-
|
436
|
-
slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse
|
437
|
-
|
438
|
-
new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
|
439
|
-
sliced = new_buf.slice[*slice_param]
|
440
|
-
convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
|
441
|
-
end
|
442
|
-
|
443
|
-
register_op :transpose, buffer: true do |_context, tensor, inputs|
|
444
|
-
t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
|
445
|
-
|
446
|
-
if inputs[0].shape.size == 2 && inputs[1].nil?
|
447
|
-
transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
|
448
|
-
res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
|
449
|
-
res
|
450
|
-
else
|
451
|
-
rank = inputs[0].shape.size
|
452
|
-
perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
|
453
|
-
new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
|
454
|
-
output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
|
455
|
-
transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
|
456
|
-
|
457
|
-
write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
|
458
|
-
output_buffer.op = write_op
|
459
|
-
output_buffer
|
460
|
-
end
|
461
|
-
end
|
462
|
-
|
463
|
-
register_op :index, noop: true do |context, tensor, inputs|
|
464
|
-
a = _run(inputs[0], context)
|
465
|
-
index = read_final_result(_run(inputs[1], context))
|
466
|
-
|
467
|
-
if a.is_a?(OutputGroup)
|
468
|
-
a.outputs[index]
|
469
|
-
elsif a.is_a?(Array)
|
470
|
-
a[index]
|
471
|
-
else
|
472
|
-
new_shape = a.shape.dup
|
473
|
-
new_shape.shift
|
474
|
-
input_a = read_final_result(a)
|
475
|
-
convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
|
476
|
-
end
|
477
|
-
end
|
478
|
-
|
479
371
|
register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
|
480
372
|
rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
|
481
373
|
OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: tensor.name), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")], tensor.inputs.map(&:data_type))
|
482
374
|
end
|
483
375
|
|
484
|
-
register_op :
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
register_op :reshape, buffer: true do |_context, tensor, inputs|
|
489
|
-
arr = inputs[0]
|
490
|
-
new_shape = read_final_result(inputs[1])
|
491
|
-
|
492
|
-
shape = if new_shape.size.zero? && arr.buffer.size == 1
|
493
|
-
new_shape
|
494
|
-
else
|
495
|
-
TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
|
496
|
-
end
|
497
|
-
|
498
|
-
convert_to_opencl(arr.buffer, shape, data_type: arr.data_type, name: tensor.name)
|
499
|
-
end
|
500
|
-
|
501
|
-
register_op :flow_group do |context, _tensor, inputs|
|
502
|
-
_opencl_queue.finish
|
376
|
+
register_op :flow_group do |_context, _tensor, inputs|
|
377
|
+
events = build_event_wait_list(inputs)
|
378
|
+
OpenCL.wait_for_events(events) unless events.empty?
|
503
379
|
nil
|
504
380
|
end
|
505
381
|
|
@@ -657,7 +533,10 @@ module TensorStream
|
|
657
533
|
cl_n = OpenCL::Int1.new(n || 1)
|
658
534
|
|
659
535
|
event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
|
660
|
-
output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
|
536
|
+
output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
|
537
|
+
send(:"#{op_name}_#{dtype}", _opencl_queue, work_group,
|
538
|
+
cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer,
|
539
|
+
output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
661
540
|
output_buffer
|
662
541
|
end
|
663
542
|
|
@@ -667,16 +546,17 @@ module TensorStream
|
|
667
546
|
dtype = tensor.data_type
|
668
547
|
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
669
548
|
|
670
|
-
|
671
|
-
work_group = [m || 1, n || 1]
|
672
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
673
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
549
|
+
work_group = [a.total_elements]
|
674
550
|
|
675
|
-
event =
|
551
|
+
event = call_program(op_name, dtype, work_group, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
676
552
|
output_buffer.op = event
|
677
553
|
output_buffer
|
678
554
|
end
|
679
555
|
|
556
|
+
def call_program(name, dtype, work_group, *args)
|
557
|
+
_cl_program(name.to_s, dtype: dtype).send(:"#{name}_#{dtype}", _opencl_queue, work_group, *args)
|
558
|
+
end
|
559
|
+
|
680
560
|
def auto_type_cast(a, b, name: nil)
|
681
561
|
return [a, b] if a.data_type == b.data_type
|
682
562
|
m, n = b.shape
|
@@ -728,16 +608,20 @@ module TensorStream
|
|
728
608
|
@context[:_cache][cache_key]
|
729
609
|
else
|
730
610
|
narray_size = shape.reduce(:*) || 1
|
611
|
+
cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
|
731
612
|
|
732
613
|
buffer = if value.is_a?(NArray)
|
733
614
|
value
|
615
|
+
elsif data_type == :string && shape.empty?
|
616
|
+
cl_buffer_size = value[0].bytesize
|
617
|
+
allocate_narray_for_type(data_type, value[0].bytesize)
|
734
618
|
else
|
735
619
|
allocate_narray_for_type(data_type, narray_size)
|
736
620
|
end
|
737
621
|
|
738
622
|
return nil if buffer.nil?
|
739
623
|
|
740
|
-
|
624
|
+
|
741
625
|
|
742
626
|
cl_buffer = unless value.flatten.empty?
|
743
627
|
cl_buffer_size = 1 if cl_buffer_size.zero?
|
@@ -746,8 +630,11 @@ module TensorStream
|
|
746
630
|
|
747
631
|
@context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
|
748
632
|
end
|
749
|
-
|
750
|
-
|
633
|
+
if data_type == :string
|
634
|
+
value[0].each_byte.with_index do |c, index|
|
635
|
+
cl_object.buffer[index] = c
|
636
|
+
end
|
637
|
+
elsif value.is_a?(Array)
|
751
638
|
value.flatten.each_with_index do |element, index|
|
752
639
|
cl_object.buffer[index] = if element.is_a?(Tensor)
|
753
640
|
read_final_result(complete_eval(element, {}))
|
@@ -765,7 +652,10 @@ module TensorStream
|
|
765
652
|
cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
|
766
653
|
end
|
767
654
|
|
768
|
-
|
655
|
+
if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
|
656
|
+
write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
|
657
|
+
end
|
658
|
+
|
769
659
|
cl_object.op = write_op
|
770
660
|
cl_object
|
771
661
|
end
|
@@ -780,8 +670,12 @@ module TensorStream
|
|
780
670
|
NArray.int(narray_size)
|
781
671
|
when :int16
|
782
672
|
NArray.sint(narray_size)
|
673
|
+
when :uint8
|
674
|
+
NArray.byte(narray_size)
|
783
675
|
when :boolean
|
784
676
|
NArray.byte(narray_size)
|
677
|
+
when :string
|
678
|
+
NArray.byte(narray_size)
|
785
679
|
when :unknown
|
786
680
|
nil
|
787
681
|
else
|
@@ -799,6 +693,65 @@ module TensorStream
|
|
799
693
|
end
|
800
694
|
end
|
801
695
|
|
696
|
+
# automatically use sub buffers
|
697
|
+
def _create_result_sub_buffer(parent_buffer, index, data_type, shape, name)
|
698
|
+
cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
|
699
|
+
@context[:_cache][:_cl_buffers][cache_key] ||= begin
|
700
|
+
size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
|
701
|
+
buffer = allocate_narray_for_type(data_type, size)
|
702
|
+
|
703
|
+
if parent_buffer.cl_buffer.associated_memobject.nil?
|
704
|
+
start = index * buffer.size * buffer.element_size
|
705
|
+
region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
|
706
|
+
cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
|
707
|
+
OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
|
708
|
+
else
|
709
|
+
_create_result_buffer(tensor.data_type, shape, name)
|
710
|
+
end
|
711
|
+
end
|
712
|
+
|
713
|
+
buffer = @context[:_cache][:_cl_buffers][cache_key]
|
714
|
+
|
715
|
+
if buffer.cl_buffer.associated_memobject
|
716
|
+
buffer.op = parent_buffer.op
|
717
|
+
else
|
718
|
+
region_size_in_bytes = buffer.buffer.size * buffer.buffer.element_size
|
719
|
+
start = index * region_size_in_bytes
|
720
|
+
region = [region_size_in_bytes, 1, 1]
|
721
|
+
buffer.op = _opencl_queue.enqueue_copy_buffer_rect(parent_buffer.cl_buffer, buffer.cl_buffer, region, src_origin: [start, 0, 0], event_wait_list: parent_buffer.op)
|
722
|
+
end
|
723
|
+
|
724
|
+
buffer
|
725
|
+
end
|
726
|
+
|
727
|
+
# create sub buffers of different sizes
|
728
|
+
def _create_variable_result_sub_buffer(parent_buffer, index, start, region_size_in_bytes, data_type, shape, name)
|
729
|
+
cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
|
730
|
+
@context[:_cache][:_cl_buffers][cache_key] ||= begin
|
731
|
+
size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
|
732
|
+
buffer = allocate_narray_for_type(data_type, size)
|
733
|
+
|
734
|
+
if parent_buffer.cl_buffer.associated_memobject.nil?
|
735
|
+
region = OpenCL::BufferRegion::new(start, region_size_in_bytes)
|
736
|
+
cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
|
737
|
+
OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: "#{name}/sub")
|
738
|
+
else
|
739
|
+
_create_result_buffer(tensor.data_type, shape, name)
|
740
|
+
end
|
741
|
+
end
|
742
|
+
|
743
|
+
buffer = @context[:_cache][:_cl_buffers][cache_key]
|
744
|
+
|
745
|
+
if buffer.cl_buffer.associated_memobject
|
746
|
+
buffer.op = parent_buffer.op
|
747
|
+
else
|
748
|
+
region = [region_size_in_bytes, 1, 1]
|
749
|
+
buffer.op = _opencl_queue.enqueue_copy_buffer_rect(parent_buffer.cl_buffer, buffer.cl_buffer, region, src_origin: [start, 0, 0], event_wait_list: parent_buffer.op)
|
750
|
+
end
|
751
|
+
|
752
|
+
buffer
|
753
|
+
end
|
754
|
+
|
802
755
|
def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
|
803
756
|
if target_axis == current_axis
|
804
757
|
if a[0].is_a?(Array)
|
@@ -898,7 +851,11 @@ module TensorStream
|
|
898
851
|
end
|
899
852
|
|
900
853
|
def build_event_wait_list(inputs)
|
901
|
-
inputs.
|
854
|
+
if inputs.is_a?(Array)
|
855
|
+
inputs.flatten.compact.map(&:op).compact.uniq
|
856
|
+
else
|
857
|
+
inputs.op ? [inputs.op] : []
|
858
|
+
end
|
902
859
|
end
|
903
860
|
|
904
861
|
def resolve_placeholder(placeholder, _execution_context = {})
|