tensor_stream-opencl 0.1.3 → 0.2.0

Files changed (48)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +11 -4
  3. data/benchmark/benchmark.rb +91 -0
  4. data/benchmark_intel.txt +36 -0
  5. data/lib/tensor_stream/opencl/array_ops.rb +395 -0
  6. data/lib/tensor_stream/opencl/images_ops.rb +62 -0
  7. data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
  8. data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
  9. data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
  10. data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
  11. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
  12. data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
  13. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
  14. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
  15. data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
  16. data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
  17. data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
  18. data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
  19. data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
  20. data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
  21. data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
  22. data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
  23. data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
  24. data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
  25. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
  26. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
  27. data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
  28. data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
  29. data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
  30. data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
  31. data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
  32. data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
  33. data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
  34. data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
  35. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
  36. data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
  37. data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
  38. data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
  39. data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
  40. data/lib/tensor_stream/opencl/version.rb +1 -1
  41. data/samples/iris.data +150 -0
  42. data/samples/iris.rb +110 -0
  43. data/samples/mnist_data.rb +65 -0
  44. data/samples/multigpu.rb +73 -0
  45. data/samples/nearest_neighbor.rb +56 -0
  46. data/samples/rnn.rb +108 -0
  47. data/tensor_stream-opencl.gemspec +4 -1
  48. metadata +62 -3
@@ -0,0 +1,18 @@
+ % ctype = dtype_to_c_type(data_type)
+ % mul_str = mul.each_with_index.collect { |mul, index| "#{mul} * index_map_#{index}" }
+ __kernel void split(const int offset, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalCol = get_global_id(0); // Col ID of C (0..N)
+
+ // compute effective coordinates
+ int ptr = globalCol;
+ <% div.each_with_index do |div, index| %>
+ <% if index == axis %>
+ int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>) + <%= step %>;
+ <% else %>
+ int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>);
+ <% end %>
+ <% if index < div.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
+ C[offset + globalCol] = A[<%= mul_str.join(" + ") %>];
+
+ }
@@ -1,9 +1,8 @@
  % c_dtype = dtype_to_c_type(dtype)
 
- __kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ __kernel void sqrt_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
+ const int id = get_global_id(0);
 
- C[globalRow * N + globalCol] = sqrt(A[globalRow * N + globalCol]);
+ C[id] = sqrt(A[id]);
  }
@@ -1,9 +1,8 @@
  % c_dtype = dtype_to_c_type(dtype)
 
- __kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ __kernel void square_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
+ const int id = get_global_id(0);
 
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
+ C[id] = A[id] * A[id];
  }
@@ -1,8 +1,7 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ __kernel void tan_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
+ const int id = get_global_id(0); // Row ID of C (0..M)
 
- C[globalRow * N + globalCol] = tan(A[globalRow * N + globalCol]);
+ C[id] = tan(A[id]);
  }
@@ -1,8 +1,7 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ __kernel void tanh_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
+ const int id = get_global_id(0); // Row ID of C (0..M)
 
- C[globalRow * N + globalCol] = tanh(A[globalRow * N + globalCol]);
+ C[id] = tanh(A[id]);
  }
@@ -1,7 +1,6 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ __kernel void tanh_grad_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
- C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
+ const int id = get_global_id(0);
+ C[id] = 1 - tanh(A[id]) * tanh(A[id]);
  }
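The element-wise kernels above (abs, sqrt, square, tan, tanh, tanh_grad and friends) all switch from two-dimensional (M, N) indexing to a single flat global id, so the host no longer passes cl_m/cl_n and simply enqueues one work item per element. A minimal Ruby sketch of the equivalent index math, using a hypothetical shape, to show why the tensor's rank no longer matters to these kernels:

    # One work item per element: the kernel body only ever sees a flat index,
    # so the same template works for scalars, vectors, matrices and higher ranks.
    shape = [3, 4]                                  # hypothetical tensor shape
    total_elements = shape.reduce(:*) || 1          # 12 work items
    a = Array.new(total_elements) { |i| i * 0.5 }
    c = Array.new(total_elements)
    total_elements.times { |id| c[id] = Math.sqrt(a[id]) }  # what sqrt.cl now does per id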
@@ -0,0 +1,23 @@
+ % ctype = dtype_to_c_type(data_type)
+
+ __kernel void unpack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
+ // Get the index of the current element to be processed
+ const int globalCol = get_global_id(0); // Col ID of C (0..N)
+
+ int start = index * <%= divisors[0] %>;
+ int ptr = start + globalCol;
+ int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
+
+ // compute effective coordinates
+ <% divisors.each_with_index do |div, index| %>
+ index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
+
+ // Apply axis translation if needed
+ <% if axis > 0 %>
+ int last = index_map[<%= axis %>];
+ <% axis.downto(1) do |i| %> index_map[<%= i %>] = index_map[<%= (i - 1) %>];<% end %>
+ index_map[0] = last;
+ <% end%>
+
+ C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
+ }
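Both the split and unpack kernels decompose a flat element index into per-axis coordinates using precomputed divisors (row-major strides), optionally rotate the coordinates for the requested axis, and recombine them with multipliers. A hedged sketch of how the host side can derive those values, using the same inject pattern that appears elsewhere in this diff; the shape is illustrative only:

    shape = [2, 3, 4]   # hypothetical row-major shape

    # divisors: elements spanned by one step along each axis => [12, 4, 1]
    divisors = shape.drop(1).reverse.inject([1]) { |acc, s| acc << s * acc.last }.reverse
    multipliers = divisors   # the same strides recombine coordinates into a flat index

    # decompose flat index 17 into coordinates [1, 1, 1] ...
    ptr = 17
    coords = divisors.map { |div| c = ptr / div; ptr %= div; c }

    # ... and recombine: 1*12 + 1*4 + 1*1 == 17
    flat = coords.zip(multipliers).map { |c, m| c * m }.sum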
@@ -14,14 +14,15 @@ module TensorStream
  assign.buffer.dirty = true # force buffer copy when variable is read externally
  output_buffer = assign.buffer
 
- m, n = output_buffer.shape
- work_group = [m || 1, n || 1]
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
+ work_group = [output_buffer.total_elements]
 
  event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
- method_call = :"apply_gradient_#{output_buffer.data_type}"
- event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+
+ event = call_program("apply_gradient", output_buffer.data_type,
+ work_group,
+ delta.cl_buffer,
+ learning_rate.cl_buffer,
+ output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer.op = event
  output_buffer
  end
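The optimizer ops in nn_ops.rb all follow the same reworked pattern: a one-dimensional work group sized by total_elements, and dispatch through the new call_program helper (defined later in this diff, in opencl_evaluator.rb) instead of building a :"kernel_dtype" symbol and send-ing it by hand at every call site. Roughly, with hypothetical buffers already resolved by the evaluator:

    # Sketch only: output_buffer, delta and learning_rate stand in for OpenCLBuffer instances.
    work_group = [output_buffer.total_elements]   # one work item per element, any rank
    event_wait_list = build_event_wait_list([output_buffer, delta, learning_rate])

    # call_program("apply_gradient", dtype, ...) expands to
    #   _cl_program("apply_gradient", dtype: dtype).apply_gradient_<dtype>(_opencl_queue, ...)
    event = call_program("apply_gradient", output_buffer.data_type, work_group,
                         delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer,
                         event_wait_list: event_wait_list)
    output_buffer.op = event   # downstream ops chain on this event rather than a full queue finish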
@@ -37,15 +38,12 @@ module TensorStream
 
  output_buffer = assign.buffer
 
- m, n = output_buffer.shape
- work_group = [m || 1, n || 1]
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
+ work_group = [output_buffer.total_elements]
 
  event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
  method_call = :"apply_momentum_#{output_buffer.data_type}"
  event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
- send(method_call, _opencl_queue, work_group, cl_m, cl_n, grad.cl_buffer,
+ send(method_call, _opencl_queue, work_group, grad.cl_buffer,
  learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
  assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer.op = event
@@ -66,15 +64,11 @@ module TensorStream
 
  output_buffer = assign.buffer
 
- m, n = output_buffer.shape
- work_group = [m || 1, n || 1]
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
+ work_group = [output_buffer.total_elements]
 
  event_wait_list = build_event_wait_list(inputs)
- method_call = :"apply_adadelta_#{output_buffer.data_type}"
- event = _cl_program('apply_adadelta', dtype: output_buffer.data_type)
- .send(method_call, _opencl_queue, work_group, cl_m, cl_n,
+ event = call_program('apply_adadelta', output_buffer.data_type,
+ work_group,
  lr.cl_buffer,
  rho.cl_buffer,
  epsilon.cl_buffer,
@@ -104,15 +98,11 @@ module TensorStream
 
  output_buffer = assign.buffer
 
- m, n = output_buffer.shape
- work_group = [m || 1, n || 1]
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
+ work_group = [output_buffer.total_elements]
 
  event_wait_list = build_event_wait_list(inputs)
- method_call = :"apply_adam_#{output_buffer.data_type}"
- event = _cl_program("apply_adam", dtype: output_buffer.data_type)
- .send(method_call, _opencl_queue, work_group, cl_m, cl_n,
+ event = call_program("apply_adam", output_buffer.data_type,
+ work_group,
  grad.cl_buffer,
  lr_t.cl_buffer,
  beta1_power.cl_buffer,
@@ -130,6 +120,99 @@ module TensorStream
  output_buffer
  end
 
+ register_op :apply_adagrad do |context, tensor, inputs|
+ target_var, accum, lr, grad = inputs
+
+ assign = tensor.inputs[0] || tensor
+ assign_acc = tensor.inputs[1]
+
+ assign.buffer.dirty = true
+ assign_acc.buffer.dirty = true
+ output_buffer = assign.buffer
+
+ work_group = [output_buffer.total_elements]
+
+ event_wait_list = build_event_wait_list(inputs)
+ event = call_program('apply_adagrad',
+ output_buffer.data_type,
+ work_group,
+ lr.cl_buffer,
+ grad.cl_buffer,
+ assign.buffer.cl_buffer,
+ assign_acc.buffer.cl_buffer,
+ event_wait_list: event_wait_list)
+ output_buffer.op = event
+ assign_acc.buffer.op = event
+ output_buffer
+ end
+
+ register_op :apply_centered_rms_prop do |context, tensor, inputs|
+ var, mg, ms, mom, lr, rho, momentum, epsilon, grad = inputs
+
+ assign = tensor.inputs[0]
+ assign_mg = tensor.inputs[1]
+ assign_ms = tensor.inputs[2]
+ assign_mom = tensor.inputs[3]
+
+ assign.buffer.dirty = true
+ assign_mg.buffer.dirty = true
+ assign_ms.buffer.dirty = true
+ assign_mom.buffer.dirty = true
+ output_buffer = assign.buffer
+ event_wait_list = build_event_wait_list(inputs)
+ work_group = [output_buffer.total_elements]
+
+ event = call_program('apply_centered_rms_prop', output_buffer.data_type, work_group,
+ lr.cl_buffer,
+ rho.cl_buffer,
+ momentum.cl_buffer,
+ epsilon.cl_buffer,
+ grad.cl_buffer,
+ assign.buffer.cl_buffer,
+ assign_ms.buffer.cl_buffer,
+ assign_mg.buffer.cl_buffer,
+ assign_mom.buffer.cl_buffer,
+ event_wait_list: event_wait_list)
+
+ output_buffer.op = event
+ assign_mg.buffer.op = event
+ assign_ms.buffer.op = event
+ assign_mom.buffer.op = event
+ output_buffer
+ end
+
+ register_op :apply_rms_prop do |context, tensor, inputs|
+ var, ms, mom, lr, rho, momentum, epsilon, grad = inputs
+
+ assign = tensor.inputs[0]
+ assign_ms = tensor.inputs[1]
+ assign_mom = tensor.inputs[2]
+
+ assign.buffer.dirty = true
+ assign_ms.buffer.dirty = true
+ assign_mom.buffer.dirty = true
+ output_buffer = assign.buffer
+ event_wait_list = build_event_wait_list(inputs)
+ work_group = [output_buffer.total_elements]
+
+ event = call_program('apply_rms_prop', output_buffer.data_type,
+ work_group,
+ lr.cl_buffer,
+ rho.cl_buffer,
+ momentum.cl_buffer,
+ epsilon.cl_buffer,
+ grad.cl_buffer,
+ assign.buffer.cl_buffer,
+ assign_ms.buffer.cl_buffer,
+ assign_mom.buffer.cl_buffer,
+ event_wait_list: event_wait_list)
+
+ output_buffer.op = event
+ assign_ms.buffer.op = event
+ assign_mom.buffer.op = event
+ output_buffer
+ end
+
  register_op :softmax do |_context, tensor, inputs|
  a = inputs[0]
  event_wait_list = build_event_wait_list(inputs)
@@ -213,7 +296,9 @@ module TensorStream
  work_group = [m]
  n = m if n.nil?
  cl_n = OpenCL::Int1.new(n || 1)
- event = _cl_program('softmax_grad', dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ event = _cl_program('softmax_grad', dtype: dtype, size: n).
+ send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer,
+ grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer.op = event
  output_buffer
  end
@@ -14,6 +14,14 @@ module TensorStream
  @op = op
  end
 
+ def total_elements
+ shape.reduce(:*) || 1
+ end
+
+ def empty_value?
+ @shape == [0]
+ end
+
  def to_ruby
  return [] if buffer.empty?
 
@@ -24,6 +32,7 @@ module TensorStream
  end
 
  if shape.empty?
+ return buffer.to_s if data_type == :string
  return buffer[0] != 0 if data_type == :boolean
  return buffer[0]
  end
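OpenCLBuffer#total_elements collapses a buffer's shape into a single element count, which is what the reworked kernels and optimizer ops use as their flat work group size; empty_value? flags the [0]-shaped placeholder buffers. For illustration, mirroring shape.reduce(:*) || 1 in plain Ruby:

    [].reduce(:*)        # => nil, so a scalar reports 1 element via the || 1 fallback
    [3, 2].reduce(:*)    # => 6   -> work_group = [6] for a 3x2 tensor
    [2, 3, 4].reduce(:*) # => 24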
@@ -11,6 +11,8 @@ require 'narray_ffi'
  require 'tensor_stream/evaluator/base_evaluator'
  require 'tensor_stream/opencl/math_ops'
  require 'tensor_stream/opencl/nn_ops'
+ require 'tensor_stream/opencl/images_ops'
+ require 'tensor_stream/opencl/array_ops'
  require 'tensor_stream/helpers/op_helper'
 
  module TensorStream
@@ -32,7 +34,8 @@ module TensorStream
  end
  end
 
- ## PURE ruby evaluator used for testing and development
+ ##
+ # PURE ruby evaluator used for testing and development
  class OpenclEvaluator < BaseEvaluator
  attr_accessor :retain
  attr_reader :opencl_device
@@ -42,6 +45,8 @@ module TensorStream
  include TensorStream::MathHelper
  include TensorStream::OpenCLHelpers::MathOps
  include TensorStream::OpenCLHelpers::NNOps
+ include TensorStream::OpenCLHelpers::ImagesOps
+ include TensorStream::OpenCLHelpers::ArrayOps
 
  def initialize(session, device, thread_pool: nil, log_intermediates: false)
  super
@@ -86,7 +91,10 @@ module TensorStream
 
  # opencl evaluator main entrypoint
  def run(tensor, execution_context)
- read_final_result(complete_eval(tensor, execution_context))
+ result = complete_eval(tensor, execution_context)
+ # puts "wait finish"
+ _opencl_queue.finish
+ read_final_result(result)
  end
 
  def run_with_buffer(tensor, context, execution_context)
@@ -117,9 +125,9 @@ module TensorStream
  def enqueue_buffer_read(tensor, context)
  buffer = _run(tensor, context)
  if buffer.is_a?(Array)
- buffer = buffer.collect do |b|
+ buffer.collect do |b|
  next b if b.buffer.size.zero?
- _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
+ b.op = _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
  b
  end
  else
@@ -127,14 +135,18 @@ module TensorStream
  return buffer if buffer.nil?
  return [] if buffer.buffer.nil?
  return buffer if buffer.buffer.size.zero?
- _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
+ buffer.op = _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
  buffer
  end
  end
 
  def complete_eval(tensor, context)
+ return nil if tensor.nil?
+
  buffer = enqueue_buffer_read(tensor, context)
- _opencl_queue.finish
+ events = build_event_wait_list([buffer])
+ # puts "wait #{tensor.name}"
+ OpenCL.wait_for_events(events) unless events.empty?
  buffer
  end
 
@@ -162,6 +174,7 @@ module TensorStream
 
  def prepare_input(tensor, context, options = {})
  return nil unless tensor
+
  tensor = resolve_placeholder(tensor)
  if options[:noop]
  tensor
@@ -210,11 +223,17 @@ module TensorStream
  def _cl_program(kernel, args = {})
  suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
  @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
- filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
- raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
- source = File.read(filename)
- source = OpenclTemplateHelper.new(source).generate(args)
- # File.write("/tmp/#{kernel}.#{suffix}.cl", source)
+ file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
+ source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
+ File.read(file_path)
+ else
+ filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+ raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
+ source = File.read(filename)
+ source = OpenclTemplateHelper.new(source).generate(args)
+ File.write(file_path, source) if ENV['TS_OPENCL_FILE_CACHE']
+ source
+ end
  program = _opencl_context.create_program_with_source(source)
  program.build
  rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
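Generated kernel sources can now be cached under /tmp and reused across runs when the TS_OPENCL_FILE_CACHE environment variable is set; note the check is only for the variable's presence, not a particular value. A hedged usage sketch:

    # Enable the on-disk kernel source cache before the evaluator compiles anything.
    # Generated sources are written to /tmp/<kernel>.<args>.cl and read back on later
    # runs, skipping the ERB template rendering step.
    ENV['TS_OPENCL_FILE_CACHE'] = '1'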
@@ -275,10 +294,10 @@ module TensorStream
  end
 
  register_op :identity do |context, tensor, inputs|
- if tensor.inputs.size > 1
- tensor.inputs[1..inputs.size].each { |input| complete_eval(input, context) }
- end
- inputs[0]
+ value = inputs[0]
+ buffer = OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
+ buffer.op = build_event_wait_list(inputs)
+ buffer
  end
 
  register_op :assign, noop: true do |context, tensor, inputs|
@@ -308,86 +327,11 @@ module TensorStream
  end
  end
 
- register_op :expand_dims, buffer: true do |_context, tensor, inputs|
- axis = inputs[1].buffer[0]
- shape = inputs[0].shape.dup
- axis = -axis if axis == shape.size
- new_shape = shape.insert(axis, 1).compact
- new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
- convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
- end
-
- register_op :fill, buffer: true do |_context, tensor, inputs|
- shape = inputs[0]
- value = inputs[1]
-
- narray_size = shape.buffer.to_a.reduce(:*) || 1
- cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
-
- buffer = if cl_buffer
- cl_buffer.buffer
- else
- allocate_narray_for_type(tensor.data_type, narray_size)
- end
-
- buffer.fill!(value.buffer[0])
- convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
- end
-
  register_op :where, noop: true do |context, tensor, inputs|
  pred = tensor.options[:pred]
  execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
  end
 
- register_op :cast do |_context, tensor, inputs|
- a = inputs[0]
- if a.data_type != tensor.data_type
- buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
- m, n = a.shape
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
- work_group = [m || 1, n || 1]
- event_wait_list = build_event_wait_list(inputs)
- buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
- buffer
- else
- a
- end
- end
-
- register_op :stack do |_context, tensor, inputs|
- axis = tensor.options[:axis] || 0
- shape = inputs[0].shape
- rank = shape.size + 1
- elem_size = shape.empty? ? 1 : shape.reduce(:*)
-
- new_shape = [inputs.size]
- shape.inject(new_shape) { |ns, s| ns << s }
-
- divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
- a << s * a.last
- end.reverse
-
- axis = rank + axis if axis < 0
- rotated_shape = Array.new(axis + 1) { new_shape.shift }
- new_shape = rotated_shape.rotate! + new_shape
-
- output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
- multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
- a << s * a.last
- end.reverse
-
- cl_n = OpenCL::Int1.new(elem_size)
- work_group = [elem_size]
- event_wait_list = build_event_wait_list(inputs)
- ops = inputs.each_with_index.map do |input, index|
- cl_index = OpenCL::Int1.new(index)
- _cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
- end
- output_buffer.op = ops
- output_buffer
- end
-
  register_op :check_numerics, noop: true do |context, tensor, inputs|
  a = complete_eval(inputs[0], context)
  name = tensor.options[:name]
@@ -420,86 +364,18 @@ module TensorStream
  a
  end
 
- register_op :rank do |_context, tensor, inputs|
- wrap_opencl(inputs[0].shape.size, data_type: tensor.data_type, name: tensor.name)
- end
-
  register_op :stop_gradient do |_context, _tensor, inputs|
  inputs[0]
  end
 
- register_op :slice, noop: true do |context, tensor, inputs|
- input_a = complete_eval(inputs[0], context)
- input_b = read_final_result(complete_eval(inputs[1], context))
- size = tensor.options[:size]
-
- slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse
-
- new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
- sliced = new_buf.slice[*slice_param]
- convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
- end
-
- register_op :transpose, buffer: true do |_context, tensor, inputs|
- t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
-
- if inputs[0].shape.size == 2 && inputs[1].nil?
- transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
- res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
- res
- else
- rank = inputs[0].shape.size
- perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
- new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
- output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
- transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
-
- write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
- output_buffer.op = write_op
- output_buffer
- end
- end
-
- register_op :index, noop: true do |context, tensor, inputs|
- a = _run(inputs[0], context)
- index = read_final_result(_run(inputs[1], context))
-
- if a.is_a?(OutputGroup)
- a.outputs[index]
- elsif a.is_a?(Array)
- a[index]
- else
- new_shape = a.shape.dup
- new_shape.shift
- input_a = read_final_result(a)
- convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
- end
- end
-
  register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
  rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
  OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: tensor.name), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")], tensor.inputs.map(&:data_type))
  end
 
- register_op :shape do |_context, tensor, inputs|
- wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.data_type)
- end
-
- register_op :reshape, buffer: true do |_context, tensor, inputs|
- arr = inputs[0]
- new_shape = read_final_result(inputs[1])
-
- shape = if new_shape.size.zero? && arr.buffer.size == 1
- new_shape
- else
- TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
- end
-
- convert_to_opencl(arr.buffer, shape, data_type: arr.data_type, name: tensor.name)
- end
-
- register_op :flow_group do |context, _tensor, inputs|
- _opencl_queue.finish
+ register_op :flow_group do |_context, _tensor, inputs|
+ events = build_event_wait_list(inputs)
+ OpenCL.wait_for_events(events) unless events.empty?
  nil
  end
 
@@ -657,7 +533,10 @@ module TensorStream
  cl_n = OpenCL::Int1.new(n || 1)
 
  event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
- output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
+ send(:"#{op_name}_#{dtype}", _opencl_queue, work_group,
+ cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer,
+ output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer
  end
 
@@ -667,16 +546,17 @@ module TensorStream
  dtype = tensor.data_type
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
- m, n = a.shape
- work_group = [m || 1, n || 1]
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
+ work_group = [a.total_elements]
 
- event = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ event = call_program(op_name, dtype, work_group, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer.op = event
  output_buffer
  end
 
+ def call_program(name, dtype, work_group, *args)
+ _cl_program(name.to_s, dtype: dtype).send(:"#{name}_#{dtype}", _opencl_queue, work_group, *args)
+ end
+
  def auto_type_cast(a, b, name: nil)
  return [a, b] if a.data_type == b.data_type
  m, n = b.shape
@@ -728,16 +608,20 @@ module TensorStream
  @context[:_cache][cache_key]
  else
  narray_size = shape.reduce(:*) || 1
+ cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
 
  buffer = if value.is_a?(NArray)
  value
+ elsif data_type == :string && shape.empty?
+ cl_buffer_size = value[0].bytesize
+ allocate_narray_for_type(data_type, value[0].bytesize)
  else
  allocate_narray_for_type(data_type, narray_size)
  end
 
  return nil if buffer.nil?
 
- cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
+
 
  cl_buffer = unless value.flatten.empty?
  cl_buffer_size = 1 if cl_buffer_size.zero?
@@ -746,8 +630,11 @@ module TensorStream
  @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
  end
 
-
- if value.is_a?(Array)
+ if data_type == :string
+ value[0].each_byte.with_index do |c, index|
+ cl_object.buffer[index] = c
+ end
+ elsif value.is_a?(Array)
  value.flatten.each_with_index do |element, index|
  cl_object.buffer[index] = if element.is_a?(Tensor)
  read_final_result(complete_eval(element, {}))
@@ -765,7 +652,10 @@ module TensorStream
  cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
  end
 
- write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer) if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
+ if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
+ write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
+ end
+
  cl_object.op = write_op
  cl_object
  end
@@ -780,8 +670,12 @@ module TensorStream
  NArray.int(narray_size)
  when :int16
  NArray.sint(narray_size)
+ when :uint8
+ NArray.byte(narray_size)
  when :boolean
  NArray.byte(narray_size)
+ when :string
+ NArray.byte(narray_size)
  when :unknown
  nil
  else
@@ -799,6 +693,65 @@ module TensorStream
  end
  end
 
+ # automatically use sub buffers
+ def _create_result_sub_buffer(parent_buffer, index, data_type, shape, name)
+ cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
+ @context[:_cache][:_cl_buffers][cache_key] ||= begin
+ size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
+ buffer = allocate_narray_for_type(data_type, size)
+
+ if parent_buffer.cl_buffer.associated_memobject.nil?
+ start = index * buffer.size * buffer.element_size
+ region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
+ cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
+ OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
+ else
+ _create_result_buffer(tensor.data_type, shape, name)
+ end
+ end
+
+ buffer = @context[:_cache][:_cl_buffers][cache_key]
+
+ if buffer.cl_buffer.associated_memobject
+ buffer.op = parent_buffer.op
+ else
+ region_size_in_bytes = buffer.buffer.size * buffer.buffer.element_size
+ start = index * region_size_in_bytes
+ region = [region_size_in_bytes, 1, 1]
+ buffer.op = _opencl_queue.enqueue_copy_buffer_rect(parent_buffer.cl_buffer, buffer.cl_buffer, region, src_origin: [start, 0, 0], event_wait_list: parent_buffer.op)
+ end
+
+ buffer
+ end
+
+ # create sub buffers of different sizes
+ def _create_variable_result_sub_buffer(parent_buffer, index, start, region_size_in_bytes, data_type, shape, name)
+ cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
+ @context[:_cache][:_cl_buffers][cache_key] ||= begin
+ size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
+ buffer = allocate_narray_for_type(data_type, size)
+
+ if parent_buffer.cl_buffer.associated_memobject.nil?
+ region = OpenCL::BufferRegion::new(start, region_size_in_bytes)
+ cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
+ OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: "#{name}/sub")
+ else
+ _create_result_buffer(tensor.data_type, shape, name)
+ end
+ end
+
+ buffer = @context[:_cache][:_cl_buffers][cache_key]
+
+ if buffer.cl_buffer.associated_memobject
+ buffer.op = parent_buffer.op
+ else
+ region = [region_size_in_bytes, 1, 1]
+ buffer.op = _opencl_queue.enqueue_copy_buffer_rect(parent_buffer.cl_buffer, buffer.cl_buffer, region, src_origin: [start, 0, 0], event_wait_list: parent_buffer.op)
+ end
+
+ buffer
+ end
+
  def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
  if target_axis == current_axis
  if a[0].is_a?(Array)
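The two new sub-buffer helpers let ops such as split and unstack hand out views into a parent cl_buffer: when the parent is a top-level buffer, a zero-copy OpenCL sub-buffer is created over a byte region; when the parent is itself already a sub-buffer (sub-buffers cannot be nested), the code falls back to a fresh buffer filled via enqueue_copy_buffer_rect. A rough sketch of the region arithmetic for the fixed-size case, with illustrative sizes only:

    # Splitting a float32 buffer of 24 elements into 3 outputs of 8 elements each.
    element_size = 4                                        # bytes per float32
    elements_per_output = 8
    region_size = elements_per_output * element_size        # 32 bytes per sub-buffer
    offsets = 3.times.map { |index| index * region_size }   # => [0, 32, 64]
    # Each (offset, region_size) pair becomes an OpenCL::BufferRegion over the parent buffer.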
@@ -898,7 +851,11 @@ module TensorStream
  end
 
  def build_event_wait_list(inputs)
- inputs.compact.map(&:op).flatten
+ if inputs.is_a?(Array)
+ inputs.flatten.compact.map(&:op).compact.uniq
+ else
+ inputs.op ? [inputs.op] : []
+ end
  end
 
  def resolve_placeholder(placeholder, _execution_context = {})