tensor_stream-opencl 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +11 -4
  3. data/benchmark/benchmark.rb +91 -0
  4. data/benchmark_intel.txt +36 -0
  5. data/lib/tensor_stream/opencl/array_ops.rb +395 -0
  6. data/lib/tensor_stream/opencl/images_ops.rb +62 -0
  7. data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
  8. data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
  9. data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
  10. data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
  11. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
  12. data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
  13. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
  14. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
  15. data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
  16. data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
  17. data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
  18. data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
  19. data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
  20. data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
  21. data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
  22. data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
  23. data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
  24. data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
  25. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
  26. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
  27. data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
  28. data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
  29. data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
  30. data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
  31. data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
  32. data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
  33. data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
  34. data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
  35. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
  36. data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
  37. data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
  38. data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
  39. data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
  40. data/lib/tensor_stream/opencl/version.rb +1 -1
  41. data/samples/iris.data +150 -0
  42. data/samples/iris.rb +110 -0
  43. data/samples/mnist_data.rb +65 -0
  44. data/samples/multigpu.rb +73 -0
  45. data/samples/nearest_neighbor.rb +56 -0
  46. data/samples/rnn.rb +108 -0
  47. data/tensor_stream-opencl.gemspec +4 -1
  48. metadata +62 -3
@@ -0,0 +1,18 @@
1
+ % ctype = dtype_to_c_type(data_type)
2
+ % mul_str = mul.each_with_index.collect { |mul, index| "#{mul} * index_map_#{index}" }
3
+ __kernel void split(const int offset, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
4
+ // Get the index of the current element to be processed
5
+ const int globalCol = get_global_id(0); // Col ID of C (0..N)
6
+
7
+ // compute effective coordinates
8
+ int ptr = globalCol;
9
+ <% div.each_with_index do |div, index| %>
10
+ <% if index == axis %>
11
+ int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>) + <%= step %>;
12
+ <% else %>
13
+ int index_map_<%= index %> = (int)floor(ptr / (float)<%= div %>);
14
+ <% end %>
15
+ <% if index < div.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
16
+ C[offset + globalCol] = A[<%= mul_str.join(" + ") %>];
17
+
18
+ }
@@ -1,9 +1,8 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void sqrt_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0);
7
6
 
8
- C[globalRow * N + globalCol] = sqrt(A[globalRow * N + globalCol]);
7
+ C[id] = sqrt(A[id]);
9
8
  }
@@ -1,9 +1,8 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
 
3
- __kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
+ __kernel void square_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
5
+ const int id = get_global_id(0);
7
6
 
8
- C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
7
+ C[id] = A[id] * A[id];
9
8
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void tan_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0); // Row ID of C (0..M)
6
5
 
7
- C[globalRow * N + globalCol] = tan(A[globalRow * N + globalCol]);
6
+ C[id] = tan(A[id]);
8
7
  }
@@ -1,8 +1,7 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void tanh_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
4
+ const int id = get_global_id(0); // Row ID of C (0..M)
6
5
 
7
- C[globalRow * N + globalCol] = tanh(A[globalRow * N + globalCol]);
6
+ C[id] = tanh(A[id]);
8
7
  }
@@ -1,7 +1,6 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
2
+ __kernel void tanh_grad_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
- C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
4
+ const int id = get_global_id(0);
5
+ C[id] = 1 - tanh(A[id]) * tanh(A[id]);
7
6
  }
@@ -0,0 +1,23 @@
1
+ % ctype = dtype_to_c_type(data_type)
2
+
3
+ __kernel void unpack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
4
+ // Get the index of the current element to be processed
5
+ const int globalCol = get_global_id(0); // Col ID of C (0..N)
6
+
7
+ int start = index * <%= divisors[0] %>;
8
+ int ptr = start + globalCol;
9
+ int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
10
+
11
+ // compute effective coordinates
12
+ <% divisors.each_with_index do |div, index| %>
13
+ index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
14
+
15
+ // Apply axis translation if needed
16
+ <% if axis > 0 %>
17
+ int last = index_map[<%= axis %>];
18
+ <% axis.downto(1) do |i| %> index_map[<%= i %>] = index_map[<%= (i - 1) %>];<% end %>
19
+ index_map[0] = last;
20
+ <% end%>
21
+
22
+ C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
23
+ }
@@ -14,14 +14,15 @@ module TensorStream
14
14
  assign.buffer.dirty = true # force buffer copy when variable is read externally
15
15
  output_buffer = assign.buffer
16
16
 
17
- m, n = output_buffer.shape
18
- work_group = [m || 1, n || 1]
19
- cl_m = OpenCL::Int1.new(m || 1)
20
- cl_n = OpenCL::Int1.new(n || 1)
17
+ work_group = [output_buffer.total_elements]
21
18
 
22
19
  event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
23
- method_call = :"apply_gradient_#{output_buffer.data_type}"
24
- event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
20
+
21
+ event = call_program("apply_gradient", output_buffer.data_type,
22
+ work_group,
23
+ delta.cl_buffer,
24
+ learning_rate.cl_buffer,
25
+ output_buffer.cl_buffer, event_wait_list: event_wait_list)
25
26
  output_buffer.op = event
26
27
  output_buffer
27
28
  end
@@ -37,15 +38,12 @@ module TensorStream
37
38
 
38
39
  output_buffer = assign.buffer
39
40
 
40
- m, n = output_buffer.shape
41
- work_group = [m || 1, n || 1]
42
- cl_m = OpenCL::Int1.new(m || 1)
43
- cl_n = OpenCL::Int1.new(n || 1)
41
+ work_group = [output_buffer.total_elements]
44
42
 
45
43
  event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
46
44
  method_call = :"apply_momentum_#{output_buffer.data_type}"
47
45
  event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
48
- send(method_call, _opencl_queue, work_group, cl_m, cl_n, grad.cl_buffer,
46
+ send(method_call, _opencl_queue, work_group, grad.cl_buffer,
49
47
  learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
50
48
  assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
51
49
  output_buffer.op = event
@@ -66,15 +64,11 @@ module TensorStream
66
64
 
67
65
  output_buffer = assign.buffer
68
66
 
69
- m, n = output_buffer.shape
70
- work_group = [m || 1, n || 1]
71
- cl_m = OpenCL::Int1.new(m || 1)
72
- cl_n = OpenCL::Int1.new(n || 1)
67
+ work_group = [output_buffer.total_elements]
73
68
 
74
69
  event_wait_list = build_event_wait_list(inputs)
75
- method_call = :"apply_adadelta_#{output_buffer.data_type}"
76
- event = _cl_program('apply_adadelta', dtype: output_buffer.data_type)
77
- .send(method_call, _opencl_queue, work_group, cl_m, cl_n,
70
+ event = call_program('apply_adadelta', output_buffer.data_type,
71
+ work_group,
78
72
  lr.cl_buffer,
79
73
  rho.cl_buffer,
80
74
  epsilon.cl_buffer,
@@ -104,15 +98,11 @@ module TensorStream
104
98
 
105
99
  output_buffer = assign.buffer
106
100
 
107
- m, n = output_buffer.shape
108
- work_group = [m || 1, n || 1]
109
- cl_m = OpenCL::Int1.new(m || 1)
110
- cl_n = OpenCL::Int1.new(n || 1)
101
+ work_group = [output_buffer.total_elements]
111
102
 
112
103
  event_wait_list = build_event_wait_list(inputs)
113
- method_call = :"apply_adam_#{output_buffer.data_type}"
114
- event = _cl_program("apply_adam", dtype: output_buffer.data_type)
115
- .send(method_call, _opencl_queue, work_group, cl_m, cl_n,
104
+ event = call_program("apply_adam", output_buffer.data_type,
105
+ work_group,
116
106
  grad.cl_buffer,
117
107
  lr_t.cl_buffer,
118
108
  beta1_power.cl_buffer,
@@ -130,6 +120,99 @@ module TensorStream
130
120
  output_buffer
131
121
  end
132
122
 
123
+ register_op :apply_adagrad do |context, tensor, inputs|
124
+ target_var, accum, lr, grad = inputs
125
+
126
+ assign = tensor.inputs[0] || tensor
127
+ assign_acc = tensor.inputs[1]
128
+
129
+ assign.buffer.dirty = true
130
+ assign_acc.buffer.dirty = true
131
+ output_buffer = assign.buffer
132
+
133
+ work_group = [output_buffer.total_elements]
134
+
135
+ event_wait_list = build_event_wait_list(inputs)
136
+ event = call_program('apply_adagrad',
137
+ output_buffer.data_type,
138
+ work_group,
139
+ lr.cl_buffer,
140
+ grad.cl_buffer,
141
+ assign.buffer.cl_buffer,
142
+ assign_acc.buffer.cl_buffer,
143
+ event_wait_list: event_wait_list)
144
+ output_buffer.op = event
145
+ assign_acc.buffer.op = event
146
+ output_buffer
147
+ end
148
+
149
+ register_op :apply_centered_rms_prop do |context, tensor, inputs|
150
+ var, mg, ms, mom, lr, rho, momentum, epsilon, grad = inputs
151
+
152
+ assign = tensor.inputs[0]
153
+ assign_mg = tensor.inputs[1]
154
+ assign_ms = tensor.inputs[2]
155
+ assign_mom = tensor.inputs[3]
156
+
157
+ assign.buffer.dirty = true
158
+ assign_mg.buffer.dirty = true
159
+ assign_ms.buffer.dirty = true
160
+ assign_mom.buffer.dirty = true
161
+ output_buffer = assign.buffer
162
+ event_wait_list = build_event_wait_list(inputs)
163
+ work_group = [output_buffer.total_elements]
164
+
165
+ event = call_program('apply_centered_rms_prop', output_buffer.data_type, work_group,
166
+ lr.cl_buffer,
167
+ rho.cl_buffer,
168
+ momentum.cl_buffer,
169
+ epsilon.cl_buffer,
170
+ grad.cl_buffer,
171
+ assign.buffer.cl_buffer,
172
+ assign_ms.buffer.cl_buffer,
173
+ assign_mg.buffer.cl_buffer,
174
+ assign_mom.buffer.cl_buffer,
175
+ event_wait_list: event_wait_list)
176
+
177
+ output_buffer.op = event
178
+ assign_mg.buffer.op = event
179
+ assign_ms.buffer.op = event
180
+ assign_mom.buffer.op = event
181
+ output_buffer
182
+ end
183
+
184
+ register_op :apply_rms_prop do |context, tensor, inputs|
185
+ var, ms, mom, lr, rho, momentum, epsilon, grad = inputs
186
+
187
+ assign = tensor.inputs[0]
188
+ assign_ms = tensor.inputs[1]
189
+ assign_mom = tensor.inputs[2]
190
+
191
+ assign.buffer.dirty = true
192
+ assign_ms.buffer.dirty = true
193
+ assign_mom.buffer.dirty = true
194
+ output_buffer = assign.buffer
195
+ event_wait_list = build_event_wait_list(inputs)
196
+ work_group = [output_buffer.total_elements]
197
+
198
+ event = call_program('apply_rms_prop', output_buffer.data_type,
199
+ work_group,
200
+ lr.cl_buffer,
201
+ rho.cl_buffer,
202
+ momentum.cl_buffer,
203
+ epsilon.cl_buffer,
204
+ grad.cl_buffer,
205
+ assign.buffer.cl_buffer,
206
+ assign_ms.buffer.cl_buffer,
207
+ assign_mom.buffer.cl_buffer,
208
+ event_wait_list: event_wait_list)
209
+
210
+ output_buffer.op = event
211
+ assign_ms.buffer.op = event
212
+ assign_mom.buffer.op = event
213
+ output_buffer
214
+ end
215
+
133
216
  register_op :softmax do |_context, tensor, inputs|
134
217
  a = inputs[0]
135
218
  event_wait_list = build_event_wait_list(inputs)
@@ -213,7 +296,9 @@ module TensorStream
213
296
  work_group = [m]
214
297
  n = m if n.nil?
215
298
  cl_n = OpenCL::Int1.new(n || 1)
216
- event = _cl_program('softmax_grad', dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
299
+ event = _cl_program('softmax_grad', dtype: dtype, size: n).
300
+ send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer,
301
+ grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
217
302
  output_buffer.op = event
218
303
  output_buffer
219
304
  end
@@ -14,6 +14,14 @@ module TensorStream
14
14
  @op = op
15
15
  end
16
16
 
17
+ def total_elements
18
+ shape.reduce(:*) || 1
19
+ end
20
+
21
+ def empty_value?
22
+ @shape == [0]
23
+ end
24
+
17
25
  def to_ruby
18
26
  return [] if buffer.empty?
19
27
 
@@ -24,6 +32,7 @@ module TensorStream
24
32
  end
25
33
 
26
34
  if shape.empty?
35
+ return buffer.to_s if data_type == :string
27
36
  return buffer[0] != 0 if data_type == :boolean
28
37
  return buffer[0]
29
38
  end
@@ -11,6 +11,8 @@ require 'narray_ffi'
11
11
  require 'tensor_stream/evaluator/base_evaluator'
12
12
  require 'tensor_stream/opencl/math_ops'
13
13
  require 'tensor_stream/opencl/nn_ops'
14
+ require 'tensor_stream/opencl/images_ops'
15
+ require 'tensor_stream/opencl/array_ops'
14
16
  require 'tensor_stream/helpers/op_helper'
15
17
 
16
18
  module TensorStream
@@ -32,7 +34,8 @@ module TensorStream
32
34
  end
33
35
  end
34
36
 
35
- ## PURE ruby evaluator used for testing and development
37
+ ##
38
+ # PURE ruby evaluator used for testing and development
36
39
  class OpenclEvaluator < BaseEvaluator
37
40
  attr_accessor :retain
38
41
  attr_reader :opencl_device
@@ -42,6 +45,8 @@ module TensorStream
42
45
  include TensorStream::MathHelper
43
46
  include TensorStream::OpenCLHelpers::MathOps
44
47
  include TensorStream::OpenCLHelpers::NNOps
48
+ include TensorStream::OpenCLHelpers::ImagesOps
49
+ include TensorStream::OpenCLHelpers::ArrayOps
45
50
 
46
51
  def initialize(session, device, thread_pool: nil, log_intermediates: false)
47
52
  super
@@ -86,7 +91,10 @@ module TensorStream
86
91
 
87
92
  # opencl evaluator main entrypoint
88
93
  def run(tensor, execution_context)
89
- read_final_result(complete_eval(tensor, execution_context))
94
+ result = complete_eval(tensor, execution_context)
95
+ # puts "wait finish"
96
+ _opencl_queue.finish
97
+ read_final_result(result)
90
98
  end
91
99
 
92
100
  def run_with_buffer(tensor, context, execution_context)
@@ -117,9 +125,9 @@ module TensorStream
117
125
  def enqueue_buffer_read(tensor, context)
118
126
  buffer = _run(tensor, context)
119
127
  if buffer.is_a?(Array)
120
- buffer = buffer.collect do |b|
128
+ buffer.collect do |b|
121
129
  next b if b.buffer.size.zero?
122
- _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
130
+ b.op = _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
123
131
  b
124
132
  end
125
133
  else
@@ -127,14 +135,18 @@ module TensorStream
127
135
  return buffer if buffer.nil?
128
136
  return [] if buffer.buffer.nil?
129
137
  return buffer if buffer.buffer.size.zero?
130
- _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
138
+ buffer.op = _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
131
139
  buffer
132
140
  end
133
141
  end
134
142
 
135
143
  def complete_eval(tensor, context)
144
+ return nil if tensor.nil?
145
+
136
146
  buffer = enqueue_buffer_read(tensor, context)
137
- _opencl_queue.finish
147
+ events = build_event_wait_list([buffer])
148
+ # puts "wait #{tensor.name}"
149
+ OpenCL.wait_for_events(events) unless events.empty?
138
150
  buffer
139
151
  end
140
152
 
@@ -162,6 +174,7 @@ module TensorStream
162
174
 
163
175
  def prepare_input(tensor, context, options = {})
164
176
  return nil unless tensor
177
+
165
178
  tensor = resolve_placeholder(tensor)
166
179
  if options[:noop]
167
180
  tensor
@@ -210,11 +223,17 @@ module TensorStream
210
223
  def _cl_program(kernel, args = {})
211
224
  suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
212
225
  @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
213
- filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
214
- raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
215
- source = File.read(filename)
216
- source = OpenclTemplateHelper.new(source).generate(args)
217
- # File.write("/tmp/#{kernel}.#{suffix}.cl", source)
226
+ file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
227
+ source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
228
+ File.read(file_path)
229
+ else
230
+ filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
231
+ raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
232
+ source = File.read(filename)
233
+ source = OpenclTemplateHelper.new(source).generate(args)
234
+ File.write(file_path, source) if ENV['TS_OPENCL_FILE_CACHE']
235
+ source
236
+ end
218
237
  program = _opencl_context.create_program_with_source(source)
219
238
  program.build
220
239
  rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
@@ -275,10 +294,10 @@ module TensorStream
275
294
  end
276
295
 
277
296
  register_op :identity do |context, tensor, inputs|
278
- if tensor.inputs.size > 1
279
- tensor.inputs[1..inputs.size].each { |input| complete_eval(input, context) }
280
- end
281
- inputs[0]
297
+ value = inputs[0]
298
+ buffer = OpenCLBuffer.new(name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
299
+ buffer.op = build_event_wait_list(inputs)
300
+ buffer
282
301
  end
283
302
 
284
303
  register_op :assign, noop: true do |context, tensor, inputs|
@@ -308,86 +327,11 @@ module TensorStream
308
327
  end
309
328
  end
310
329
 
311
- register_op :expand_dims, buffer: true do |_context, tensor, inputs|
312
- axis = inputs[1].buffer[0]
313
- shape = inputs[0].shape.dup
314
- axis = -axis if axis == shape.size
315
- new_shape = shape.insert(axis, 1).compact
316
- new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
317
- convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
318
- end
319
-
320
- register_op :fill, buffer: true do |_context, tensor, inputs|
321
- shape = inputs[0]
322
- value = inputs[1]
323
-
324
- narray_size = shape.buffer.to_a.reduce(:*) || 1
325
- cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
326
-
327
- buffer = if cl_buffer
328
- cl_buffer.buffer
329
- else
330
- allocate_narray_for_type(tensor.data_type, narray_size)
331
- end
332
-
333
- buffer.fill!(value.buffer[0])
334
- convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
335
- end
336
-
337
330
  register_op :where, noop: true do |context, tensor, inputs|
338
331
  pred = tensor.options[:pred]
339
332
  execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
340
333
  end
341
334
 
342
- register_op :cast do |_context, tensor, inputs|
343
- a = inputs[0]
344
- if a.data_type != tensor.data_type
345
- buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
346
- m, n = a.shape
347
- cl_m = OpenCL::Int1.new(m || 1)
348
- cl_n = OpenCL::Int1.new(n || 1)
349
- work_group = [m || 1, n || 1]
350
- event_wait_list = build_event_wait_list(inputs)
351
- buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
352
- buffer
353
- else
354
- a
355
- end
356
- end
357
-
358
- register_op :stack do |_context, tensor, inputs|
359
- axis = tensor.options[:axis] || 0
360
- shape = inputs[0].shape
361
- rank = shape.size + 1
362
- elem_size = shape.empty? ? 1 : shape.reduce(:*)
363
-
364
- new_shape = [inputs.size]
365
- shape.inject(new_shape) { |ns, s| ns << s }
366
-
367
- divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
368
- a << s * a.last
369
- end.reverse
370
-
371
- axis = rank + axis if axis < 0
372
- rotated_shape = Array.new(axis + 1) { new_shape.shift }
373
- new_shape = rotated_shape.rotate! + new_shape
374
-
375
- output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
376
- multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
377
- a << s * a.last
378
- end.reverse
379
-
380
- cl_n = OpenCL::Int1.new(elem_size)
381
- work_group = [elem_size]
382
- event_wait_list = build_event_wait_list(inputs)
383
- ops = inputs.each_with_index.map do |input, index|
384
- cl_index = OpenCL::Int1.new(index)
385
- _cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
386
- end
387
- output_buffer.op = ops
388
- output_buffer
389
- end
390
-
391
335
  register_op :check_numerics, noop: true do |context, tensor, inputs|
392
336
  a = complete_eval(inputs[0], context)
393
337
  name = tensor.options[:name]
@@ -420,86 +364,18 @@ module TensorStream
420
364
  a
421
365
  end
422
366
 
423
- register_op :rank do |_context, tensor, inputs|
424
- wrap_opencl(inputs[0].shape.size, data_type: tensor.data_type, name: tensor.name)
425
- end
426
-
427
367
  register_op :stop_gradient do |_context, _tensor, inputs|
428
368
  inputs[0]
429
369
  end
430
370
 
431
- register_op :slice, noop: true do |context, tensor, inputs|
432
- input_a = complete_eval(inputs[0], context)
433
- input_b = read_final_result(complete_eval(inputs[1], context))
434
- size = tensor.options[:size]
435
-
436
- slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse
437
-
438
- new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
439
- sliced = new_buf.slice[*slice_param]
440
- convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
441
- end
442
-
443
- register_op :transpose, buffer: true do |_context, tensor, inputs|
444
- t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
445
-
446
- if inputs[0].shape.size == 2 && inputs[1].nil?
447
- transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
448
- res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
449
- res
450
- else
451
- rank = inputs[0].shape.size
452
- perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
453
- new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
454
- output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
455
- transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
456
-
457
- write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
458
- output_buffer.op = write_op
459
- output_buffer
460
- end
461
- end
462
-
463
- register_op :index, noop: true do |context, tensor, inputs|
464
- a = _run(inputs[0], context)
465
- index = read_final_result(_run(inputs[1], context))
466
-
467
- if a.is_a?(OutputGroup)
468
- a.outputs[index]
469
- elsif a.is_a?(Array)
470
- a[index]
471
- else
472
- new_shape = a.shape.dup
473
- new_shape.shift
474
- input_a = read_final_result(a)
475
- convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
476
- end
477
- end
478
-
479
371
  register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
480
372
  rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
481
373
  OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: tensor.name), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")], tensor.inputs.map(&:data_type))
482
374
  end
483
375
 
484
- register_op :shape do |_context, tensor, inputs|
485
- wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.data_type)
486
- end
487
-
488
- register_op :reshape, buffer: true do |_context, tensor, inputs|
489
- arr = inputs[0]
490
- new_shape = read_final_result(inputs[1])
491
-
492
- shape = if new_shape.size.zero? && arr.buffer.size == 1
493
- new_shape
494
- else
495
- TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
496
- end
497
-
498
- convert_to_opencl(arr.buffer, shape, data_type: arr.data_type, name: tensor.name)
499
- end
500
-
501
- register_op :flow_group do |context, _tensor, inputs|
502
- _opencl_queue.finish
376
+ register_op :flow_group do |_context, _tensor, inputs|
377
+ events = build_event_wait_list(inputs)
378
+ OpenCL.wait_for_events(events) unless events.empty?
503
379
  nil
504
380
  end
505
381
 
@@ -657,7 +533,10 @@ module TensorStream
657
533
  cl_n = OpenCL::Int1.new(n || 1)
658
534
 
659
535
  event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
660
- output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
536
+ output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).
537
+ send(:"#{op_name}_#{dtype}", _opencl_queue, work_group,
538
+ cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer,
539
+ output_buffer.cl_buffer, event_wait_list: event_wait_list)
661
540
  output_buffer
662
541
  end
663
542
 
@@ -667,16 +546,17 @@ module TensorStream
667
546
  dtype = tensor.data_type
668
547
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
669
548
 
670
- m, n = a.shape
671
- work_group = [m || 1, n || 1]
672
- cl_m = OpenCL::Int1.new(m || 1)
673
- cl_n = OpenCL::Int1.new(n || 1)
549
+ work_group = [a.total_elements]
674
550
 
675
- event = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
551
+ event = call_program(op_name, dtype, work_group, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
676
552
  output_buffer.op = event
677
553
  output_buffer
678
554
  end
679
555
 
556
+ def call_program(name, dtype, work_group, *args)
557
+ _cl_program(name.to_s, dtype: dtype).send(:"#{name}_#{dtype}", _opencl_queue, work_group, *args)
558
+ end
559
+
680
560
  def auto_type_cast(a, b, name: nil)
681
561
  return [a, b] if a.data_type == b.data_type
682
562
  m, n = b.shape
@@ -728,16 +608,20 @@ module TensorStream
728
608
  @context[:_cache][cache_key]
729
609
  else
730
610
  narray_size = shape.reduce(:*) || 1
611
+ cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
731
612
 
732
613
  buffer = if value.is_a?(NArray)
733
614
  value
615
+ elsif data_type == :string && shape.empty?
616
+ cl_buffer_size = value[0].bytesize
617
+ allocate_narray_for_type(data_type, value[0].bytesize)
734
618
  else
735
619
  allocate_narray_for_type(data_type, narray_size)
736
620
  end
737
621
 
738
622
  return nil if buffer.nil?
739
623
 
740
- cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
624
+
741
625
 
742
626
  cl_buffer = unless value.flatten.empty?
743
627
  cl_buffer_size = 1 if cl_buffer_size.zero?
@@ -746,8 +630,11 @@ module TensorStream
746
630
 
747
631
  @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
748
632
  end
749
-
750
- if value.is_a?(Array)
633
+ if data_type == :string
634
+ value[0].each_byte.with_index do |c, index|
635
+ cl_object.buffer[index] = c
636
+ end
637
+ elsif value.is_a?(Array)
751
638
  value.flatten.each_with_index do |element, index|
752
639
  cl_object.buffer[index] = if element.is_a?(Tensor)
753
640
  read_final_result(complete_eval(element, {}))
@@ -765,7 +652,10 @@ module TensorStream
765
652
  cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
766
653
  end
767
654
 
768
- write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer) if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
655
+ if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
656
+ write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
657
+ end
658
+
769
659
  cl_object.op = write_op
770
660
  cl_object
771
661
  end
@@ -780,8 +670,12 @@ module TensorStream
780
670
  NArray.int(narray_size)
781
671
  when :int16
782
672
  NArray.sint(narray_size)
673
+ when :uint8
674
+ NArray.byte(narray_size)
783
675
  when :boolean
784
676
  NArray.byte(narray_size)
677
+ when :string
678
+ NArray.byte(narray_size)
785
679
  when :unknown
786
680
  nil
787
681
  else
@@ -799,6 +693,65 @@ module TensorStream
799
693
  end
800
694
  end
801
695
 
696
+ # automatically use sub buffers
697
+ def _create_result_sub_buffer(parent_buffer, index, data_type, shape, name)
698
+ cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
699
+ @context[:_cache][:_cl_buffers][cache_key] ||= begin
700
+ size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
701
+ buffer = allocate_narray_for_type(data_type, size)
702
+
703
+ if parent_buffer.cl_buffer.associated_memobject.nil?
704
+ start = index * buffer.size * buffer.element_size
705
+ region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
706
+ cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
707
+ OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
708
+ else
709
+ _create_result_buffer(tensor.data_type, shape, name)
710
+ end
711
+ end
712
+
713
+ buffer = @context[:_cache][:_cl_buffers][cache_key]
714
+
715
+ if buffer.cl_buffer.associated_memobject
716
+ buffer.op = parent_buffer.op
717
+ else
718
+ region_size_in_bytes = buffer.buffer.size * buffer.buffer.element_size
719
+ start = index * region_size_in_bytes
720
+ region = [region_size_in_bytes, 1, 1]
721
+ buffer.op = _opencl_queue.enqueue_copy_buffer_rect(parent_buffer.cl_buffer, buffer.cl_buffer, region, src_origin: [start, 0, 0], event_wait_list: parent_buffer.op)
722
+ end
723
+
724
+ buffer
725
+ end
726
+
727
+ # create sub buffers of different sizes
728
+ def _create_variable_result_sub_buffer(parent_buffer, index, start, region_size_in_bytes, data_type, shape, name)
729
+ cache_key ="_sub_result_#{parent_buffer.object_id}_#{name}_#{index}:#{object_id}"
730
+ @context[:_cache][:_cl_buffers][cache_key] ||= begin
731
+ size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
732
+ buffer = allocate_narray_for_type(data_type, size)
733
+
734
+ if parent_buffer.cl_buffer.associated_memobject.nil?
735
+ region = OpenCL::BufferRegion::new(start, region_size_in_bytes)
736
+ cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
737
+ OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: "#{name}/sub")
738
+ else
739
+ _create_result_buffer(tensor.data_type, shape, name)
740
+ end
741
+ end
742
+
743
+ buffer = @context[:_cache][:_cl_buffers][cache_key]
744
+
745
+ if buffer.cl_buffer.associated_memobject
746
+ buffer.op = parent_buffer.op
747
+ else
748
+ region = [region_size_in_bytes, 1, 1]
749
+ buffer.op = _opencl_queue.enqueue_copy_buffer_rect(parent_buffer.cl_buffer, buffer.cl_buffer, region, src_origin: [start, 0, 0], event_wait_list: parent_buffer.op)
750
+ end
751
+
752
+ buffer
753
+ end
754
+
802
755
  def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
803
756
  if target_axis == current_axis
804
757
  if a[0].is_a?(Array)
@@ -898,7 +851,11 @@ module TensorStream
898
851
  end
899
852
 
900
853
  def build_event_wait_list(inputs)
901
- inputs.compact.map(&:op).flatten
854
+ if inputs.is_a?(Array)
855
+ inputs.flatten.compact.map(&:op).compact.uniq
856
+ else
857
+ inputs.op ? [inputs.op] : []
858
+ end
902
859
  end
903
860
 
904
861
  def resolve_placeholder(placeholder, _execution_context = {})