tensor_stream-opencl 0.2.2 → 0.2.3

@@ -0,0 +1,7 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void relu6_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+     // Get the index of the current element to be processed
+     const int id = get_global_id(0);
+
+     C[id] = min((<%= c_dtype %>)max((<%= c_dtype %>) A[id], (<%= c_dtype %>)0), (<%= c_dtype %>)6);
+ }
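
This elementwise clamp (min(max(x, 0), 6)) is generated once per data type. A minimal sketch of rendering such an ERB kernel template outside the gem, with a hypothetical dtype_to_c_type mapping (the gem's kernel loader supplies its own):

```ruby
require 'erb'

# Hypothetical dtype-to-C-type mapping, for illustration only.
def dtype_to_c_type(dtype)
  { float32: 'float', float64: 'double', int32: 'int' }.fetch(dtype)
end

template = <<~KERNEL
  % c_dtype = dtype_to_c_type(dtype)
  __kernel void relu6_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
      const int id = get_global_id(0);
      C[id] = min((<%= c_dtype %>)max((<%= c_dtype %>) A[id], (<%= c_dtype %>)0), (<%= c_dtype %>)6);
  }
KERNEL

dtype = :float32
# trim_mode '%' enables the leading-% Ruby lines used by these templates.
puts ERB.new(template, trim_mode: '%').result(binding)
```

Rendered for :float32 this emits a plain OpenCL C kernel named relu6_float32.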
@@ -1,8 +1,7 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ __kernel void round_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
      // Get the index of the current element to be processed
-     const int globalRow = get_global_id(0); // Row ID of C (0..M)
-     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+     const int id = get_global_id(0); // index of the current element
 
-     C[globalRow * N + globalCol] = round(A[globalRow * N + globalCol]);
+     C[id] = round(A[id]);
  }
@@ -0,0 +1,26 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void sum_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+     // Get the index of the current element to be processed
+     const int id = get_global_id(0);
+     int offset = (id + <%= index %>) * <%= w %>;
+     <%= c_dtype %> sum = 0;
+     <% if n > 4 %>
+     for(int i = 0; i < <%= n/4 %> ; i++) {
+         <% sums = 4.times.map do |i|
+              "A[offset + #{i}]"
+            end %>
+         sum += <%= sums.join(' + ') %>;
+         offset += 4;
+     }
+     <% if n%4!=0 %>
+     <% (n % 4).times do |i| %>
+     sum += A[offset + <%= i %>];
+     <% end %>
+     <% end %>
+     <% else %>
+     <% n.times do |i| %>
+     sum += A[offset + <%= i %>];
+     <% end %>
+     <% end %>
+     C[id] = sum;
+ }
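
Each work item of this kernel reduces one contiguous slice of the input: thread id starts at (id + index) * w and accumulates n elements, with the loop unrolled four at a time. A plain-Ruby model of the generated per-thread logic (names invented here):

```ruby
# Illustrative model of what one sum_<dtype> work item computes.
def thread_sum(a, id, index:, n:, w:)
  offset = (id + index) * w
  sum = 0
  if n > 4
    (n / 4).times do
      # unrolled four-element accumulation
      sum += a[offset] + a[offset + 1] + a[offset + 2] + a[offset + 3]
      offset += 4
    end
    (n % 4).times { |i| sum += a[offset + i] }
  else
    n.times { |i| sum += a[offset + i] }
  end
  sum
end

a = (1..12).to_a
# Three threads, four items each: partial sums 10, 26, 42,
# which a second pass can reduce to a single value.
p (0...3).map { |id| thread_sum(a, id, index: 0, n: 4, w: 4) }
```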
@@ -5,8 +5,8 @@ module TensorStream
    def MathOps.included(klass)
      klass.class_eval do
        %i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
-         register_op op, noop: true do |context, tensor, inputs|
-           execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
+         register_op op do |context, tensor, inputs|
+           execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1])
          end
        end
 
@@ -38,11 +38,11 @@ module TensorStream
          end
        end
 
-       register_op :floor_div, noop: true do |context, tensor, inputs|
+       register_op :floor_div do |context, tensor, inputs|
          if fp_type?(tensor.data_type)
-           execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
+           execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1])
          else
-           execute_2_operand_func('div', tensor, inputs[0], inputs[1], context)
+           execute_2_operand_func('div', tensor, inputs[0], inputs[1])
          end
        end
 
@@ -78,11 +78,8 @@ module TensorStream
        cl_n = OpenCL::Int1.new(n)
        cl_k = OpenCL::Int1.new(k)
 
-       transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
-       transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
        event_wait_list = build_event_wait_list([a, b])
-
-       output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+       output_buffer.op = _cl_program('gemm', ta: !!tensor.options[:transpose_a], tb: !!tensor.options[:transpose_b], dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
 
        output_buffer
      end
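
The transpose flags move from runtime kernel arguments into template parameters (ta:/tb:), so each flag combination is compiled once as its own specialized gemm program and the kernel body no longer branches on them. A sketch of the idea with an invented cache and a stubbed build step (the gem's _cl_program does this with real OpenCL program objects in its kernel cache):

```ruby
PROGRAM_CACHE = {}

def gemm_program(ta:, tb:, dtype:)
  key = "gemm.#{ta}.#{tb}.#{dtype}"
  PROGRAM_CACHE[key] ||= begin
    # Render the ERB template with ta/tb fixed and compile it; the generated
    # kernel needs no runtime transpose checks.
    "compiled gemm variant (ta=#{ta}, tb=#{tb}, dtype=#{dtype})"
  end
end

p gemm_program(ta: false, tb: true, dtype: :float32)
p PROGRAM_CACHE.size # => 1, the compiled variant is reused on later calls
```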
@@ -94,39 +91,99 @@ module TensorStream
        end
 
        %i[sum mean].each do |op|
-         register_op op, noop: true do |context, tensor, inputs|
+         register_op op do |context, tensor, inputs|
            reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
          end
        end
 
-       register_op :prod, noop: true do |context, tensor, inputs|
-         input_a = complete_eval(inputs[0], context)
-
-         if input_a.buffer.empty?
+       register_op :prod do |context, tensor, inputs|
+         if inputs[0].shape == [0]
            convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
          else
            reduction(context, tensor, inputs[0], inputs[1], :prod)
          end
        end
 
-       register_op :argmin, buffer: true do |_context, tensor, inputs|
-         axis = tensor.options[:axis] || 0
-         rank = inputs[0].shape.size
-         raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+       # register_op :argmin, buffer: true do |_context, tensor, inputs|
+       #   axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
+       #   rank = inputs[0].shape.size
+       #   raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+       #   arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+       #   op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
+       #   convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+       # end
+
+       # register_op :argmax, buffer: true do |_context, tensor, inputs|
+       #   axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
+       #   rank = inputs[0].shape.size
+       #   raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+       #   arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+       #   op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
+       #   convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+       # end
+
+       def reduction(child_context, tensor, value, axis, func)
+         if axis.nil?
+           value = _run(value, child_context)
+           size = value.shape.reduce(:*) || 1
+           if value.shape.empty? # for scalars, just return as is
+             value
+           else
+             reduction_threads = 32
+             items_per_thread_threshold = 4
+
+             output_buffer = _create_result_buffer(value.data_type, [], tensor.name)
+             event_wait_list = build_event_wait_list([value])
+
+             if (size > reduction_threads) && ((size / reduction_threads) > items_per_thread_threshold)
+               items_per_thread = size / reduction_threads
+               extra_items = size % reduction_threads
+               intermediate_output_buffer = _create_result_buffer(value.data_type, [reduction_threads], tensor.name)
+
+               temp_values = if extra_items.zero?
+                 _cl_program(func, dtype: value.data_type, index: 0, n: items_per_thread, w: items_per_thread).
+                   send(:"#{func}_#{value.data_type}", _opencl_queue, [reduction_threads], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list)
+               else
+                 [_cl_program(func, dtype: value.data_type, index: 0, n: items_per_thread, w: items_per_thread).
+                    send(:"#{func}_#{value.data_type}", _opencl_queue, [reduction_threads - 1], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list),
+                  _cl_program(func, dtype: value.data_type, index: reduction_threads - 1, n: items_per_thread + extra_items, w: items_per_thread).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list)]
+               end
+               output_buffer.op = _cl_program(func, dtype: value.data_type, n: reduction_threads, index: 0, w: 0).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, output_buffer.cl_buffer, event_wait_list: temp_values)
+               output_buffer
+             else
+               output_buffer.op = _cl_program(func, dtype: value.data_type, n: size, index: 0, w: 0).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+               output_buffer
+             end
+           end
+         else
+           return value if value.shape.empty?
+
+           axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+           input = complete_eval(value, child_context)
+           value = value.buffer.reshape(*value.shape.reverse)
+           rank = input.shape.size - 1
+
+           if axis.is_a?(Array)
+             axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
+               value = value.send(func, x.to_i)
+             end
+           else
+             value = value.send(func, rank - axis.abs)
+           end
 
-         arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
-         op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
-         convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
-       end
+           new_shape = if value.is_a?(NArray)
+             value.shape.reverse
+           else
+             value = [value]
+             []
+           end
 
-       register_op :argmax, buffer: true do |_context, tensor, inputs|
-         axis = tensor.options[:axis] || 0
-         rank = inputs[0].shape.size
-         raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+           new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
 
-         arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
-         op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
-         convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+           convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
+         end
        end
      end
    end
  end
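
The new reduction path for axis.nil? splits large inputs across a fixed pool of 32 threads, each summing a contiguous slice, then reduces the 32 partial results in a final single-thread pass. The partitioning arithmetic, modeled in plain Ruby:

```ruby
# Mirrors the constants above: 32 reduction threads, and the two-stage path
# only kicks in once each thread would get more than 4 items.
def reduction_plan(size, threads: 32, threshold: 4)
  return { two_stage: false, n: size } unless size > threads && (size / threads) > threshold

  items = size / threads
  extra = size % threads
  { two_stage: true, items_per_thread: items,
    # the last thread also absorbs the remainder
    last_thread_items: items + extra }
end

p reduction_plan(100)
# => {:two_stage=>false, :n=>100}   (100 / 32 = 3 items <= threshold)
p reduction_plan(10_000)
# => {:two_stage=>true, :items_per_thread=>312, :last_thread_items=>328}
```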
@@ -121,11 +121,11 @@ module TensorStream
        end
 
        register_op :apply_adagrad do |context, tensor, inputs|
-         target_var, accum, lr, grad = inputs
+         _target_var, _accum, lr, grad = inputs
 
          assign = tensor.inputs[0] || tensor
          assign_acc = tensor.inputs[1]
-
+
          assign.buffer.dirty = true
          assign_acc.buffer.dirty = true
          output_buffer = assign.buffer
@@ -133,7 +133,7 @@ module TensorStream
          work_group = [output_buffer.total_elements]
 
          event_wait_list = build_event_wait_list(inputs)
-         event = call_program('apply_adagrad',
+         event = call_program('apply_adagrad',
                               output_buffer.data_type,
                               work_group,
                               lr.cl_buffer,
@@ -195,7 +195,7 @@ module TensorStream
          event_wait_list = build_event_wait_list(inputs)
          work_group = [output_buffer.total_elements]
 
-         event = call_program('apply_rms_prop', output_buffer.data_type,
+         event = call_program('apply_rms_prop', output_buffer.data_type,
                               work_group,
                               lr.cl_buffer,
                               rho.cl_buffer,
@@ -298,7 +298,7 @@ module TensorStream
          end
 
          b = wrap_opencl(labels, data_type: inputs[0].data_type, name: "#{tensor.name}_label")
-
+
          event_wait_list = build_event_wait_list(inputs)
          dtype = tensor.data_type
          output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
@@ -335,6 +335,90 @@ module TensorStream
          output_buffer.op = event
          output_buffer
        end
+
+       %i[relu6].each do |op|
+         register_op op, noop: true do |context, tensor, inputs|
+           execute_func(op.to_s, tensor, inputs[0], context)
+         end
+       end
+
+       # Fast per pixel parallel convolution operation
+       register_op :conv2d do |_context, tensor, inputs|
+         filter = inputs[1]
+         batch, height, width, channel = inputs[0].shape
+         filter_shape = filter.shape
+         strides = tensor.options[:strides]
+         height_stride = strides[1]
+         width_stride = strides[2]
+
+         raise TensorStream::ValueError, "Current implementation does not yet support strides in the batch and depth dimensions." if strides[0] != 1 || strides[3] != 1
+
+         event_wait_list = build_event_wait_list(inputs)
+
+         f_height, f_width, in_channels, out_channels = filter_shape
+         out_shape = [batch, height / height_stride, width / width_stride, out_channels]
+         output_buffer = _create_result_buffer(tensor.data_type, out_shape, tensor.name)
+
+         cl_image_height = OpenCL::Int1.new(height)
+         cl_image_width = OpenCL::Int1.new(width)
+
+         work_dimen = [batch, height / height_stride, width / width_stride]
+
+         output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride]).send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width, inputs[0].cl_buffer,
+           inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+         output_buffer
+       end
+
+       register_op :conv2d_backprop_input do |context, tensor, inputs|
+         image_shape, filter, grad = inputs
+         filter_shape = filter.shape
+
+         strides = tensor.options[:strides]
+         height_stride = strides[1]
+         width_stride = strides[2]
+
+         image_shape = read_final_result(complete_eval(image_shape, context))
+
+         event_wait_list = build_event_wait_list(inputs)
+         output_buffer = _create_result_buffer(tensor.data_type, image_shape, tensor.name)
+
+         batch, height, width, channels = image_shape
+         f_height, f_width, in_channels, out_channels = filter_shape
+
+         work_dimen = [batch, height, width]
+
+         cl_image_height = OpenCL::Int1.new(height)
+         cl_image_width = OpenCL::Int1.new(width)
+
+         output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride]).send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
+           filter.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+         output_buffer
+       end
+
+       register_op :conv2d_backprop_filter do |context, tensor, inputs|
+         images, filter_shape, grad = inputs
+
+         event_wait_list = build_event_wait_list(inputs)
+
+         strides = tensor.options[:strides]
+         height_stride = strides[1]
+         width_stride = strides[2]
+
+         filter_shape = read_final_result(complete_eval(filter_shape, context))
+         output_buffer = _create_result_buffer(tensor.data_type, filter_shape, tensor.name)
+
+         batch_size, height, width, channels = images.shape
+         f_height, f_width, input_channels, output_channels = filter_shape
+         work_dimen = [f_height, f_width, output_channels]
+
+         cl_batch_size = OpenCL::Int1.new(batch_size)
+         cl_image_height = OpenCL::Int1.new(height)
+         cl_image_width = OpenCL::Int1.new(width)
+
+         output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride]).send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
+           images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+         output_buffer
+       end
      end
    end
  end
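
conv2d launches one work item per output pixel (work_dimen = [batch, out_h, out_w]). A hypothetical helper showing the output-shape arithmetic used above for NHWC inputs:

```ruby
def conv2d_out_shape(input_shape, filter_shape, strides)
  batch, height, width, _channels = input_shape
  _fh, _fw, _in_ch, out_ch = filter_shape
  # Mirrors the guard in the op: only spatial strides are supported.
  raise 'strides in batch/depth dimensions are not supported' if strides[0] != 1 || strides[3] != 1

  [batch, height / strides[1], width / strides[2], out_ch]
end

p conv2d_out_shape([8, 28, 28, 3], [5, 5, 3, 16], [1, 2, 2, 1])
# => [8, 14, 14, 16]
```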
@@ -37,9 +37,13 @@ module TensorStream
      return buffer[0] != 0 if data_type == :boolean
      return buffer[0]
    end
-
-   result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
+
+   result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
    data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
  end
+
+ def self.nil_buffer(owner, name, data_type)
+   OpenCLBuffer.new(owner, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
+ end
  end
  end
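
nil_buffer centralizes the empty placeholder that several code paths below return when a result shape comes out as [0]. A plain-Ruby model of what it constructs:

```ruby
# Stand-in struct for OpenCLBuffer, illustration only.
Buffer = Struct.new(:owner, :name, :data_type, :shape, :buffer, :cl_buffer, keyword_init: true)

def nil_buffer(owner, name, data_type)
  # Nothing is allocated host- or device-side for an empty result.
  Buffer.new(owner: owner, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
end

b = nil_buffer(nil, 'out_add', :float32)
p b.shape     # => [0]
p b.cl_buffer # => nil
```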
@@ -35,7 +35,8 @@ module TensorStream
    end
 
    ##
-   # PURE ruby evaluator used for testing and development
+   # OpenCL hardware accelerated evaluator
+   #
    class OpenclEvaluator < BaseEvaluator
      attr_accessor :retain
      attr_reader :opencl_device, :opencl_context
@@ -53,41 +54,57 @@ module TensorStream
        super
        _create_opencl_context
        @opencl_device = device.native_device
+
+       @max_work_item_dimensions = @opencl_device.max_work_item_dimensions
+       @max_work_item_sizes = @opencl_device.max_work_item_sizes
+       @max_work_group_size = @opencl_device.max_work_group_size
+
+       @local_mem_size = @opencl_device.local_mem_size
+       @device_type = @opencl_device.type.to_s.downcase
+
        create_command_queue
      end
 
-     def self.query_supported_devices
-       devices = query_devices_with_score
-       devices.sort { |a, b| a[1] <=> b[1] }.map do |d|
-         opencl_to_device(d)
+     class << self
+       def query_supported_devices
+         devices = query_devices_with_score
+         devices.sort_by { |a| a[1] }.map do |d|
+           opencl_to_device(d)
+         end
        end
-     end
 
-     def self.fetch_device(query = [])
-       devices = query_devices_with_score
-       platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
-       opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
-     end
+       def fetch_device(query = [])
+         devices = query_devices_with_score
+         platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
+         opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
+       end
 
-     def self.opencl_to_device(dev)
-       device = dev[0]
-       index = dev[3]
-       platform_name = device.platform.name.tr(' ', '_').downcase
-       uri = [platform_name, index].join(':')
+       def opencl_to_device(dev)
+         device = dev[0]
+         index = dev[3]
+         platform_name = device.platform.name.tr(' ', '_').downcase
+         uri = [platform_name, index].join(':')
 
-       device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
+         device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
 
-       OpenclDevice.new(uri, device_type, self).tap do |d|
-         d.native_device = device
+         OpenclDevice.new(uri, device_type, self).tap do |d|
+           d.native_device = device
+         end
        end
-     end
 
-     ##
-     # Select the best device available in the system for this evaluator
-     def self.default_device
-       devices = OpenclEvaluator.query_devices_with_score
-       device = devices.max { |a, b| a[1] <=> b[1] }
-       opencl_to_device(device)
+       ##
+       # Select the best device available in the system for this evaluator
+       def default_device
+         devices = OpenclEvaluator.query_devices_with_score
+         device = devices.max { |a, b| a[1] <=> b[1] }
+         opencl_to_device(device)
+       end
+
+       def getset_global_opencl_context(platform)
+         @global_opencl_context ||= {}
+         @global_opencl_context[platform] ||= yield
+         @global_opencl_context[platform]
+       end
      end
 
      # opencl evaluator main entrypoint
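
The new class-level getset_global_opencl_context memoizes one OpenCL context per platform; _create_opencl_context in the next hunk routes all context creation through it. A stripped-down model of the memoization:

```ruby
# Minimal model: the block runs at most once per platform; later evaluators
# receive the cached context.
class ContextRegistry
  def getset_global_opencl_context(platform)
    @global_opencl_context ||= {}
    @global_opencl_context[platform] ||= yield
    @global_opencl_context[platform]
  end
end

registry = ContextRegistry.new
first  = registry.getset_global_opencl_context('NVIDIA CUDA') { Object.new }
second = registry.getset_global_opencl_context('NVIDIA CUDA') { Object.new }
p first.equal?(second) # => true, only the first block ever ran
```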
@@ -228,16 +245,22 @@ module TensorStream
 
      def _create_opencl_context(device = nil)
        if device.nil?
-         @@global_opencl_context ||= begin
-           all_devices = OpenclEvaluator.query_supported_devices.map(&:native_device)
-           puts "global context created for #{all_devices}"
-           OpenCL.create_context(all_devices)
+         all_devices_by_platform = {}
+         TensorStream::Evaluator::OpenclEvaluator.query_supported_devices.map(&:native_device).each do |d|
+           all_devices_by_platform[d.platform.name] ||= []
+           all_devices_by_platform[d.platform.name] << d
          end
 
-         @opencl_context = @@global_opencl_context
+         all_devices_by_platform.each do |platform, devices|
+           @opencl_context = TensorStream::Evaluator::OpenclEvaluator.getset_global_opencl_context(platform) do
+             OpenCL.create_context(devices)
+           end
+         end
        else
          puts "context created for #{device.native_device}"
-         @opencl_context = OpenCL.create_context(device.native_device)
+         @opencl_context = TensorStream::Evaluator::OpenclEvaluator.getset_global_opencl_context(device.native_device.platform) do
+           OpenCL.create_context(device.native_device)
+         end
        end
      end
 
@@ -269,11 +292,12 @@ module TensorStream
        @context[:_cache][kernel_cache_key] ||=
          begin
            # puts "building #{kernel_cache_key}"
-           file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
+           file_path = File.join(ENV['TS_OPENCL_FILE_CACHE_PATH'] || '/tmp', "#{kernel}.#{suffix}.cl")
            source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
                       File.read(file_path)
                     else
-                      filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+                      filenames = ['', ".#{@device_type}"].map { |type| %w[cl.erb cl].map { |ext| cl_template_path("#{kernel}#{type}", ext) } }.flatten
+                      filename = filenames.find { |n| File.exist?(n) }
                       raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
 
                       source = File.read(filename)
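
Two behaviors change here: the kernel file cache directory becomes configurable through TS_OPENCL_FILE_CACHE_PATH, and the template search now also considers device-type-specific variants. A sketch of the candidate ordering, with cl_template_path simplified to a bare file name:

```ruby
# Simplified stand-in for cl_template_path: just builds file names.
def candidate_templates(kernel, device_type)
  ['', ".#{device_type}"].map do |type|
    %w[cl.erb cl].map { |ext| "#{kernel}#{type}.#{ext}" }
  end.flatten
end

p candidate_templates('softmax', 'gpu')
# => ["softmax.cl.erb", "softmax.cl", "softmax.gpu.cl.erb", "softmax.gpu.cl"]
# The first existing file wins, so generic templates still take precedence;
# a .gpu/.cpu variant only fills in when no generic template exists.
```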
@@ -322,6 +346,7 @@ module TensorStream
 
      def eval_variable(tensor, _child_context)
        raise "variable #{tensor.name} not initalized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
+
        tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
        tensor.buffer
      end
@@ -339,7 +364,7 @@ module TensorStream
        end
      end
 
-     register_op :identity do |context, tensor, inputs|
+     register_op :identity do |_context, tensor, inputs|
        value = inputs[0]
        buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
        buffer.op = build_event_wait_list(inputs)
@@ -351,25 +376,26 @@ module TensorStream
      end
 
      register_op :assign_add do |context, tensor, inputs|
-       value = execute_2_operand_func('add', tensor, inputs[0], inputs[1], context)
+       value = execute_2_operand_func('add', tensor, inputs[0], inputs[1])
        assign_var(tensor, value, context)
      end
 
      register_op :assign_sub do |context, tensor, inputs|
-       value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1], context)
+       value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1])
        assign_var(tensor, value, context)
      end
 
-     register_op :variable, noop: true do |context, tensor, inputs|
+     register_op :variable, noop: true do |_context, tensor, _inputs|
        variable = tensor.inputs[0]
        raise "variable #{tensor.name} not initalized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
+
        variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
        variable.buffer
      end
 
      %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
-       register_op op, noop: true do |context, tensor, inputs|
-         execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
+       register_op op do |context, tensor, inputs|
+         execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], 'cond')
        end
      end
 
@@ -463,11 +489,11 @@ module TensorStream
      rescue EvaluatorExcecutionException => e
        _opencl_queue.finish # dump queue
        puts e.message
-       raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
+       raise e, "error #{e.message} while evaluating #{tensor.name} : defined at #{tensor.source}"
      rescue TensorStreamError => e
        _opencl_queue.finish # dump queue
        puts e.message
-       raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
+       raise e, "error #{e.message} while evaluating #{tensor.name} : defined at #{tensor.source}"
      rescue StandardError => e
        _opencl_queue.finish # dump queue
        puts e.message
@@ -496,6 +522,7 @@ module TensorStream
      cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
      return @context[cache_key] if @context.key?(cache_key)
      return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]
+
      @context[cache_key] = if tensor.value.is_a?(Tensor)
                              _run(tensor.value, child_context)
                            else
@@ -512,7 +539,6 @@ module TensorStream
      buffer = complete_eval(b, child_context)
 
      if assign.buffer
-       # buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
        event_wait_list = build_event_wait_list([buffer, assign.buffer])
        assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
                             _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: event_wait_list)
@@ -524,23 +550,32 @@ module TensorStream
        assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
        assign.value = value
      end
+
      assign.buffer.dirty = true
      assign.buffer
    end
 
-   def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
-     a = _run(input_a, child_context)
-     b = _run(input_b, child_context)
+   def execute_2_operand_func(op_name, tensor, a, b, prog_name = nil)
      a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
      dtype = tensor.data_type
      result_shape = TensorShape.infer_shape(a.shape, b.shape)
-     return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]
+     return OpenCLBuffer.nil_buffer(self, "out_#{tensor.name}", dtype) if result_shape == [0]
+
      output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
      a, b, prog, switch_operands = select_program(a, b, op_name)
      m, n = result_shape
-     work_group = [m || 1, n || 1]
-     cl_m = OpenCL::Int1.new(m || 1)
-     cl_n = OpenCL::Int1.new(n || 1)
+
+     work_group = if result_shape.size > 2 && (b.shape.size.zero? || (a.shape == b.shape))
+                    [m, result_shape.reduce(:*) / m]
+                  elsif result_shape.size <= 2
+                    [m || 1, n || 1]
+                  else
+                    raise "rank > 2 not supported for now"
+                  end
+
+     cl_m = OpenCL::Int1.new(work_group[0])
+     cl_n = OpenCL::Int1.new(work_group[1])
+
      cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition
 
      event_wait_list = build_event_wait_list([a, b]) # add dependency wait list
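
The work-group sizing now handles rank > 2 results for the elementwise cases (scalar second operand, or identical shapes) by folding every trailing dimension into the second work dimension. A standalone model with invented names:

```ruby
# rank <= 2 shapes keep the [m, n] 2D layout; higher-rank elementwise cases
# flatten the trailing dimensions into the second work dimension.
def work_group_for(result_shape, same_or_scalar:)
  m, n = result_shape
  if result_shape.size > 2 && same_or_scalar
    [m, result_shape.reduce(:*) / m]
  elsif result_shape.size <= 2
    [m || 1, n || 1]
  else
    raise 'rank > 2 not supported for now'
  end
end

p work_group_for([], same_or_scalar: true)        # => [1, 1]   scalar
p work_group_for([4, 5], same_or_scalar: true)    # => [4, 5]
p work_group_for([2, 3, 4], same_or_scalar: true) # => [2, 12]  3*4 folded
```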
@@ -610,6 +645,7 @@ module TensorStream
 
    def auto_type_cast(a, b, name: nil)
      return [a, b] if a.data_type == b.data_type
+
      m, n = b.shape
      work_group = [m || 1, n || 1]
      event_wait_list = build_event_wait_list([b])
@@ -624,6 +660,7 @@ module TensorStream
    def type_cast(source, data_type, name: nil)
      return source if source.data_type == data_type
+
      m, n = source.shape
      work_group = [m || 1, n || 1]
      event_wait_list = [source.op].compact
629
666
  event_wait_list = [source.op].compact
@@ -673,8 +710,6 @@ module TensorStream
673
710
 
674
711
  return nil if buffer.nil?
675
712
 
676
-
677
-
678
713
  cl_buffer = unless value.flatten.empty?
679
714
  cl_buffer_size = 1 if cl_buffer_size.zero?
680
715
  _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
@@ -682,6 +717,7 @@ module TensorStream
682
717
 
683
718
  @context[:_cache][cache_key] = OpenCLBuffer.new(self, name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
684
719
  end
720
+
685
721
  if data_type == :string
686
722
  value[0].each_byte.with_index do |c, index|
687
723
  cl_object.buffer[index] = c
@@ -704,11 +740,11 @@ module TensorStream
704
740
  cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
705
741
  end
706
742
 
707
- if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
708
- write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
743
+ # if OpenCL buffer is valid enqueue a write
744
+ if cl_object.cl_buffer && value && (!value.is_a?(Array) || !value.empty?)
745
+ cl_object.op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
709
746
  end
710
747
 
711
- cl_object.op = write_op
712
748
  cl_object
713
749
  end
714
750
 
@@ -718,7 +754,7 @@ module TensorStream
718
754
  NArray.sfloat(narray_size)
719
755
  when :float64
720
756
  NArray.float(narray_size)
721
- when :int, :int32, :int64, :uint64, :uint32 #NArray does not have 64 bit int types
757
+ when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
722
758
  NArray.int(narray_size)
723
759
  when :int16, :uint16
724
760
  NArray.sint(narray_size)
@@ -736,7 +772,8 @@ module TensorStream
736
772
  end
737
773
 
738
774
  def _create_result_buffer(data_type, shape, name)
739
- return OpenCLBuffer.new(self, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
775
+ return OpenCLBuffer.nil_buffer(self, name, data_type) if shape == [0]
776
+
740
777
  cache_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
741
778
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
742
779
  # puts "create result buffer #{cache_key}"
@@ -759,7 +796,7 @@ module TensorStream
759
796
  region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
760
797
  cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
761
798
  OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
762
- else
799
+ else # source buffer already a sub buffer, OpenCL does not allow sub buffers from sub buffers
763
800
  _create_result_buffer(tensor.data_type, shape, name)
764
801
  end
765
802
  end
@@ -768,7 +805,7 @@ module TensorStream
768
805
 
769
806
  if buffer.cl_buffer.associated_memobject
770
807
  buffer.op = parent_buffer.op
771
- else
808
+ else # source buffer alreay a sub buffer, so we need to do a copy instead
772
809
  region_size_in_bytes = buffer.buffer.size * buffer.buffer.element_size
773
810
  start = index * region_size_in_bytes
774
811
  region = [region_size_in_bytes, 1, 1]
@@ -841,6 +878,7 @@ module TensorStream
841
878
 
842
879
  def _reduced_shape(input_shape, axes)
843
880
  return [] if axes.nil? # reduce to scalar
881
+
844
882
  axes = [axes] unless axes.is_a?(Array)
845
883
  return input_shape if axes.empty?
846
884
 
@@ -850,39 +888,6 @@ module TensorStream
850
888
  input_shape
851
889
  end
852
890
 
853
- def reduction(child_context, tensor, a, b, func)
854
- input = complete_eval(a, child_context)
855
- axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b
856
- if axis.nil?
857
- red = input.buffer.send(func)
858
- convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
859
- else
860
- return input if input.shape.empty?
861
-
862
- value = input.buffer.reshape(*input.shape.reverse)
863
- rank = input.shape.size - 1
864
-
865
- if axis.is_a?(Array)
866
- axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
867
- value = value.send(func, x.to_i)
868
- end
869
- else
870
- value = value.send(func, rank - axis.abs)
871
- end
872
-
873
- new_shape = if value.is_a?(NArray)
874
- value.shape.reverse
875
- else
876
- value = [value]
877
- []
878
- end
879
-
880
- new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
881
-
882
- convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
883
- end
884
- end
885
-
886
891
  # selects variants of cl programs depending on input
887
892
  def select_program(input_a, input_b, op)
888
893
  return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape