tensor_stream-opencl 0.2.2 → 0.2.3

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
@@ -0,0 +1,7 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void relu6_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int id = get_global_id(0);
+
+ C[id] = min((<%= c_dtype %>)max((<%= c_dtype %>) A[id], (<%= c_dtype %>)0), (<%= c_dtype %>)6);
+ }
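The new relu6 kernel clamps each input element to the range [0, 6]. A minimal Ruby sketch of the same element-wise math (a plain reference for checking kernel output on the CPU, not part of the gem):

    # relu6 clamps each value to [0, 6]
    def relu6(values)
      values.map { |v| [[v, 0.0].max, 6.0].min }
    end

    relu6([-3.0, 2.5, 7.0]) # => [0.0, 2.5, 6.0]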
@@ -1,8 +1,7 @@
  % c_dtype = dtype_to_c_type(dtype)
- __kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ __kernel void round_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
  // Get the index of the current element to be processed
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
+ const int id = get_global_id(0); // index of the current element

- C[globalRow * N + globalCol] = round(A[globalRow * N + globalCol]);
+ C[id] = round(A[id]);
  }
@@ -0,0 +1,26 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void sum_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+ // Get the index of the current element to be processed
+ const int id = get_global_id(0);
+ int offset = (id + <%= index %>) * <%= w %>;
+ <%= c_dtype %> sum = 0;
+ <% if n > 4 %>
+ for(int i = 0; i < <%= n/4 %> ; i++) {
+ <% sums = 4.times.map do |i|
+ "A[offset + #{i}]"
+ end %>
+ sum += <%= sums.join(' + ') %>;
+ offset += 4;
+ }
+ <% if n%4!=0 %>
+ <% (n % 4).times do |i| %>
+ sum += A[offset + <%= i %>];
+ <% end %>
+ <% end %>
+ <% else %>
+ <% n.times do |i| %>
+ sum += A[offset + <%= i %>];
+ <% end %>
+ <% end %>
+ C[id] = sum;
+ }
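In this generated sum kernel, work item id accumulates n consecutive elements starting at (id + index) * w, with the main loop unrolled four additions at a time. A plain Ruby sketch of what a single work item computes (illustrative only):

    # Serial equivalent of one work item's partial sum:
    # add up n consecutive elements starting at (id + index) * w.
    def partial_sum(a, id:, index:, w:, n:)
      offset = (id + index) * w
      a[offset, n].sum
    end

    data = (1..100).to_a
    partial_sum(data, id: 0, index: 0, w: 25, n: 25) # => 325, the sum of 1..25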
@@ -5,8 +5,8 @@ module TensorStream
  def MathOps.included(klass)
  klass.class_eval do
  %i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
- register_op op, noop: true do |context, tensor, inputs|
- execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
+ register_op op do |context, tensor, inputs|
+ execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1])
  end
  end

@@ -38,11 +38,11 @@ module TensorStream
  end
  end

- register_op :floor_div, noop: true do |context, tensor, inputs|
+ register_op :floor_div do |context, tensor, inputs|
  if fp_type?(tensor.data_type)
- execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
+ execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1])
  else
- execute_2_operand_func('div', tensor, inputs[0], inputs[1], context)
+ execute_2_operand_func('div', tensor, inputs[0], inputs[1])
  end
  end

@@ -78,11 +78,8 @@ module TensorStream
  cl_n = OpenCL::Int1.new(n)
  cl_k = OpenCL::Int1.new(k)

- transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
- transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
  event_wait_list = build_event_wait_list([a, b])
-
- output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer.op = _cl_program('gemm', ta: !!tensor.options[:transpose_a], tb: !!tensor.options[:transpose_b], dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)

  output_buffer
  end
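The gemm change moves the transpose flags out of the kernel's runtime arguments and into the ERB template parameters, so a kernel specialized for each (ta, tb) combination is compiled once and reused instead of branching on every invocation. A hypothetical sketch of that caching pattern (names here are illustrative, not the gem's API):

    # One compiled program per (name, params) combination.
    def fetch_program(cache, name, params)
      cache[[name, params]] ||= "build #{name} for #{params}" # stands in for an OpenCL build
    end

    cache = {}
    fetch_program(cache, 'gemm', ta: true, tb: false) # compiles once
    fetch_program(cache, 'gemm', ta: true, tb: false) # cache hit, no rebuild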
@@ -94,39 +91,99 @@ module TensorStream
  end
  end

  %i[sum mean].each do |op|
- register_op op, noop: true do |context, tensor, inputs|
+ register_op op do |context, tensor, inputs|
  reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
  end
  end

- register_op :prod, noop: true do |context, tensor, inputs|
- input_a = complete_eval(inputs[0], context)
-
- if input_a.buffer.empty?
+ register_op :prod do |context, tensor, inputs|
+ if inputs[0].shape == [0]
  convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
  else
  reduction(context, tensor, inputs[0], inputs[1], :prod)
  end
  end

- register_op :argmin, buffer: true do |_context, tensor, inputs|
- axis = tensor.options[:axis] || 0
- rank = inputs[0].shape.size
- raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+ # register_op :argmin, buffer: true do |_context, tensor, inputs|
+ # axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
+ # rank = inputs[0].shape.size
+ # raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+ # arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+ # op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
+ # convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+ # end
+
+ # register_op :argmax, buffer: true do |_context, tensor, inputs|
+ # axis = inputs[1].nil? || inputs[1].buffer.nil? || inputs[1].buffer.empty? ? 0 : inputs[1].buffer
+ # rank = inputs[0].shape.size
+ # raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
+ # arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
+ # op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
+ # convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+ # end
+
+ def reduction(child_context, tensor, value, axis, func)
+ if axis.nil?
+ value = _run(value, child_context)
+ size = value.shape.reduce(:*) || 1
+ if value.shape.empty? # for scalars, just return as is
+ value
+ else
+ reduction_threads = 32
+ items_per_thread_threshold = 4
+
+ output_buffer = _create_result_buffer(value.data_type, [], tensor.name)
+ event_wait_list = build_event_wait_list([value])
+
+ if (size > reduction_threads) && ((size / reduction_threads) > items_per_thread_threshold)
+ items_per_thread = size / reduction_threads
+ extra_items = size % reduction_threads
+ intermediate_output_buffer = _create_result_buffer(value.data_type, [reduction_threads], tensor.name)
+
+ temp_values = if extra_items.zero?
+ _cl_program(func, dtype: value.data_type, index: 0, n: items_per_thread, w: items_per_thread).
+ send(:"#{func}_#{value.data_type}", _opencl_queue, [reduction_threads], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ else
+ [_cl_program(func, dtype: value.data_type, index: 0, n: items_per_thread, w: items_per_thread).
+ send(:"#{func}_#{value.data_type}", _opencl_queue, [reduction_threads - 1], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list),
+ _cl_program(func, dtype: value.data_type, index: reduction_threads - 1, n: items_per_thread + extra_items, w: items_per_thread).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, intermediate_output_buffer.cl_buffer, event_wait_list: event_wait_list)]
+ end
+ output_buffer.op = _cl_program(func, dtype: value.data_type, n: reduction_threads, index: 0, w: 0).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, output_buffer.cl_buffer, event_wait_list: temp_values)
+ output_buffer
+ else
+ output_buffer.op = _cl_program(func, dtype: value.data_type, n: size, index: 0, w: 0).send(:"#{func}_#{value.data_type}", _opencl_queue, [1], value.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer
+ end
+ end
+ else
+ return value if value.shape.empty?
+
+ axis = axis.is_a?(OpenCLBuffer) ? read_final_result(axis) : axis
+ input = complete_eval(value, child_context)
+ value = value.buffer.reshape(*value.shape.reverse)
+ rank = input.shape.size - 1
+
+ if axis.is_a?(Array)
+ axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
+ value = value.send(func, x.to_i)
+ end
+ else
+ value = value.send(func, rank - axis.abs)
+ end

- arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
- op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
- convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
- end
+ new_shape = if value.is_a?(NArray)
+ value.shape.reverse
+ else
+ value = [value]
+ []
+ end

- register_op :argmax, buffer: true do |_context, tensor, inputs|
- axis = tensor.options[:axis] || 0
- rank = inputs[0].shape.size
- raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+ new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]

- arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
- op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
- convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+ convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
+ end
  end
  end
  end
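For axis-less reductions the new reduction method splits the flat buffer across 32 work items whenever each would receive more than 4 elements, runs the partial reductions on the device, then folds the 32 partial results in a final single-item pass. The split arithmetic, worked through in plain Ruby for a hypothetical buffer of 1,000 elements:

    reduction_threads = 32
    size = 1_000
    items_per_thread = size / reduction_threads # => 31
    extra_items = size % reduction_threads      # => 8

    # 31 work items reduce 31 elements each (961 total); the last work item
    # takes 31 + 8 = 39, then one final pass folds the 32 partial results.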
@@ -121,11 +121,11 @@ module TensorStream
  end

  register_op :apply_adagrad do |context, tensor, inputs|
- target_var, accum, lr, grad = inputs
+ _target_var, _accum, lr, grad = inputs

  assign = tensor.inputs[0] || tensor
  assign_acc = tensor.inputs[1]
-
+
  assign.buffer.dirty = true
  assign_acc.buffer.dirty = true
  output_buffer = assign.buffer
@@ -133,7 +133,7 @@ module TensorStream
  work_group = [output_buffer.total_elements]

  event_wait_list = build_event_wait_list(inputs)
- event = call_program('apply_adagrad',
+ event = call_program('apply_adagrad',
  output_buffer.data_type,
  work_group,
  lr.cl_buffer,
@@ -195,7 +195,7 @@ module TensorStream
  event_wait_list = build_event_wait_list(inputs)
  work_group = [output_buffer.total_elements]

- event = call_program('apply_rms_prop', output_buffer.data_type,
+ event = call_program('apply_rms_prop', output_buffer.data_type,
  work_group,
  lr.cl_buffer,
  rho.cl_buffer,
@@ -298,7 +298,7 @@ module TensorStream
  end

  b = wrap_opencl(labels, data_type: inputs[0].data_type, name: "#{tensor.name}_label")
-
+
  event_wait_list = build_event_wait_list(inputs)
  dtype = tensor.data_type
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
@@ -335,6 +335,90 @@ module TensorStream
  output_buffer.op = event
  output_buffer
  end
+
+ %i[relu6].each do |op|
+ register_op op, noop: true do |context, tensor, inputs|
+ execute_func(op.to_s, tensor, inputs[0], context)
+ end
+ end
+
+ # Fast per pixel parallel convolution operation
+ register_op :conv2d do |_context, tensor, inputs|
+ filter = inputs[1]
+ batch, height, width, channel = inputs[0].shape
+ filter_shape = filter.shape
+ strides = tensor.options[:strides]
+ height_stride = strides[1]
+ width_stride = strides[2]
+
+ raise TensorStream::ValueError, "Current implementation does not yet support strides in the batch and depth dimensions." if strides[0] != 1 || strides[3] != 1
+
+ event_wait_list = build_event_wait_list(inputs)
+
+ f_height, f_width, in_channels, out_channels = filter_shape
+ out_shape = [batch, height / height_stride, width / width_stride, out_channels]
+ output_buffer = _create_result_buffer(tensor.data_type, out_shape, tensor.name)
+
+ cl_image_height = OpenCL::Int1.new(height)
+ cl_image_width = OpenCL::Int1.new(width)
+
+ work_dimen = [batch, height / height_stride, width / width_stride]
+
+ output_buffer.op = _cl_program("conv2d", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channel, out_ch: out_channels, stride: [height_stride, width_stride]).send(:conv2d, _opencl_queue, work_dimen, cl_image_height, cl_image_width, inputs[0].cl_buffer,
+ inputs[1].cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer
+ end
+
+ register_op :conv2d_backprop_input do |context, tensor, inputs|
+ image_shape, filter, grad = inputs
+ filter_shape = filter.shape
+
+ strides = tensor.options[:strides]
+ height_stride = strides[1]
+ width_stride = strides[2]
+
+ image_shape = read_final_result(complete_eval(image_shape, context))
+
+ event_wait_list = build_event_wait_list(inputs)
+ output_buffer = _create_result_buffer(tensor.data_type, image_shape, tensor.name)
+
+ batch, height, width, channels = image_shape
+ f_height, f_width, in_channels, out_channels = filter_shape
+
+ work_dimen = [batch, height, width]
+
+ cl_image_height = OpenCL::Int1.new(height)
+ cl_image_width = OpenCL::Int1.new(width)
+
+ output_buffer.op = _cl_program("conv2d_backprop_input", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: out_channels, stride: [height_stride, width_stride]).send(:conv2d_backprop_input, _opencl_queue, work_dimen, cl_image_height, cl_image_width,
+ filter.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer
+ end
+
+ register_op :conv2d_backprop_filter do |context, tensor, inputs|
+ images, filter_shape, grad = inputs
+
+ event_wait_list = build_event_wait_list(inputs)
+
+ strides = tensor.options[:strides]
+ height_stride = strides[1]
+ width_stride = strides[2]
+
+ filter_shape = read_final_result(complete_eval(filter_shape, context))
+ output_buffer = _create_result_buffer(tensor.data_type, filter_shape, tensor.name)
+
+ batch_size, height, width, channels = images.shape
+ f_height, f_width, input_channels, output_channels = filter_shape
+ work_dimen = [f_height, f_width, output_channels]
+
+ cl_batch_size = OpenCL::Int1.new(batch_size)
+ cl_image_height = OpenCL::Int1.new(height)
+ cl_image_width = OpenCL::Int1.new(width)
+
+ output_buffer.op = _cl_program("conv2d_backprop_filter", dtype: tensor.data_type, fh: f_height, fw: f_width, ch: channels, out_ch: output_channels, stride: [height_stride, width_stride]).send(:conv2d_backprop_filter, _opencl_queue, work_dimen, cl_batch_size, cl_image_height, cl_image_width,
+ images.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+ output_buffer
+ end
  end
  end
  end
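The new conv2d op launches one work item per output pixel (work dimensions [batch, out_height, out_width]) and derives the output shape directly from the strides. The shape arithmetic from the op, worked through with assumed example dimensions:

    batch, height, width = 8, 28, 28
    out_channels = 16
    height_stride, width_stride = 2, 2

    out_shape = [batch, height / height_stride, width / width_stride, out_channels]
    # => [8, 14, 14, 16]
    work_dimen = [batch, height / height_stride, width / width_stride]
    # => [8, 14, 14], one work item per output pixel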
@@ -37,9 +37,13 @@ module TensorStream
  return buffer[0] != 0 if data_type == :boolean
  return buffer[0]
  end
-
- result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
+
+ result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
  data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
  end
+
+ def self.nil_buffer(owner, name, data_type)
+ OpenCLBuffer.new(owner, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
+ end
  end
  end
@@ -35,7 +35,8 @@ module TensorStream
  end

  ##
- # PURE ruby evaluator used for testing and development
+ # OpenCL hardware accelerated evaluator
+ #
  class OpenclEvaluator < BaseEvaluator
  attr_accessor :retain
  attr_reader :opencl_device, :opencl_context
@@ -53,41 +54,57 @@ module TensorStream
  super
  _create_opencl_context
  @opencl_device = device.native_device
+
+ @max_work_item_dimensions = @opencl_device.max_work_item_dimensions
+ @max_work_item_sizes = @opencl_device.max_work_item_sizes
+ @max_work_group_size = @opencl_device.max_work_group_size
+
+ @local_mem_size = @opencl_device.local_mem_size
+ @device_type = @opencl_device.type.to_s.downcase
+
  create_command_queue
  end

- def self.query_supported_devices
- devices = query_devices_with_score
- devices.sort { |a, b| a[1] <=> b[1] }.map do |d|
- opencl_to_device(d)
+ class << self
+ def query_supported_devices
+ devices = query_devices_with_score
+ devices.sort_by { |a| a[1] }.map do |d|
+ opencl_to_device(d)
+ end
  end
- end

- def self.fetch_device(query = [])
- devices = query_devices_with_score
- platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
- opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
- end
+ def fetch_device(query = [])
+ devices = query_devices_with_score
+ platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
+ opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
+ end

- def self.opencl_to_device(dev)
- device = dev[0]
- index = dev[3]
- platform_name = device.platform.name.tr(' ', '_').downcase
- uri = [platform_name, index].join(':')
+ def opencl_to_device(dev)
+ device = dev[0]
+ index = dev[3]
+ platform_name = device.platform.name.tr(' ', '_').downcase
+ uri = [platform_name, index].join(':')

- device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
+ device_type = device.type.to_s == 'GPU' ? :gpu : :cpu

- OpenclDevice.new(uri, device_type, self).tap do |d|
- d.native_device = device
+ OpenclDevice.new(uri, device_type, self).tap do |d|
+ d.native_device = device
+ end
  end
- end

- ##
- # Select the best device available in the system for this evaluator
- def self.default_device
- devices = OpenclEvaluator.query_devices_with_score
- device = devices.max { |a, b| a[1] <=> b[1] }
- opencl_to_device(device)
+ ##
+ # Select the best device available in the system for this evaluator
+ def default_device
+ devices = OpenclEvaluator.query_devices_with_score
+ device = devices.max { |a, b| a[1] <=> b[1] }
+ opencl_to_device(device)
+ end
+
+ def getset_global_opencl_context(platform)
+ @global_opencl_context ||= {}
+ @global_opencl_context[platform] ||= yield
+ @global_opencl_context[platform]
+ end
  end

  # opencl evaluator main entrypoint
@@ -228,16 +245,22 @@ module TensorStream

  def _create_opencl_context(device = nil)
  if device.nil?
- @@global_opencl_context ||= begin
- all_devices = OpenclEvaluator.query_supported_devices.map(&:native_device)
- puts "global context created for #{all_devices}"
- OpenCL.create_context(all_devices)
+ all_devices_by_platform = {}
+ TensorStream::Evaluator::OpenclEvaluator.query_supported_devices.map(&:native_device).each do |d|
+ all_devices_by_platform[d.platform.name] ||= []
+ all_devices_by_platform[d.platform.name] << d
  end

- @opencl_context = @@global_opencl_context
+ all_devices_by_platform.each do |platform, devices|
+ @opencl_context = TensorStream::Evaluator::OpenclEvaluator.getset_global_opencl_context(platform) do
+ OpenCL.create_context(devices)
+ end
+ end
  else
  puts "context created for #{device.native_device}"
- @opencl_context = OpenCL.create_context(device.native_device)
+ @opencl_context = TensorStream::Evaluator::OpenclEvaluator.getset_global_opencl_context(device.native_device.platform) do
+ OpenCL.create_context(device.native_device)
+ end
  end
  end

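Instead of one process-wide context spanning every device, _create_opencl_context now groups devices by platform and memoizes one shared context per platform through getset_global_opencl_context (an OpenCL context may only contain devices from a single platform). An illustrative sketch of the grouping step, with made-up device data:

    # Group devices by platform before creating one shared context per platform.
    Device = Struct.new(:name, :platform)
    devices = [Device.new('gpu0', 'NVIDIA CUDA'), Device.new('gpu1', 'NVIDIA CUDA'), Device.new('cpu0', 'Intel OpenCL')]

    devices.group_by(&:platform).each do |platform, devs|
      puts "one shared context for #{platform}: #{devs.map(&:name).join(', ')}"
    end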
@@ -269,11 +292,12 @@ module TensorStream
  @context[:_cache][kernel_cache_key] ||=
  begin
  # puts "building #{kernel_cache_key}"
- file_path = File.join('/tmp', "#{kernel}.#{suffix}.cl")
+ file_path = File.join(ENV['TS_OPENCL_FILE_CACHE_PATH'] || '/tmp', "#{kernel}.#{suffix}.cl")
  source = if File.exist?(file_path) && ENV['TS_OPENCL_FILE_CACHE']
  File.read(file_path)
  else
- filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+ filenames = ['', ".#{@device_type}"].map { |type| %w[cl.erb cl].map { |ext| cl_template_path("#{kernel}#{type}", ext) } }.flatten
+ filename = filenames.find { |n| File.exist?(n) }
  raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?

  source = File.read(filename)
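Kernel source lookup now also considers device-type-specific templates as fallbacks, and the on-disk kernel cache directory can be overridden with the TS_OPENCL_FILE_CACHE_PATH environment variable. Assuming a @device_type of "gpu", the candidate list works out as follows (paths simplified; the real code resolves them through cl_template_path):

    kernel = 'gemm'
    device_type = 'gpu'
    candidates = ['', ".#{device_type}"].map { |type|
      %w[cl.erb cl].map { |ext| "#{kernel}#{type}.#{ext}" }
    }.flatten
    # => ["gemm.cl.erb", "gemm.cl", "gemm.gpu.cl.erb", "gemm.gpu.cl"]
    # the first existing file wins, so generic templates still take precedence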
@@ -322,6 +346,7 @@ module TensorStream

  def eval_variable(tensor, _child_context)
  raise "variable #{tensor.name} not initialized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
+
  tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
  tensor.buffer
  end
@@ -339,7 +364,7 @@ module TensorStream
  end
  end

- register_op :identity do |context, tensor, inputs|
+ register_op :identity do |_context, tensor, inputs|
  value = inputs[0]
  buffer = OpenCLBuffer.new(self, name: tensor.name, data_type: tensor.data_type, shape: value.shape, buffer: value.buffer, cl_buffer: value.cl_buffer)
  buffer.op = build_event_wait_list(inputs)
@@ -351,25 +376,26 @@ module TensorStream
  end

  register_op :assign_add do |context, tensor, inputs|
- value = execute_2_operand_func('add', tensor, inputs[0], inputs[1], context)
+ value = execute_2_operand_func('add', tensor, inputs[0], inputs[1])
  assign_var(tensor, value, context)
  end

  register_op :assign_sub do |context, tensor, inputs|
- value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1], context)
+ value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1])
  assign_var(tensor, value, context)
  end

- register_op :variable, noop: true do |context, tensor, inputs|
+ register_op :variable, noop: true do |_context, tensor, _inputs|
  variable = tensor.inputs[0]
  raise "variable #{tensor.name} not initialized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
+
  variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
  variable.buffer
  end

  %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
- register_op op, noop: true do |context, tensor, inputs|
- execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
+ register_op op do |context, tensor, inputs|
+ execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], 'cond')
  end
  end

@@ -463,11 +489,11 @@ module TensorStream
  rescue EvaluatorExcecutionException => e
  _opencl_queue.finish # dump queue
  puts e.message
- raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
+ raise e, "error #{e.message} while evaluating #{tensor.name} : defined at #{tensor.source}"
  rescue TensorStreamError => e
  _opencl_queue.finish # dump queue
  puts e.message
- raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
+ raise e, "error #{e.message} while evaluating #{tensor.name} : defined at #{tensor.source}"
  rescue StandardError => e
  _opencl_queue.finish # dump queue
  puts e.message
@@ -496,6 +522,7 @@ module TensorStream
  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
  return @context[cache_key] if @context.key?(cache_key)
  return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]
+
  @context[cache_key] = if tensor.value.is_a?(Tensor)
  _run(tensor.value, child_context)
  else
@@ -512,7 +539,6 @@ module TensorStream
  buffer = complete_eval(b, child_context)

  if assign.buffer
- # buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
  event_wait_list = build_event_wait_list([buffer, assign.buffer])
  assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
  _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: event_wait_list)
@@ -524,23 +550,32 @@ module TensorStream
  assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
  assign.value = value
  end
+
  assign.buffer.dirty = true
  assign.buffer
  end

- def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
- a = _run(input_a, child_context)
- b = _run(input_b, child_context)
+ def execute_2_operand_func(op_name, tensor, a, b, prog_name = nil)
  a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  dtype = tensor.data_type
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
- return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]
+ return OpenCLBuffer.nil_buffer(self, "out_#{tensor.name}", dtype) if result_shape == [0]
+
  output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
  a, b, prog, switch_operands = select_program(a, b, op_name)
  m, n = result_shape
- work_group = [m || 1, n || 1]
- cl_m = OpenCL::Int1.new(m || 1)
- cl_n = OpenCL::Int1.new(n || 1)
+
+ work_group = if result_shape.size > 2 && (b.shape.size.zero? || (a.shape == b.shape))
+ [m, result_shape.reduce(:*) / m]
+ elsif result_shape.size <= 2
+ [m || 1, n || 1]
+ else
+ raise "rank > 2 not supported for now"
+ end
+
+ cl_m = OpenCL::Int1.new(work_group[0])
+ cl_n = OpenCL::Int1.new(work_group[1])
+
  cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition

  event_wait_list = build_event_wait_list([a, b]) # add dependency wait list
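execute_2_operand_func no longer evaluates its own inputs (the ops that call it dropped the noop: true flag, so operands arrive already computed), and it now accepts results of rank greater than 2 when the operand shapes allow it, by folding the trailing dimensions into the second work dimension. The work group arithmetic with an assumed rank-3 result:

    result_shape = [2, 3, 4]  # rank 3, operands same shape
    m = result_shape.first
    work_group = [m, result_shape.reduce(:*) / m]
    # => [2, 12], covering all 24 elements as a 2 x 12 grid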
@@ -610,6 +645,7 @@ module TensorStream

  def auto_type_cast(a, b, name: nil)
  return [a, b] if a.data_type == b.data_type
+
  m, n = b.shape
  work_group = [m || 1, n || 1]
  event_wait_list = build_event_wait_list([b])
@@ -624,6 +660,7 @@ module TensorStream

  def type_cast(source, data_type, name: nil)
  return source if source.data_type == data_type
+
  m, n = source.shape
  work_group = [m || 1, n || 1]
  event_wait_list = [source.op].compact
@@ -673,8 +710,6 @@ module TensorStream

  return nil if buffer.nil?

-
-
  cl_buffer = unless value.flatten.empty?
  cl_buffer_size = 1 if cl_buffer_size.zero?
  _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
@@ -682,6 +717,7 @@ module TensorStream

  @context[:_cache][cache_key] = OpenCLBuffer.new(self, name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
  end
+
  if data_type == :string
  value[0].each_byte.with_index do |c, index|
  cl_object.buffer[index] = c
@@ -704,11 +740,11 @@ module TensorStream
  cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
  end

- if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
- write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
+ # if OpenCL buffer is valid enqueue a write
+ if cl_object.cl_buffer && value && (!value.is_a?(Array) || !value.empty?)
+ cl_object.op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
  end

- cl_object.op = write_op
  cl_object
  end

@@ -718,7 +754,7 @@ module TensorStream
  NArray.sfloat(narray_size)
  when :float64
  NArray.float(narray_size)
- when :int, :int32, :int64, :uint64, :uint32 #NArray does not have 64 bit int types
+ when :int, :int32, :int64, :uint64, :uint32 # NArray does not have 64 bit int types
  NArray.int(narray_size)
  when :int16, :uint16
  NArray.sint(narray_size)
@@ -736,7 +772,8 @@ module TensorStream
  end

  def _create_result_buffer(data_type, shape, name)
- return OpenCLBuffer.new(self, name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
+ return OpenCLBuffer.nil_buffer(self, name, data_type) if shape == [0]
+
  cache_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
  @context[:_cache][:_cl_buffers][cache_key] ||= begin
  # puts "create result buffer #{cache_key}"
@@ -759,7 +796,7 @@ module TensorStream
  region = OpenCL::BufferRegion::new(start, buffer.size * buffer.element_size)
  cl_buffer = parent_buffer.cl_buffer.create_sub_buffer(OpenCL::BUFFER_CREATE_TYPE_REGION, region)
  OpenCLBuffer.new(self, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
- else
+ else # source buffer already a sub buffer, OpenCL does not allow sub buffers from sub buffers
  _create_result_buffer(tensor.data_type, shape, name)
  end
  end
@@ -768,7 +805,7 @@ module TensorStream

  if buffer.cl_buffer.associated_memobject
  buffer.op = parent_buffer.op
- else
+ else # source buffer already a sub buffer, so we need to do a copy instead
  region_size_in_bytes = buffer.buffer.size * buffer.buffer.element_size
  start = index * region_size_in_bytes
  region = [region_size_in_bytes, 1, 1]
@@ -841,6 +878,7 @@ module TensorStream

  def _reduced_shape(input_shape, axes)
  return [] if axes.nil? # reduce to scalar
+
  axes = [axes] unless axes.is_a?(Array)
  return input_shape if axes.empty?

@@ -850,39 +888,6 @@ module TensorStream
  input_shape
  end

- def reduction(child_context, tensor, a, b, func)
- input = complete_eval(a, child_context)
- axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b
- if axis.nil?
- red = input.buffer.send(func)
- convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
- else
- return input if input.shape.empty?
-
- value = input.buffer.reshape(*input.shape.reverse)
- rank = input.shape.size - 1
-
- if axis.is_a?(Array)
- axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
- value = value.send(func, x.to_i)
- end
- else
- value = value.send(func, rank - axis.abs)
- end
-
- new_shape = if value.is_a?(NArray)
- value.shape.reverse
- else
- value = [value]
- []
- end
-
- new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
-
- convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
- end
- end
-
  # selects variants of cl programs depending on input
  def select_program(input_a, input_b, op)
  return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape