tensor_stream-opencl 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +5 -0
  5. data/CODE_OF_CONDUCT.md +74 -0
  6. data/Gemfile +6 -0
  7. data/Gemfile.lock +51 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +58 -0
  10. data/Rakefile +6 -0
  11. data/bin/console +14 -0
  12. data/bin/setup +8 -0
  13. data/lib/tensor_stream/opencl.rb +7 -0
  14. data/lib/tensor_stream/opencl/kernels/_bool_operand.cl +45 -0
  15. data/lib/tensor_stream/opencl/kernels/_operand.cl +45 -0
  16. data/lib/tensor_stream/opencl/kernels/abs.cl +20 -0
  17. data/lib/tensor_stream/opencl/kernels/acos.cl +8 -0
  18. data/lib/tensor_stream/opencl/kernels/add.cl +3 -0
  19. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +23 -0
  20. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +9 -0
  21. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +16 -0
  22. data/lib/tensor_stream/opencl/kernels/argmax.cl +8 -0
  23. data/lib/tensor_stream/opencl/kernels/argmin.cl +8 -0
  24. data/lib/tensor_stream/opencl/kernels/asin.cl +9 -0
  25. data/lib/tensor_stream/opencl/kernels/cast.cl +10 -0
  26. data/lib/tensor_stream/opencl/kernels/ceil.cl +8 -0
  27. data/lib/tensor_stream/opencl/kernels/cond.cl.erb +6 -0
  28. data/lib/tensor_stream/opencl/kernels/cos.cl +8 -0
  29. data/lib/tensor_stream/opencl/kernels/div.cl.erb +3 -0
  30. data/lib/tensor_stream/opencl/kernels/exp.cl +8 -0
  31. data/lib/tensor_stream/opencl/kernels/floor.cl +8 -0
  32. data/lib/tensor_stream/opencl/kernels/floor_div.cl +48 -0
  33. data/lib/tensor_stream/opencl/kernels/floor_mod.cl +3 -0
  34. data/lib/tensor_stream/opencl/kernels/gemm.cl +32 -0
  35. data/lib/tensor_stream/opencl/kernels/log.cl +8 -0
  36. data/lib/tensor_stream/opencl/kernels/log1p.cl +8 -0
  37. data/lib/tensor_stream/opencl/kernels/log_softmax.cl +26 -0
  38. data/lib/tensor_stream/opencl/kernels/max.cl +46 -0
  39. data/lib/tensor_stream/opencl/kernels/min.cl +46 -0
  40. data/lib/tensor_stream/opencl/kernels/mod.cl +3 -0
  41. data/lib/tensor_stream/opencl/kernels/mul.cl +3 -0
  42. data/lib/tensor_stream/opencl/kernels/negate.cl +8 -0
  43. data/lib/tensor_stream/opencl/kernels/pack.cl +24 -0
  44. data/lib/tensor_stream/opencl/kernels/pow.cl +46 -0
  45. data/lib/tensor_stream/opencl/kernels/real_div.cl +3 -0
  46. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +8 -0
  47. data/lib/tensor_stream/opencl/kernels/round.cl +8 -0
  48. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +9 -0
  49. data/lib/tensor_stream/opencl/kernels/sigmoid_grad.cl +55 -0
  50. data/lib/tensor_stream/opencl/kernels/sign.cl +21 -0
  51. data/lib/tensor_stream/opencl/kernels/sin.cl +9 -0
  52. data/lib/tensor_stream/opencl/kernels/softmax.cl +26 -0
  53. data/lib/tensor_stream/opencl/kernels/softmax_cross.cl +32 -0
  54. data/lib/tensor_stream/opencl/kernels/softmax_cross_grad.cl +28 -0
  55. data/lib/tensor_stream/opencl/kernels/softmax_grad.cl +46 -0
  56. data/lib/tensor_stream/opencl/kernels/sqrt.cl +9 -0
  57. data/lib/tensor_stream/opencl/kernels/square.cl +9 -0
  58. data/lib/tensor_stream/opencl/kernels/squared_difference.cl +53 -0
  59. data/lib/tensor_stream/opencl/kernels/sub.cl +3 -0
  60. data/lib/tensor_stream/opencl/kernels/tan.cl +8 -0
  61. data/lib/tensor_stream/opencl/kernels/tanh.cl +8 -0
  62. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +7 -0
  63. data/lib/tensor_stream/opencl/kernels/where.cl +8 -0
  64. data/lib/tensor_stream/opencl/math_ops.rb +133 -0
  65. data/lib/tensor_stream/opencl/nn_ops.rb +191 -0
  66. data/lib/tensor_stream/opencl/opencl_buffer.rb +35 -0
  67. data/lib/tensor_stream/opencl/opencl_device.rb +5 -0
  68. data/lib/tensor_stream/opencl/opencl_evaluator.rb +933 -0
  69. data/lib/tensor_stream/opencl/opencl_template_helper.rb +99 -0
  70. data/lib/tensor_stream/opencl/version.rb +5 -0
  71. data/tensor_stream-opencl.gemspec +40 -0
  72. metadata +185 -0
@@ -0,0 +1,191 @@
module TensorStream
  module OpenCLHelpers
    # Neural-network operations (optimizer updates and the softmax family)
    # that are executed through OpenCL kernels.
    module NNOps
      # Registers every op defined below on the class that includes this module.
      def self.included(klass)
        klass.class_eval do
          # Gradient descent step applied directly to the variable's buffer.
          register_op :apply_gradient_descent do |_context, tensor, inputs|
            _target_var, learning_rate, delta = inputs

            assign = tensor.inputs[0] || tensor
            assign.buffer.dirty = true # force buffer copy when variable is read externally
            output_buffer = assign.buffer

            rows, cols = output_buffer.shape
            rows ||= 1
            cols ||= 1
            work_group = [rows, cols]
            cl_m = OpenCL::Int1.new(rows)
            cl_n = OpenCL::Int1.new(cols)

            event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
            dtype = output_buffer.data_type
            event = _cl_program('apply_gradient', dtype: dtype)
                    .send(:"apply_gradient_#{dtype}", _opencl_queue, work_group, cl_m, cl_n,
                          delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer,
                          event_wait_list: event_wait_list)
            output_buffer.op = event
            output_buffer
          end

          # Gradient descent with momentum; updates both the variable buffer
          # and its accumulator buffer.
          register_op :apply_momentum do |_context, tensor, inputs|
            _target_var, _momentum_var, learning_rate, grad, momentum = inputs

            assign = tensor.inputs[0] || tensor
            assign_acc = tensor.inputs[1]
            # force buffer copy when these variables are read externally
            assign.buffer.dirty = true
            assign_acc.buffer.dirty = true

            output_buffer = assign.buffer

            rows, cols = output_buffer.shape
            rows ||= 1
            cols ||= 1
            work_group = [rows, cols]
            cl_m = OpenCL::Int1.new(rows)
            cl_n = OpenCL::Int1.new(cols)

            event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
            dtype = output_buffer.data_type
            event = _cl_program('apply_momentum', nesterov: tensor.options[:use_nesterov], dtype: dtype)
                    .send(:"apply_momentum_#{dtype}", _opencl_queue, work_group, cl_m, cl_n,
                          grad.cl_buffer, learning_rate.cl_buffer, momentum.cl_buffer,
                          output_buffer.cl_buffer, assign_acc.buffer.cl_buffer,
                          event_wait_list: event_wait_list)
            output_buffer.op = event
            assign_acc.buffer.op = event
            output_buffer
          end

          # Registered with an empty body: the block evaluates to nil and
          # performs no update.
          register_op :apply_adadelta do |_context, _tensor, _inputs|
          end

          # Adam optimization step; updates the variable plus its m and v buffers.
          register_op :apply_adam do |_context, tensor, inputs|
            _target_var, _m, _v, beta1_power, beta2_power, lr_t, beta1_t, beta2_t, epsilon_t, grad = inputs

            assign = tensor.inputs[0] || tensor
            assign_m = tensor.inputs[1]
            assign_v = tensor.inputs[2]

            # mark variable buffers as dirty to force a buffer copy when they
            # are read externally
            assign.buffer.dirty = true
            assign_m.buffer.dirty = true
            assign_v.buffer.dirty = true

            output_buffer = assign.buffer

            rows, cols = output_buffer.shape
            rows ||= 1
            cols ||= 1
            work_group = [rows, cols]
            cl_m = OpenCL::Int1.new(rows)
            cl_n = OpenCL::Int1.new(cols)

            event_wait_list = build_event_wait_list(inputs)
            dtype = output_buffer.data_type
            event = _cl_program('apply_adam', dtype: dtype)
                    .send(:"apply_adam_#{dtype}", _opencl_queue, work_group, cl_m, cl_n,
                          grad.cl_buffer,
                          lr_t.cl_buffer,
                          beta1_power.cl_buffer,
                          beta2_power.cl_buffer,
                          beta1_t.cl_buffer,
                          beta2_t.cl_buffer,
                          epsilon_t.cl_buffer,
                          assign_m.buffer.cl_buffer,
                          assign.buffer.cl_buffer,
                          assign_v.buffer.cl_buffer,
                          event_wait_list: event_wait_list)
            output_buffer.op = event
            assign_m.buffer.op = event
            assign_v.buffer.op = event
            output_buffer
          end

          # Softmax over the input; one work item per entry of the first dimension.
          register_op :softmax do |_context, tensor, inputs|
            a = inputs[0]
            event_wait_list = build_event_wait_list(inputs)
            dtype = tensor.data_type
            output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)

            m, n = a.shape
            work_group = [m]
            n ||= m # a single dimension is used as the row length
            cl_n = OpenCL::Int1.new(n || 1)

            kernel = _cl_program('softmax', dtype: dtype)
            output_buffer.op = kernel.send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n,
                                           a.cl_buffer, output_buffer.cl_buffer,
                                           event_wait_list: event_wait_list)
            output_buffer
          end

          # Log-softmax over the logits; same launch layout as :softmax.
          register_op :log_softmax do |_context, tensor, inputs|
            a = inputs[0] # logits
            event_wait_list = build_event_wait_list(inputs)
            dtype = tensor.data_type
            output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)

            m, n = a.shape
            work_group = [m]
            n ||= m
            cl_n = OpenCL::Int1.new(n || 1)

            kernel = _cl_program('log_softmax', dtype: dtype)
            output_buffer.op = kernel.send(:"log_softmax_#{dtype}", _opencl_queue, work_group, cl_n,
                                           a.cl_buffer, output_buffer.cl_buffer,
                                           event_wait_list: event_wait_list)
            output_buffer
          end

          # Softmax cross entropy; returns an OutputGroup holding the reduced
          # loss and the backprop buffer.
          register_op :softmax_cross_entropy_with_logits_v2 do |context, tensor, inputs|
            a = inputs[0] # logits
            b = inputs[1] # labels
            event_wait_list = build_event_wait_list(inputs)
            dtype = tensor.data_type
            output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
            output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
            rank = a.shape.size - 1
            m, n = a.shape
            work_group = [m]
            n ||= m
            cl_n = OpenCL::Int1.new(n || 1)

            kernel = _cl_program('softmax_cross', dtype: dtype)
            event = kernel.send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n,
                                a.cl_buffer, b.cl_buffer,
                                output_buffer.cl_buffer, output_buffer_backprop.cl_buffer,
                                event_wait_list: event_wait_list)
            output_buffer.op = event
            output_buffer_backprop.op = event

            loss = reduction(context, tensor, output_buffer, rank, :sum)
            result_type = tensor.inputs[0].data_type
            TensorStream::Evaluator::OutputGroup.new([loss, output_buffer_backprop], [result_type, tensor.inputs[0].data_type])
          end

          # Gradient of the softmax cross entropy op.
          register_op :softmax_cross_entropy_with_logits_v2_grad do |_context, tensor, inputs|
            a = inputs[0] # logits
            b = inputs[1] # labels
            c = inputs[2] # grads
            event_wait_list = build_event_wait_list(inputs)
            dtype = tensor.data_type
            output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)

            m, n = a.shape
            work_group = [m]
            n ||= m
            cl_n = OpenCL::Int1.new(n || 1)

            kernel = _cl_program('softmax_cross_grad', dtype: dtype)
            output_buffer.op = kernel.send(:"softmax_cross_grad_#{dtype}", _opencl_queue, work_group, cl_n,
                                           a.cl_buffer, b.cl_buffer, c.cl_buffer, output_buffer.cl_buffer,
                                           event_wait_list: event_wait_list)
            output_buffer
          end

          # Gradient of softmax; the row length is also passed to the kernel
          # template as :size.
          register_op :softmax_grad do |_context, tensor, inputs|
            a, grad = inputs

            event_wait_list = build_event_wait_list(inputs)
            dtype = tensor.data_type
            output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)

            m, n = a.shape
            work_group = [m]
            n ||= m
            cl_n = OpenCL::Int1.new(n || 1)

            kernel = _cl_program('softmax_grad', dtype: dtype, size: n)
            output_buffer.op = kernel.send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n,
                                           a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer,
                                           event_wait_list: event_wait_list)
            output_buffer
          end
        end
      end
    end
  end
end
@@ -0,0 +1,35 @@
module TensorStream
  # Buffer used by the OpenCL evaluator. Pairs a host-side buffer with its
  # OpenCL counterpart and the event (op) that last produced it.
  class OpenCLBuffer < Buffer
    include ArrayOpsHelper

    attr_accessor :shape, :buffer, :cl_buffer, :op

    def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
      @data_type = data_type
      @name = name
      @shape = shape
      @buffer = buffer
      @cl_buffer = cl_buffer
      @op = op
    end

    # Converts the buffer contents to plain Ruby values.
    #
    # Returns [] for an empty buffer, a single value when the shape is empty,
    # and a nested array otherwise. Boolean buffers are mapped to true/false
    # by comparing each element against 0.
    def to_ruby
      return [] if buffer.empty?

      if dirty
        # refresh the host copy from the device before reading it
        queue = op.command_queue
        queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
        op.command_queue.finish
        self.dirty = false
      end

      if shape.empty?
        scalar = buffer[0]
        return data_type == :boolean ? scalar != 0 : scalar
      end

      nested = buffer.reshape(*shape.map(&:to_i).reverse).to_a
      return nested unless data_type == :boolean

      process_function_op(nested, ->(a, _b) { a != 0 })
    end
  end
end
@@ -0,0 +1,5 @@
module TensorStream
  # Device entry used by the OpenCL evaluator; in addition to what
  # TensorStream::Device provides it stores the native device object.
  class OpenclDevice < Device
    attr_reader :native_device
    attr_writer :native_device
  end
end
@@ -0,0 +1,933 @@
1
+ require 'tensor_stream/evaluator/operation_helpers/random_gaussian'
2
+ require 'tensor_stream/evaluator/operation_helpers/array_ops_helper'
3
+ require 'tensor_stream/evaluator/operation_helpers/math_helper'
4
+ require 'tensor_stream/evaluator/buffer'
5
+ require 'tensor_stream/opencl/opencl_buffer'
6
+ require 'tensor_stream/opencl/opencl_template_helper'
7
+ require 'tensor_stream/device'
8
+ require 'tensor_stream/evaluator/opencl/opencl_device'
9
+ require 'opencl_ruby_ffi'
10
+ require 'narray_ffi'
11
+ require 'tensor_stream/evaluator/base_evaluator'
12
+ require 'tensor_stream/opencl/math_ops'
13
+ require 'tensor_stream/opencl/nn_ops'
14
+ require 'tensor_stream/helpers/op_helper'
15
+
16
+ module TensorStream
17
+ module Evaluator
# Error type for a full evaluation that cannot be carried out.
# NOTE(review): not raised anywhere in the visible part of this file.
class FullEvalNotPossible < RuntimeError; end
20
+
# Raised when evaluating part of the graph fails. Keeps a reference to the
# tensor being evaluated and to the exception that caused the failure.
class EvaluatorExcecutionException < RuntimeError
  attr_reader :tensor

  # exception - the original error
  # tensor    - the tensor that was being evaluated
  def initialize(exception, tensor)
    @exception, @tensor = exception, tensor
  end

  # The original error that was wrapped by this exception.
  def wrapped_exception
    @exception
  end
end
34
+
35
+ ## OpenCL evaluator: runs tensor operations through kernels on an OpenCL device
36
+ class OpenclEvaluator < BaseEvaluator
37
+ attr_accessor :retain
38
+ attr_reader :opencl_device
39
+
40
+ include TensorStream::OpHelper
41
+ include TensorStream::ArrayOpsHelper
42
+ include TensorStream::MathHelper
43
+ include TensorStream::OpenCLHelpers::MathOps
44
+ include TensorStream::OpenCLHelpers::NNOps
45
+
# Sets up the evaluator for the given device: creates the OpenCL context for
# the device's native handle, remembers that handle and creates the command
# queue used for all subsequent work.
def initialize(session, device, thread_pool: nil, log_intermediates: false)
  super
  native = device.native_device
  _create_opencl_context(native)
  @opencl_device = native
  create_command_queue
end
52
+
# Lists every available OpenCL device as an OpenclDevice, ordered from the
# highest score to the lowest.
#
# The previous implementation used `sort { |a| a[1] }`, which hands the score
# back as the comparison result instead of comparing two entries, so the
# resulting order did not reflect the scores. `sort_by` orders by score.
def self.query_supported_devices
  ranked = query_devices_with_score.sort_by { |entry| entry[1] }.reverse
  ranked.map { |entry| opencl_to_device(entry) }
end
59
+
# Looks up a device from a query of the form [platform_pattern, index].
#
# query[0] is matched (case-insensitively, as a regular expression) against
# the platform name with spaces replaced by underscores; query[1] is the
# index into the matching devices and is clamped to the last match.
#
# A missing platform pattern (including the default empty query) now matches
# every platform instead of failing with NoMethodError on nil.
def self.fetch_device(query = [])
  platform_pattern = /#{query[0].to_s.downcase}/
  devices = query_devices_with_score
  platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ platform_pattern }
  opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
end
65
+
# Wraps one entry produced by query_devices_with_score in an OpenclDevice.
#
# d - array whose first element is the native device and whose fourth
#     element is the device index.
#
# The device uri is "<platform name, lowercased, spaces as underscores>:<index>".
def self.opencl_to_device(d)
  native, _score, _platform_name, index = d
  uri = "#{native.platform.name.tr(' ', '_').downcase}:#{index}"
  device_type = native.type.to_s == 'GPU' ? :gpu : :cpu

  wrapped = OpenclDevice.new(uri, device_type, self)
  wrapped.native_device = native
  wrapped
end
78
+
##
# Select the best device available in the system for this evaluator.
#
# "Best" is the entry with the highest score from query_devices_with_score.
# The previous `sort { |a| a[1] }` never compared two entries, so the device
# picked was not guaranteed to be the highest scoring one.
def self.default_device
  best = OpenclEvaluator.query_devices_with_score.max_by { |entry| entry[1] }
  opencl_to_device(best)
end
86
+
# Main entry point of the OpenCL evaluator: fully evaluates the tensor and
# converts the resulting buffer(s) to Ruby values.
def run(tensor, execution_context)
  evaluated = complete_eval(tensor, execution_context)
  read_final_result(evaluated)
end
91
+
# Evaluates a tensor (or an array of tensors) and wraps each Ruby result in
# a Buffer tagged with the tensor's data type. Also stores the context and,
# when it has a :_cache, makes sure the :_cl_buffers cache exists.
def run_with_buffer(tensor, context, execution_context)
  @context = context
  @context[:_cache][:_cl_buffers] ||= {} if context[:_cache]

  evaluate = lambda do |t|
    value = run(t, execution_context)
    Buffer.new(data_type: t.data_type, buffer: value)
  end

  tensor.is_a?(Array) ? tensor.collect(&evaluate) : evaluate.call(tensor)
end
106
+
# Converts a result whose buffer comes from a non-opencl evaluator into
# OpenCL buffers. An OutputGroup is converted output by output and returned
# as a new OutputGroup with the same data types.
def convert_from_buffer(tensor, result)
  payload = result.buffer
  unless payload.is_a?(TensorStream::Evaluator::OutputGroup)
    return convert_to_opencl([payload].flatten, shape_eval(payload), data_type: result.data_type, name: tensor.name)
  end

  converted_outputs = payload.outputs.zip(payload.data_types).map do |output, data_type|
    convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name)
  end
  TensorStream::Evaluator::OutputGroup.new(converted_outputs, result.buffer.data_types)
end
116
+
# Evaluates the tensor and enqueues a device-to-host read for every
# non-empty buffer in the result. The reads are only enqueued here; callers
# that need the data must wait for the queue to finish.
#
# Returns the buffer(s); for an OutputGroup only its first output is
# returned, and [] is returned when the result has no host buffer.
def enqueue_buffer_read(tensor, context)
  buffer = _run(tensor, context)

  if buffer.is_a?(Array)
    buffer.collect do |b|
      unless b.buffer.size.zero?
        _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
      end
      b
    end
  else
    return buffer.outputs[0] if buffer.is_a?(OutputGroup)
    return nil if buffer.nil?
    return [] if buffer.buffer.nil?
    return buffer if buffer.buffer.size.zero?

    _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
    buffer
  end
end
134
+
# Evaluates the tensor, enqueues the buffer read and blocks until the
# command queue has finished. Returns what enqueue_buffer_read returned.
def complete_eval(tensor, context)
  enqueue_buffer_read(tensor, context).tap { _opencl_queue.finish }
end
140
+
# Enumerates the available devices of every OpenCL platform together with a
# score used to rank them.
#
# Score = 1 for CPU / 4 for GPU, plus 1000 on the 'NVIDIA CUDA' platform,
# plus the device's max compute units and max clock frequency.
#
# Returns an array of [device, score, platform name, index], where index is
# the position among the platform's available devices.
def self.query_devices_with_score
  OpenCL.platforms.flat_map do |platform|
    usable = platform.devices.select { |device| device.available > 0 }
    usable.each_with_index.collect do |device, index|
      type_bonus = case device.type.to_s
                   when 'CPU' then 1
                   when 'GPU' then 4
                   else 0
                   end
      vendor_bonus = device.platform.name == 'NVIDIA CUDA' ? 1000 : 0
      score = type_bonus + vendor_bonus + device.max_compute_units + device.max_clock_frequency

      [device, score, platform.name, index]
    end
  end
end
160
+
161
+ protected
162
+
# Prepares an op input according to the given options:
#   :noop     - return the tensor itself (placeholders resolved)
#   :buffer   - return the fully evaluated buffer
#   :complete - return the fully evaluated result as Ruby values
# With no option the tensor is evaluated via _run. A nil tensor yields nil.
def prepare_input(tensor, context, options = {})
  return nil unless tensor

  tensor = resolve_placeholder(tensor)
  return tensor if options[:noop]
  return complete_eval(tensor, context) if options[:buffer]
  return read_final_result(complete_eval(tensor, context)) if options[:complete]

  _run(tensor, context)
end
176
+
# Converts an evaluated buffer to Ruby values via its to_ruby method.
# Arrays are converted element by element (recursively); nil stays nil.
def read_final_result(buffer)
  case buffer
  when Array then buffer.map { |b| read_final_result(b) }
  when nil then nil
  else buffer.to_ruby
  end
end
184
+
# Creates an OpenCL context for the given native device and stores it for
# later use by this evaluator.
def _create_opencl_context(opencl_device)
  context = OpenCL.create_context(opencl_device)
  @opencl_context = context
end
188
+
# Creates the command queue for this evaluator's device, enabling profiling
# and out-of-order execution only when the device reports them as supported
# queue properties.
def create_command_queue
  supported = opencl_device.queue_properties.names
  properties = %w[PROFILING_ENABLE OUT_OF_ORDER_EXEC_MODE_ENABLE]
               .select { |flag| supported.include?(flag) }
               .map { |flag| OpenCL::CommandQueue.const_get(flag) }
  @command_queue = _opencl_context.create_command_queue(opencl_device, properties: properties)
end
197
+
# The OpenCL context stored by _create_opencl_context.
def _opencl_context
  @opencl_context
end
201
+
# The command queue stored by create_command_queue.
def _opencl_queue
  @command_queue
end
205
+
# Path of a kernel template: "<directory of this file>/kernels/<kernel>.<extension>".
def cl_template_path(kernel, extension)
  kernels_dir = File.join(File.dirname(__FILE__), 'kernels')
  File.join(kernels_dir, "#{kernel}.#{extension}")
end
209
+
# Returns the built OpenCL program for a kernel, building it on first use.
#
# Programs are cached in @context[:_cache] under a key made of the kernel
# name, the template arguments and this evaluator's object_id. The template
# is looked up as "<kernel>.cl.erb" first, then "<kernel>.cl".
#
# Raises a RuntimeError when no template exists; on a build failure the
# build log is printed and the OpenCL error is re-raised.
def _cl_program(kernel, args = {})
  suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
  cache_key = "_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"
  @context[:_cache][cache_key] ||= begin
    candidates = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }
    filename = candidates.find { |n| File.exist?(n) }
    raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?

    template = File.read(filename)
    source = OpenclTemplateHelper.new(template).generate(args)
    # File.write("/tmp/#{kernel}.#{suffix}.cl", source)
    program = _opencl_context.create_program_with_source(source)
    program.build
  rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
    puts "OpenCL Compile error: #{program.build_log}"
    raise e
  end
end
225
+
# Normalizes a template argument for use inside a cache key: spaces in
# strings become underscores, arrays are joined with '-', anything else is
# returned unchanged.
def escape_arg_content(value)
  case value
  when String then value.tr(' ', '_')
  when Array then value.join('-')
  else value
  end
end
232
+
# Evaluates a tensor-like object and returns its result.
#
# * OpenCLBuffer instances are returned as is.
# * A non-empty Array whose first element is a Tensor is evaluated element
#   by element.
# * Procs are called first and their result is evaluated.
# * Operations not registered on this evaluator are delegated to another
#   evaluator through the session and converted back to OpenCL buffers.
#
# The :returns entry of the child context is merged back into
# execution_context.
#
# Fix: the array guard used `!tensor.size.empty?`, which calls `empty?` on an
# Integer and raised NoMethodError for every Array argument.
def _run(tensor, execution_context)
  return tensor if tensor.is_a?(OpenCLBuffer)
  return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array) && !tensor.empty? && tensor[0].is_a?(Tensor)

  tensor = tensor.call if tensor.is_a?(Proc)

  child_context = execution_context.dup
  res = if tensor.is_a?(Operation)
          if self.class.ops.include?(tensor.operation.to_sym)
            eval_operation(tensor, child_context)
          else
            # op is not implemented here, let another evaluator handle it
            result = @session.delegate_to_evaluator(tensor, @context, execution_context)
            convert_from_buffer(tensor, result)
          end
        elsif tensor.is_a?(Variable)
          eval_variable(tensor, child_context)
        elsif tensor.is_a?(Placeholder)
          resolve_placeholder(tensor, child_context)
        else
          eval_tensor(tensor, child_context)
        end
  execution_context.deep_merge!(returns: child_context[:returns])
  res
end
257
+
# Returns the OpenCL buffer of a variable, wrapping the variable on first use.
# Raises when the variable has no value and no dirty buffer.
def eval_variable(tensor, _child_context)
  existing = tensor.buffer
  uninitialized = tensor.value.nil? && (existing.nil? || !existing.dirty)
  raise "variable #{tensor.name} not initalized" if uninitialized

  tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
  tensor.buffer
end
263
+
# Registered with an empty body; evaluates to nil.
register_op :no_op do |_context, _tensor, _inputs|
end

# Evaluates the predicate, then fully evaluates only the selected branch.
register_op :cond, noop: true do |context, tensor, inputs|
  pred = complete_eval(tensor.options[:pred], context)
  branch = all_true?(pred.buffer) ? inputs[0] : inputs[1]
  complete_eval(branch, context)
end

# Returns the first input; any additional inputs are fully evaluated first.
register_op :identity do |context, tensor, inputs|
  tensor.inputs[1..inputs.size].each { |input| complete_eval(input, context) } if tensor.inputs.size > 1
  inputs[0]
end

register_op :assign, noop: true do |context, tensor, inputs|
  assign_var(tensor, inputs[1], context)
end

# assign_add / assign_sub: run the matching two operand kernel, then store
# the result in the variable.
{ assign_add: 'add', assign_sub: 'sub' }.each do |op, kernel_name|
  register_op op do |context, tensor, inputs|
    value = execute_2_operand_func(kernel_name, tensor, inputs[0], inputs[1], context)
    assign_var(tensor, value, context)
  end
end

# Returns the buffer of the wrapped variable, creating it on first use.
register_op :variable, noop: true do |_context, tensor, _inputs|
  variable = tensor.inputs[0]
  uninitialized = variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
  raise "variable #{tensor.name} not initalized" if uninitialized

  variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
  variable.buffer
end

# Comparison and logical ops all go through the 'cond' kernel template.
%i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
  register_op op, noop: true do |context, tensor, inputs|
    lhs = inputs[0]
    rhs = inputs[1]
    execute_2_operand_func(op.to_s, tensor, lhs, rhs, context, 'cond')
  end
end
310
+
# Inserts a dimension of size 1 at the axis read from the second input and
# re-wraps the same data with the new shape.
register_op :expand_dims, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  axis = inputs[1].buffer[0]
  shape = source.shape.dup
  axis = -axis if axis == shape.size
  new_shape = shape.insert(axis, 1).compact
  new_buf = source.buffer.reshape(*new_shape.reverse)
  convert_to_opencl(new_buf, new_shape, data_type: source.data_type, name: tensor.name)
end

# Creates a buffer of the requested shape filled with a single value,
# reusing a cached buffer for this tensor when one is available.
register_op :fill, buffer: true do |_context, tensor, inputs|
  shape, value = inputs

  narray_size = shape.buffer.to_a.reduce(:*) || 1
  cached = get_cached_buffer(tensor.name, shape.buffer.to_a)
  buffer = cached ? cached.buffer : allocate_narray_for_type(tensor.data_type, narray_size)

  buffer.fill!(value.buffer[0])
  convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
end

register_op :where, noop: true do |context, tensor, inputs|
  pred = tensor.options[:pred]
  execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
end

# Converts the input to the tensor's data type; the input is returned
# untouched when the types already match.
register_op :cast do |_context, tensor, inputs|
  a = inputs[0]
  if a.data_type == tensor.data_type
    a
  else
    buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
    m, n = a.shape
    cl_m = OpenCL::Int1.new(m || 1)
    cl_n = OpenCL::Int1.new(n || 1)
    work_group = [m || 1, n || 1]
    event_wait_list = build_event_wait_list(inputs)
    kernel = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type)
    buffer.op = kernel.cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
    buffer
  end
end
357
+
# Stacks the inputs along options[:axis] (default 0) by running the "pack"
# kernel once per input. The output buffer's op is the array of all events.
register_op :stack do |_context, tensor, inputs|
  axis = tensor.options[:axis] || 0
  shape = inputs[0].shape
  rank = shape.size + 1
  elem_size = shape.empty? ? 1 : shape.reduce(:*)

  # running products of the trailing dimensions, outermost first
  strides = lambda do |dims|
    dims.drop(1).reverse.inject([1]) { |acc, dim| acc << dim * acc.last }.reverse
  end

  new_shape = [inputs.size, *shape]
  divisors = strides.call(new_shape)

  axis = rank + axis if axis < 0
  # move the leading (stack) dimension to position `axis`
  rotated_shape = Array.new(axis + 1) { new_shape.shift }
  new_shape = rotated_shape.rotate! + new_shape

  output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
  multipliers = strides.call(new_shape)

  cl_n = OpenCL::Int1.new(elem_size)
  work_group = [elem_size]
  event_wait_list = build_event_wait_list(inputs)
  output_buffer.op = inputs.each_with_index.map do |input, index|
    cl_index = OpenCL::Int1.new(index)
    _cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis)
      .pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  end
  output_buffer
end
390
+
# Fully evaluates the input and raises InvalidArgumentError when any element
# is NaN or infinite; otherwise returns the evaluated buffer.
register_op :check_numerics, noop: true do |context, tensor, inputs|
  a = complete_eval(inputs[0], context)
  name = tensor.options[:name]

  a.buffer.each do |element|
    raise TensorStream::InvalidArgumentError, "#{name} Invalid Argument" if element.nan? || element.infinite?
  end
  a
end

# Returns both inputs unchanged when their shapes match; otherwise reads
# them back, broadcasts them on the host and wraps the results again.
register_op :broadcast_transform do |context, tensor, inputs|
  a, b = inputs

  if a.shape == b.shape
    [a, b]
  else
    host_a = read_final_result(complete_eval(a, context))
    host_b = read_final_result(complete_eval(b, context))
    b_a, b_b = broadcast(host_a, host_b)
    [[b_a, 'a'], [b_b, 'b']].map do |data, tag|
      wrap_opencl(data, data_type: a.data_type, name: "#{tensor.name}_#{tag}")
    end
  end
end

# Prints the second input (prefixed by options[:message]) and passes the
# first input through.
register_op :print do |context, tensor, inputs|
  a, b = inputs
  printable = read_final_result(complete_eval(b, context))
  puts "#{tensor.options.fetch(:message, '')} #{printable}"
  a
end

register_op :rank do |_context, tensor, inputs|
  dims = inputs[0].shape.size
  wrap_opencl(dims, data_type: tensor.data_type, name: tensor.name)
end

# Passes its input through unchanged.
register_op :stop_gradient do |_context, _tensor, inputs|
  inputs.first
end
430
+
# Slices the first input on the host; the start offsets come from the second
# input and the extents from options[:size].
register_op :slice, noop: true do |context, tensor, inputs|
  input_a = complete_eval(inputs[0], context)
  begins = read_final_result(complete_eval(inputs[1], context))
  size = tensor.options[:size]

  # ranges are reversed because the host buffer is reshaped with reversed dims
  slice_param = begins.zip(size).collect { |start, length| start..start + length - 1 }.reverse

  new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
  sliced = new_buf.slice[*slice_param]
  convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
end

# Transposes on the host. Rank 2 inputs without an explicit permutation use
# the buffer's own transpose; everything else goes through
# transpose_with_perm and is written back to the device.
register_op :transpose, buffer: true do |_context, tensor, inputs|
  source, perm_input = inputs

  if source.shape.size == 2 && perm_input.nil?
    t_param = (0...source.shape.size).to_a.reverse
    transposed = source.buffer.reshape(*source.shape.reverse).transpose(*t_param)
    convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: source.data_type, name: tensor.name)
  else
    rank = source.shape.size
    perm = perm_input.nil? ? (0...rank).to_a.reverse : perm_input.buffer
    new_shape = perm.map { |p| source.shape[p] }.to_a
    output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
    transpose_with_perm(source.buffer, output_buffer.buffer, source.shape, new_shape, perm)

    output_buffer.op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
    output_buffer
  end
end

# Picks one element: an output of an OutputGroup, an entry of an Array, or
# a sub-tensor (read back to the host and re-wrapped) of a buffer.
register_op :index, noop: true do |context, tensor, inputs|
  a = _run(inputs[0], context)
  index = read_final_result(_run(inputs[1], context))

  case a
  when OutputGroup
    a.outputs[index]
  when Array
    a[index]
  else
    new_shape = a.shape.dup
    new_shape.shift
    host_values = read_final_result(a)
    convert_to_opencl(host_values[index], new_shape, data_type: a.data_type, name: tensor.name)
  end
end
478
+
# Computes the broadcast gradient arguments on the host and returns both
# results as int32 buffers in an OutputGroup.
register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
  rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
  outputs = [
    wrap_opencl(rx, data_type: :int32, name: tensor.name),
    wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")
  ]
  OutputGroup.new(outputs, tensor.inputs.map(&:data_type))
end

register_op :shape do |_context, tensor, inputs|
  dims = inputs[0].shape
  wrap_opencl(dims, name: tensor.name, data_type: tensor.data_type)
end

# Re-wraps the input data with a new shape. An empty target shape is kept as
# is for single element buffers; otherwise inferred dimensions are resolved
# through TensorShape.fix_inferred_elements.
register_op :reshape, buffer: true do |_context, tensor, inputs|
  arr = inputs[0]
  new_shape = read_final_result(inputs[1])

  keep_as_is = new_shape.size.zero? && arr.buffer.size == 1
  shape = keep_as_is ? new_shape : TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)

  convert_to_opencl(arr.buffer, shape, data_type: arr.data_type, name: tensor.name)
end

# Waits for the command queue to finish; produces no value.
register_op :flow_group do |_context, _tensor, _inputs|
  _opencl_queue.finish
  nil
end

register_op :size do |_context, tensor, inputs|
  out_type = tensor.options[:out_type] || :int32
  wrap_opencl(inputs[0].buffer.size, name: tensor.name, data_type: out_type)
end
509
+
# Evaluates an operation, using cached results when available.
#
# Results are looked up (and stored) under a key built from the graph, the
# tensor name and this evaluator's object_id. Every result is stored in
# @context; results of constant tensors are also stored in @context[:_cache].
#
# When the tensor has a breakpoint, its first two inputs and the result are
# read back to Ruby values and passed to the breakpoint. When
# @log_intermediates is set, an entry is appended to
# @context[:compute_history].
#
# Fix: the breakpoint branch used to overwrite the block-local `result` with
# the converted Ruby value, so that value (instead of the buffer) was cached
# and logged. The converted value now lives in its own variable.
#
# Errors are re-raised with the tensor name, its math representation and
# source added to the message; errors other than
# EvaluatorExcecutionException / TensorStreamError are wrapped in an
# EvaluatorExcecutionException. The command queue is finished first in all
# cases.
def eval_operation(tensor, child_context)
  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
  return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
  return @context[cache_key] if @context.key?(cache_key)

  # puts "opencl: #{tensor.name}"
  invoke(tensor, child_context).tap do |result|
    if tensor.breakpoint
      a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
      b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
      a = read_final_result(complete_eval(a, child_context))
      b = read_final_result(complete_eval(b, child_context))
      # keep `result` pointing at the buffer; only the breakpoint sees the Ruby value
      result_value = read_final_result(complete_eval(result, child_context))

      tensor.breakpoint.call(tensor, a, b, result_value)
    end
    if @log_intermediates
      @context[:compute_history] << {
        name: tensor.name,
        type: tensor.data_type,
        shape: shape_eval(result),
        source: tensor.source,
        description: tensor.to_math(true, 1),
        value: result
      }
    end
    @context[cache_key] = result
    @context[:_cache][cache_key] = result if tensor.is_const
  end
rescue EvaluatorExcecutionException, TensorStreamError => e
  _opencl_queue.finish # dump queue
  raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
rescue StandardError => e
  _opencl_queue.finish # dump queue
  puts e.message
  puts e.backtrace.join("\n")

  raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
end
565
+
566
# Resolves a plain tensor to its buffer, memoizing per evaluator instance.
#
# Non-Tensor arguments are returned untouched. A tensor whose value is itself
# a Tensor is evaluated through _run; otherwise the tensor is wrapped with
# wrap_opencl. Constant tensors are also stored in @context[:_cache].
def eval_tensor(tensor, child_context)
  return tensor unless tensor.is_a?(Tensor)

  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
  return @context[cache_key] if @context.key?(cache_key)
  return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]

  @context[cache_key] =
    if tensor.value.is_a?(Tensor)
      _run(tensor.value, child_context)
    else
      wrap_opencl(tensor, name: tensor.name)
    end

  @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
  @context[cache_key]
end
580
+
581
+ private
582
+
583
# Assigns the evaluated value of `b` to a variable and returns its buffer.
#
# The target is tensor.inputs[0] when present, otherwise the tensor itself.
# If the target already has a buffer, a device copy is enqueued unless both
# sides refer to the same cl_buffer, in which case the source's op is reused.
# Otherwise the value is read back and a new buffer is created for the target.
# The target buffer is flagged dirty in both cases.
def assign_var(tensor, b, child_context)
  target = tensor.inputs[0] || tensor
  source = complete_eval(b, child_context)

  if target.buffer
    wait_list = build_event_wait_list([source, target.buffer])
    needs_copy = target.buffer.cl_buffer != source.cl_buffer
    target.buffer.op =
      if needs_copy
        _opencl_queue.enqueue_copy_buffer(source.cl_buffer, target.buffer.cl_buffer, event_wait_list: wait_list)
      else
        source.op
      end
  else
    host_value = read_final_result(source)
    target.buffer = convert_to_opencl(host_value, source.shape, data_type: tensor.data_type, name: target.name)
    target.value = host_value
  end

  target.buffer.dirty = true
  target.buffer
end
603
+
604
# Runs a two-operand kernel and returns the output buffer.
#
# Both inputs are evaluated and passed through auto_type_cast. The result
# shape comes from TensorShape.infer_shape, and a [0] result shape
# short-circuits to an empty result buffer. select_program picks the kernel
# variant; the "<op>_b" variant additionally receives the dimensions of the
# second operand.
#
# @param prog_name optional program name; defaults to op_name
# @raise RuntimeError when the "<op>_b" variant is selected and the second
#   operand's shape has neither 1 nor 2 dimensions
def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
  a = _run(input_a, child_context)
  b = _run(input_b, child_context)
  a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  dtype = tensor.data_type
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
  return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]

  output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
  a, b, prog, switch_operands = select_program(a, b, op_name)

  rows, cols = result_shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  cl_m = OpenCL::Int1.new(rows)
  cl_n = OpenCL::Int1.new(cols)
  cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition

  event_wait_list = build_event_wait_list([a, b]) # add dependency wait list

  kernel_name = :"#{prog}_#{a.data_type}_#{b.data_type}"
  prog_name ||= op_name

  # Only the "<op>_b" variant takes the dimensions of the second operand.
  second_operand_dims =
    if prog == "#{op_name}_b"
      case b.shape.size
      when 2
        [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
      when 1
        [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
      else
        raise "rank > 2 not supported!"
      end
    else
      []
    end

  program = _cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype)
  output_buffer.op = program.send(kernel_name, _opencl_queue, work_group, cl_m, cl_n, *second_operand_dims,
                                  cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer,
                                  event_wait_list: event_wait_list)
  output_buffer
end
643
+
644
# Runs a conditional kernel taking a predicate and two value operands.
#
# All three inputs are evaluated, and the two value operands go through
# auto_type_cast. The output buffer takes the predicate's shape and the
# tensor's data type.
def execute_cond_func(op_name, tensor, pred, input_a, input_b, child_context)
  predicate = _run(pred, child_context)
  a = _run(input_a, child_context)
  b = _run(input_b, child_context)

  a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  dtype = tensor.data_type

  output_buffer = _create_result_buffer(dtype, predicate.shape, tensor.name)

  rows, cols = predicate.shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  cl_m = OpenCL::Int1.new(rows)
  cl_n = OpenCL::Int1.new(cols)

  event_wait_list = build_event_wait_list([a, b, predicate]) # add dependency wait list
  program = _cl_program(op_name.to_s, dtype: dtype)
  output_buffer.op = program.send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n,
                                  predicate.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer,
                                  event_wait_list: event_wait_list)
  output_buffer
end
663
+
664
# Runs a single-operand kernel named "<op_name>_<dtype>" over the evaluated
# input. The output buffer has the input's shape and the tensor's data type.
def execute_func(op_name, tensor, a, child_context)
  a = _run(a, child_context)
  event_wait_list = build_event_wait_list([a])
  dtype = tensor.data_type
  output_buffer = _create_result_buffer(dtype, a.shape, tensor.name)

  rows, cols = a.shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  cl_m = OpenCL::Int1.new(rows)
  cl_n = OpenCL::Int1.new(cols)

  program = _cl_program(op_name.to_s, dtype: dtype)
  output_buffer.op = program.send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n,
                                  a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer
end
679
+
680
# Returns the pair [a, b] with b run through the "cast" program when the two
# operands have different data types. Equal types return the pair untouched.
#
# NOTE(review): the result buffer is allocated with b's own data type, and the
# cast program is built with source_dt: a.data_type / target_dt: b.data_type.
# If the intent is to convert b to a's type these look swapped. Confirm
# against cast.cl before changing.
def auto_type_cast(a, b, name: nil)
  return [a, b] if a.data_type == b.data_type

  rows, cols = b.shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  event_wait_list = build_event_wait_list([b])
  casted = _create_result_buffer(b.data_type, b.shape, name)

  cl_m = OpenCL::Int1.new(rows)
  cl_n = OpenCL::Int1.new(cols)

  program = _cl_program("cast", source_dt: a.data_type, target_dt: b.data_type)
  casted.op = program.cast(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, casted.cl_buffer,
                           event_wait_list: event_wait_list)
  [a, casted]
end
693
+
694
# Casts `source` to `data_type` with the "cast" program and returns the new
# buffer. The source is returned untouched when it already has that type.
def type_cast(source, data_type, name: nil)
  return source if source.data_type == data_type

  rows, cols = source.shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  event_wait_list = [source.op].compact
  casted = _create_result_buffer(data_type, source.shape, name)

  cl_m = OpenCL::Int1.new(rows)
  cl_n = OpenCL::Int1.new(cols)

  program = _cl_program("cast", source_dt: source.data_type, target_dt: data_type)
  casted.op = program.cast(_opencl_queue, work_group, cl_m, cl_n, source.cl_buffer, casted.cl_buffer,
                           event_wait_list: event_wait_list)
  casted
end
707
+
708
# Converts a Tensor or a raw Ruby value into a buffer via convert_to_opencl.
#
# For a Tensor the value and declared shape are taken from the tensor; for
# anything else the shape is computed with shape_eval. When data_type is not
# given, the argument's own data_type is used.
def wrap_opencl(tensor, data_type: nil, name: nil)
  if tensor.is_a?(Tensor)
    value = tensor.value
    shape = tensor.shape.shape
  else
    value = tensor
    shape = shape_eval(tensor)
  end

  convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
end
717
+
718
# Looks up a cached buffer by name and shape for this evaluator instance.
# Returns whatever is stored under the key, or nil when nothing is.
def get_cached_buffer(name, shape)
  shape_tag = shape.join('_')
  @context[:_cache]["_cl_object_#{name}:#{shape_tag}:#{object_id}"]
end
722
+
723
# Copies a host value into an OpenCLBuffer and enqueues the device write.
#
# Non-Array, non-NArray values are wrapped in a one-element array first, so
# from that point on `value` is always an Array or an NArray. A buffer cached
# under the same name and shape is reused when `name` is given; otherwise a
# new OpenCLBuffer is built and stored in @context[:_cache].
#
# @param value scalar, (nested) Array, or NArray
# @param shape [Array<Integer>] target shape; [] denotes a scalar
# @param data_type element type used for host allocation and casting
# @param name optional buffer name, also part of the cache key
# @return the OpenCLBuffer, or nil when no host array can be allocated
#   (allocate_narray_for_type returns nil)
def convert_to_opencl(value, shape, data_type: nil, name: nil)
  value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)

  cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
  cl_object = if name && @context[:_cache][cache_key]
    @context[:_cache][cache_key]
  else
    narray_size = shape.reduce(:*) || 1
    buffer = value.is_a?(NArray) ? value : allocate_narray_for_type(data_type, narray_size)
    return nil if buffer.nil?

    cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)

    # No device memory is created for empty values; the size is clamped to
    # one element so a zero-sized shape never requests a zero-byte buffer.
    cl_buffer = unless value.flatten.empty?
      cl_buffer_size = 1 if cl_buffer_size.zero?
      _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
    end

    @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
  end

  if value.is_a?(Array)
    value.flatten.each_with_index do |element, index|
      cl_object.buffer[index] = if element.is_a?(Tensor)
        read_final_result(complete_eval(element, {}))
      elsif data_type == :boolean
        element ? 1 : 0
      else
        Tensor.cast_dtype(element, data_type)
      end
    end
  else
    # The only remaining possibility is an NArray, used directly as the host
    # buffer. The former scalar branches were unreachable and one of them
    # referenced an undefined local.
    cl_object.buffer = value
  end

  write_op = nil
  if cl_object.cl_buffer && (!value.is_a?(Array) || !value.empty?)
    write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
  end
  cl_object.op = write_op
  cl_object
end
772
+
773
# Allocates a host NArray of `narray_size` elements for a data type.
#
# :int64 uses the same NArray.int constructor as :int and :int32, and
# :boolean uses NArray.sint like :int16.
#
# @return the new NArray, or nil for :unknown
# @raise RuntimeError for any other data type
def allocate_narray_for_type(data_type, narray_size)
  constructor = case data_type
                when :float, :float32 then :sfloat
                when :float64 then :float
                when :int, :int32, :int64 then :int
                when :int16, :boolean then :sint
                when :unknown then return nil
                else raise "unsupported type #{data_type}"
                end

  NArray.public_send(constructor, narray_size)
end
791
+
792
# Returns the result buffer for an op output, reusing one stored in
# @context[:_cache][:_cl_buffers] under the same name and shape.
#
# A [0] shape yields an OpenCLBuffer with no host or device storage and is
# not cached. An empty shape (scalar) allocates a single element.
def _create_result_buffer(data_type, shape, name)
  if shape == [0]
    return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
  end

  pool_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
  @context[:_cache][:_cl_buffers][pool_key] ||= begin
    element_count = shape.empty? ? 1 : shape.reduce(:*)
    host_buffer = allocate_narray_for_type(data_type, element_count)
    device_buffer = _opencl_context.create_buffer(host_buffer.size * host_buffer.element_size)
    OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: host_buffer, cl_buffer: device_buffer, name: name)
  end
end
801
+
802
# Finds, along `target_axis`, the index of the element that wins under `op`
# (default: greater-than, i.e. argmax) in a nested Ruby array.
#
# Recurses one nesting level per axis until current_axis equals target_axis.
# At the target axis a flat array yields one index; an array of arrays yields
# one index per column. Ties keep the earliest index, because `op` must hold
# for a later element to replace the current winner. Indices are passed
# through Tensor.cast_dtype with `output_type`.
def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
  unless target_axis == current_axis
    return a.map { |row| get_op_with_axis(row, target_axis, current_axis + 1, output_type, op) }
  end

  winning_index = lambda do |values|
    best = nil
    best_position = 0
    values.each_with_index do |candidate, position|
      next unless best.nil? || op.call(candidate, best)

      best = candidate
      best_position = position
    end
    Tensor.cast_dtype(best_position, output_type)
  end

  return winning_index.call(a) unless a[0].is_a?(Array)

  (0...a[0].size).map do |column_index|
    winning_index.call(a.map { |row| row[column_index] })
  end
end
834
+
835
# Computes the keep-dims shape after reducing over `axes`.
#
# nil axes mean a reduction to a scalar and return []. Otherwise every listed
# dimension of `input_shape` is set to 1. The array passed in is modified in
# place and returned, so callers pass a copy when they need the original.
def _reduced_shape(input_shape, axes)
  return [] if axes.nil? # reduce to scalar

  axis_list = axes.is_a?(Array) ? axes : [axes]
  axis_list.each { |axis| input_shape[axis] = 1 }
  input_shape
end
845
+
846
# Applies the reduction method `func` to the evaluated input on the host and
# wraps the result in a new buffer.
#
# With a nil axis the whole host buffer is reduced to a scalar. Otherwise the
# host buffer is reshaped with the input shape reversed and each axis is
# mapped to `rank - axis.abs` before `func` is applied. A scalar input is
# returned unchanged. With the :keepdims option the output shape comes from
# _reduced_shape.
#
# NOTE(review): `rank - axis.abs` treats a negative axis like its absolute
# value; confirm that this is the intended handling of negative axes.
def reduction(child_context, tensor, a, b, func)
  input = complete_eval(a, child_context)
  axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b

  if axis.nil?
    total = input.buffer.send(func)
    return convert_to_opencl(total, [], data_type: tensor.data_type, name: tensor.name)
  end

  return input if input.shape.empty?

  value = input.buffer.reshape(*input.shape.reverse)
  rank = input.shape.size - 1

  if axis.is_a?(Array)
    # Highest dimension first, so earlier reductions do not shift the
    # indices of the dimensions still to be reduced.
    dims = axis.map { |x| rank - x.abs }.sort.reverse
    dims.each { |dim| value = value.send(func, dim.to_i) }
  else
    value = value.send(func, rank - axis.abs)
  end

  if value.is_a?(NArray)
    new_shape = value.shape.reverse
  else
    value = [value]
    new_shape = []
  end

  new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]

  convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
end
877
+
878
# Selects the variant of a CL program depending on the operand shapes.
#
# Returns [first_operand, second_operand, program_name, switch_flag]:
# * identical shapes               -> "<op>",   operands unchanged, flag 0
# * one operand has <= 1 element   -> "<op>_c", that operand placed second
# * otherwise                      -> "<op>_b", the operand of lower rank, or
#   with a smaller dimension at equal rank, placed second
# The flag is 1 whenever the operands were swapped.
def select_program(input_a, input_b, op)
  return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape

  return [input_b, input_a, "#{op}_c", 1] if input_a.shape.empty? || input_a.shape.reduce(:*) == 1 # A is scalar?
  # This guard used to re-test input_a, which the line above has already
  # ruled out, so a one-element B never took the scalar variant.
  return [input_a, input_b, "#{op}_c", 0] if input_b.shape.empty? || input_b.shape.reduce(:*) == 1 # B is scalar?

  return [input_b, input_a, "#{op}_b", 1] if input_a.shape.size < input_b.shape.size

  if input_a.shape.size == input_b.shape.size
    input_a.shape.zip(input_b.shape).each do |s1, s2|
      return [input_b, input_a, "#{op}_b", 1] if s1 < s2
    end
  end

  [input_a, input_b, "#{op}_b", 0]
end
895
+
896
# Returns the number of dimensions in `shape`; anything that is not an Array
# counts as rank 0.
def _rank_from_shape(shape)
  return 0 unless shape.is_a?(Array)

  shape.size
end
899
+
900
# Collects the `op` values of the given buffers into a flat wait list.
#
# Nil inputs are skipped, and so are nil ops: a buffer whose `op` is nil
# contributes nothing instead of leaving a nil entry in the list. This
# matches type_cast, which compacts its own wait list.
#
# @param inputs [Array] buffers responding to `op`; entries may be nil
# @return [Array] flattened ops without nil entries
def build_event_wait_list(inputs)
  inputs.compact.map(&:op).flatten.compact
end
903
+
904
# Resolves a placeholder to the value bound to it in @context.
#
# nil is passed through. A Placeholder is looked up in @context by its name
# as a symbol; any other object is used as-is. A resolved Tensor goes through
# Tensor.cast_dtype; anything else is converted to a buffer with the
# placeholder's data type and name.
#
# @raise RuntimeError when the placeholder has no value bound in @context
def resolve_placeholder(placeholder, _execution_context = {})
  return nil if placeholder.nil?

  var = placeholder
  if placeholder.is_a?(Placeholder)
    var = @context[placeholder.name.to_sym]
    raise "missing placeholder #{placeholder.name}" if var.nil?
  end

  if var.is_a?(Tensor)
    Tensor.cast_dtype(var, placeholder.data_type)
  else
    convert_to_opencl(var, shape_eval(var), data_type: placeholder.data_type, name: placeholder.name)
  end
end
918
+
919
# Returns true when every leaf of a (nested) Array or NArray differs from 0.
# A non-collection argument is compared against 0 directly, and an empty
# collection counts as true.
def all_true?(arr)
  return arr != 0 unless arr.is_a?(Array) || arr.is_a?(NArray)

  arr.each { |entry| return false unless all_true?(entry) }
  true
end
929
+ end
930
+ end
931
+ end
932
+
933
+ TensorStream::Evaluator.register_evaluator(TensorStream::Evaluator::OpenclEvaluator, 'opencl', 1)