tensor_stream-opencl 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +5 -0
  5. data/CODE_OF_CONDUCT.md +74 -0
  6. data/Gemfile +6 -0
  7. data/Gemfile.lock +51 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +58 -0
  10. data/Rakefile +6 -0
  11. data/bin/console +14 -0
  12. data/bin/setup +8 -0
  13. data/lib/tensor_stream/opencl.rb +7 -0
  14. data/lib/tensor_stream/opencl/kernels/_bool_operand.cl +45 -0
  15. data/lib/tensor_stream/opencl/kernels/_operand.cl +45 -0
  16. data/lib/tensor_stream/opencl/kernels/abs.cl +20 -0
  17. data/lib/tensor_stream/opencl/kernels/acos.cl +8 -0
  18. data/lib/tensor_stream/opencl/kernels/add.cl +3 -0
  19. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +23 -0
  20. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +9 -0
  21. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +16 -0
  22. data/lib/tensor_stream/opencl/kernels/argmax.cl +8 -0
  23. data/lib/tensor_stream/opencl/kernels/argmin.cl +8 -0
  24. data/lib/tensor_stream/opencl/kernels/asin.cl +9 -0
  25. data/lib/tensor_stream/opencl/kernels/cast.cl +10 -0
  26. data/lib/tensor_stream/opencl/kernels/ceil.cl +8 -0
  27. data/lib/tensor_stream/opencl/kernels/cond.cl.erb +6 -0
  28. data/lib/tensor_stream/opencl/kernels/cos.cl +8 -0
  29. data/lib/tensor_stream/opencl/kernels/div.cl.erb +3 -0
  30. data/lib/tensor_stream/opencl/kernels/exp.cl +8 -0
  31. data/lib/tensor_stream/opencl/kernels/floor.cl +8 -0
  32. data/lib/tensor_stream/opencl/kernels/floor_div.cl +48 -0
  33. data/lib/tensor_stream/opencl/kernels/floor_mod.cl +3 -0
  34. data/lib/tensor_stream/opencl/kernels/gemm.cl +32 -0
  35. data/lib/tensor_stream/opencl/kernels/log.cl +8 -0
  36. data/lib/tensor_stream/opencl/kernels/log1p.cl +8 -0
  37. data/lib/tensor_stream/opencl/kernels/log_softmax.cl +26 -0
  38. data/lib/tensor_stream/opencl/kernels/max.cl +46 -0
  39. data/lib/tensor_stream/opencl/kernels/min.cl +46 -0
  40. data/lib/tensor_stream/opencl/kernels/mod.cl +3 -0
  41. data/lib/tensor_stream/opencl/kernels/mul.cl +3 -0
  42. data/lib/tensor_stream/opencl/kernels/negate.cl +8 -0
  43. data/lib/tensor_stream/opencl/kernels/pack.cl +24 -0
  44. data/lib/tensor_stream/opencl/kernels/pow.cl +46 -0
  45. data/lib/tensor_stream/opencl/kernels/real_div.cl +3 -0
  46. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +8 -0
  47. data/lib/tensor_stream/opencl/kernels/round.cl +8 -0
  48. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +9 -0
  49. data/lib/tensor_stream/opencl/kernels/sigmoid_grad.cl +55 -0
  50. data/lib/tensor_stream/opencl/kernels/sign.cl +21 -0
  51. data/lib/tensor_stream/opencl/kernels/sin.cl +9 -0
  52. data/lib/tensor_stream/opencl/kernels/softmax.cl +26 -0
  53. data/lib/tensor_stream/opencl/kernels/softmax_cross.cl +32 -0
  54. data/lib/tensor_stream/opencl/kernels/softmax_cross_grad.cl +28 -0
  55. data/lib/tensor_stream/opencl/kernels/softmax_grad.cl +46 -0
  56. data/lib/tensor_stream/opencl/kernels/sqrt.cl +9 -0
  57. data/lib/tensor_stream/opencl/kernels/square.cl +9 -0
  58. data/lib/tensor_stream/opencl/kernels/squared_difference.cl +53 -0
  59. data/lib/tensor_stream/opencl/kernels/sub.cl +3 -0
  60. data/lib/tensor_stream/opencl/kernels/tan.cl +8 -0
  61. data/lib/tensor_stream/opencl/kernels/tanh.cl +8 -0
  62. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +7 -0
  63. data/lib/tensor_stream/opencl/kernels/where.cl +8 -0
  64. data/lib/tensor_stream/opencl/math_ops.rb +133 -0
  65. data/lib/tensor_stream/opencl/nn_ops.rb +191 -0
  66. data/lib/tensor_stream/opencl/opencl_buffer.rb +35 -0
  67. data/lib/tensor_stream/opencl/opencl_device.rb +5 -0
  68. data/lib/tensor_stream/opencl/opencl_evaluator.rb +933 -0
  69. data/lib/tensor_stream/opencl/opencl_template_helper.rb +99 -0
  70. data/lib/tensor_stream/opencl/version.rb +5 -0
  71. data/tensor_stream-opencl.gemspec +40 -0
  72. metadata +185 -0
@@ -0,0 +1,191 @@
1
+ module TensorStream
2
+ module OpenCLHelpers
3
+ # Collection of neural-network ops (optimizer updates and the softmax family) backed by OpenCL kernels
4
+ module NNOps
5
+ def NNOps.included(klass)
6
+ klass.class_eval do
7
+
8
# Fast in place multiply subtract assign.
# Runs the "apply_gradient" kernel against the variable's own buffer and
# returns that buffer with the kernel event attached as its pending op.
register_op :apply_gradient_descent do |_context, tensor, inputs|
  _target_var, learning_rate, delta = inputs

  variable = tensor.inputs[0] || tensor
  target = variable.buffer
  target.dirty = true # force buffer copy when variable is read externally

  # missing dimensions (scalars / vectors) are treated as size 1
  rows, cols = target.shape
  rows ||= 1
  cols ||= 1
  dtype = target.data_type

  wait_for = build_event_wait_list([target, learning_rate, delta])
  program = _cl_program('apply_gradient', dtype: dtype)
  target.op = program.send(:"apply_gradient_#{dtype}", _opencl_queue, [rows, cols],
                           OpenCL::Int1.new(rows), OpenCL::Int1.new(cols),
                           delta.cl_buffer, learning_rate.cl_buffer, target.cl_buffer,
                           event_wait_list: wait_for)
  target
end
28
+
29
# Updates for gradient descent with momentum.
# Both the variable buffer and the accumulator buffer are handed to the
# "apply_momentum" kernel and both receive the resulting event as their op.
register_op :apply_momentum do |_context, tensor, inputs|
  _target_var, _momentum_var, learning_rate, grad, momentum = inputs

  variable = tensor.inputs[0] || tensor
  accumulator = tensor.inputs[1]

  target = variable.buffer
  target.dirty = true # force buffer copy when variable is read externally
  acc_buffer = accumulator.buffer
  acc_buffer.dirty = true # force buffer copy when variable is read externally

  rows, cols = target.shape
  rows ||= 1
  cols ||= 1
  dtype = target.data_type

  wait_for = build_event_wait_list([target, acc_buffer, learning_rate, grad, momentum])
  program = _cl_program('apply_momentum', nesterov: tensor.options[:use_nesterov], dtype: dtype)
  event = program.send(:"apply_momentum_#{dtype}", _opencl_queue, [rows, cols],
                       OpenCL::Int1.new(rows), OpenCL::Int1.new(cols),
                       grad.cl_buffer, learning_rate.cl_buffer, momentum.cl_buffer,
                       target.cl_buffer, acc_buffer.cl_buffer,
                       event_wait_list: wait_for)
  target.op = event
  acc_buffer.op = event
  target
end
55
+
56
# Adadelta is registered but its handler body is empty, so the op evaluates to nil.
register_op(:apply_adadelta) { |_context, _tensor, _inputs| }
58
+
59
# Adam optimization algorithm.
# The variable buffer and the two buffers taken from tensor.inputs[1] and
# tensor.inputs[2] are passed to the "apply_adam" kernel; all three are marked
# dirty and receive the kernel event as their pending op.
register_op :apply_adam do |_context, tensor, inputs|
  _target_var, _m, _v, beta1_power, beta2_power, lr_t, beta1_t, beta2_t, epsilon_t, grad = inputs

  variable = tensor.inputs[0] || tensor
  moment_m = tensor.inputs[1]
  moment_v = tensor.inputs[2]

  # mark variable buffers as dirty to force a buffer copy when they are read externally
  target, m_buffer, v_buffer = [variable, moment_m, moment_v].map do |var|
    var.buffer.tap { |buf| buf.dirty = true }
  end

  rows, cols = target.shape
  rows ||= 1
  cols ||= 1
  dtype = target.data_type

  wait_for = build_event_wait_list(inputs)
  program = _cl_program('apply_adam', dtype: dtype)
  kernel_buffers = [grad, lr_t, beta1_power, beta2_power, beta1_t, beta2_t, epsilon_t,
                    m_buffer, target, v_buffer].map(&:cl_buffer)
  event = program.send(:"apply_adam_#{dtype}", _opencl_queue, [rows, cols],
                       OpenCL::Int1.new(rows), OpenCL::Int1.new(cols),
                       *kernel_buffers, event_wait_list: wait_for)

  [target, m_buffer, v_buffer].each { |buf| buf.op = event }
  target
end
99
+
100
# Softmax over the logits in inputs[0].
# Launches one work item per entry of the first dimension and passes the second
# dimension (or the first, when there is no second) as the kernel's length argument.
register_op :softmax do |_context, tensor, inputs|
  logits = inputs[0]
  wait_for = build_event_wait_list(inputs)
  dtype = tensor.data_type
  result = _create_result_buffer(dtype, logits.shape, tensor.name)

  rows, cols = logits.shape
  cols = rows if cols.nil?
  cl_n = OpenCL::Int1.new(cols || 1)

  program = _cl_program('softmax', dtype: dtype)
  result.op = program.send(:"softmax_#{dtype}", _opencl_queue, [rows], cl_n,
                           logits.cl_buffer, result.cl_buffer, event_wait_list: wait_for)
  result
end
115
+
116
# Log-softmax over the logits in inputs[0]; same launch layout as the softmax op
# (work size is the first dimension, the length argument is the second).
register_op :log_softmax do |_context, tensor, inputs|
  logits = inputs[0]
  wait_for = build_event_wait_list(inputs)
  dtype = tensor.data_type
  result = _create_result_buffer(dtype, logits.shape, tensor.name)

  rows, cols = logits.shape
  cols = rows if cols.nil?
  cl_n = OpenCL::Int1.new(cols || 1)

  program = _cl_program('log_softmax', dtype: dtype)
  result.op = program.send(:"log_softmax_#{dtype}", _opencl_queue, [rows], cl_n,
                           logits.cl_buffer, result.cl_buffer, event_wait_list: wait_for)
  result
end
131
+
132
# Softmax cross entropy between logits (inputs[0]) and labels (inputs[1]).
# The kernel fills two buffers; the first is sum-reduced over the last axis to
# produce the loss, the second is returned alongside it in an OutputGroup.
register_op :softmax_cross_entropy_with_logits_v2 do |context, tensor, inputs|
  logits = inputs[0]
  labels = inputs[1]
  wait_for = build_event_wait_list(inputs)
  dtype = tensor.data_type
  per_element = _create_result_buffer(dtype, logits.shape, tensor.name)
  backprop = _create_result_buffer(dtype, logits.shape, "#{tensor.name}_2")
  last_axis = logits.shape.size - 1

  rows, cols = logits.shape
  cols = rows if cols.nil?
  cl_n = OpenCL::Int1.new(cols || 1)

  program = _cl_program('softmax_cross', dtype: dtype)
  event = program.send(:"softmax_cross_#{dtype}", _opencl_queue, [rows], cl_n,
                       logits.cl_buffer, labels.cl_buffer,
                       per_element.cl_buffer, backprop.cl_buffer,
                       event_wait_list: wait_for)
  per_element.op = event
  backprop.op = event

  loss = reduction(context, tensor, per_element, last_axis, :sum)
  output_types = [tensor.inputs[0].data_type, tensor.inputs[0].data_type]
  TensorStream::Evaluator::OutputGroup.new([loss, backprop], output_types)
end
153
+
154
# Gradient of softmax cross entropy: feeds logits, labels and incoming grads
# (inputs[0..2]) to the "softmax_cross_grad" kernel.
register_op :softmax_cross_entropy_with_logits_v2_grad do |_context, tensor, inputs|
  logits = inputs[0]
  labels = inputs[1]
  grads = inputs[2]
  wait_for = build_event_wait_list(inputs)
  dtype = tensor.data_type
  result = _create_result_buffer(dtype, logits.shape, tensor.name)

  rows, cols = logits.shape
  cols = rows if cols.nil?
  cl_n = OpenCL::Int1.new(cols || 1)

  program = _cl_program('softmax_cross_grad', dtype: dtype)
  result.op = program.send(:"softmax_cross_grad_#{dtype}", _opencl_queue, [rows], cl_n,
                           logits.cl_buffer, labels.cl_buffer, grads.cl_buffer,
                           result.cl_buffer, event_wait_list: wait_for)
  result
end
171
+
172
# Gradient of softmax. The row length is also passed to the kernel template
# as the :size argument.
register_op :softmax_grad do |_context, tensor, inputs|
  logits, grad = inputs

  wait_for = build_event_wait_list(inputs)
  dtype = tensor.data_type
  result = _create_result_buffer(dtype, logits.shape, tensor.name)

  rows, cols = logits.shape
  cols = rows if cols.nil?
  cl_n = OpenCL::Int1.new(cols || 1)

  program = _cl_program('softmax_grad', dtype: dtype, size: cols)
  result.op = program.send(:"softmax_grad_#{dtype}", _opencl_queue, [rows], cl_n,
                           logits.cl_buffer, grad.cl_buffer, result.cl_buffer,
                           event_wait_list: wait_for)
  result
end
187
+ end
188
+ end
189
+ end
190
+ end
191
+ end
@@ -0,0 +1,35 @@
1
+ module TensorStream
2
# Buffer used by the OpenCL evaluator.
# Pairs a host-side buffer with its OpenCL counterpart (+cl_buffer+) and the
# event (+op+) of the last operation enqueued against it.
class OpenCLBuffer < Buffer
  include ArrayOpsHelper

  attr_accessor :shape, :buffer, :cl_buffer, :op

  def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
    @data_type, @shape, @name = data_type, shape, name
    @buffer, @cl_buffer, @op = buffer, cl_buffer, op
  end

  # Converts the buffer contents to plain ruby values.
  # When the buffer is flagged dirty the device copy is read back first.
  # Boolean buffers are mapped to true/false via "!= 0".
  def to_ruby
    return [] if buffer.empty?

    if dirty
      queue = op.command_queue
      queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
      op.command_queue.finish
      self.dirty = false
    end

    boolean = data_type == :boolean

    if shape.empty?
      scalar = buffer[0]
      return boolean ? scalar != 0 : scalar
    end

    # dimensions are reversed when reshaping the host buffer
    dims = shape.map(&:to_i).reverse
    nested = buffer.reshape(*dims).to_a
    return nested unless boolean

    process_function_op(nested, ->(a, _b) { a != 0 })
  end
end
35
+ end
@@ -0,0 +1,5 @@
1
+ module TensorStream
2
# Device descriptor that additionally carries the underlying OpenCL device object.
class OpenclDevice < TensorStream::Device
  attr_reader :native_device
  attr_writer :native_device
end
5
+ end
@@ -0,0 +1,933 @@
1
+ require 'tensor_stream/evaluator/operation_helpers/random_gaussian'
2
+ require 'tensor_stream/evaluator/operation_helpers/array_ops_helper'
3
+ require 'tensor_stream/evaluator/operation_helpers/math_helper'
4
+ require 'tensor_stream/evaluator/buffer'
5
+ require 'tensor_stream/opencl/opencl_buffer'
6
+ require 'tensor_stream/opencl/opencl_template_helper'
7
+ require 'tensor_stream/device'
8
+ require 'tensor_stream/evaluator/opencl/opencl_device'
9
+ require 'opencl_ruby_ffi'
10
+ require 'narray_ffi'
11
+ require 'tensor_stream/evaluator/base_evaluator'
12
+ require 'tensor_stream/opencl/math_ops'
13
+ require 'tensor_stream/opencl/nn_ops'
14
+ require 'tensor_stream/helpers/op_helper'
15
+
16
+ module TensorStream
17
+ module Evaluator
18
# Error type declared for the evaluator; it carries no extra state.
class FullEvalNotPossible < RuntimeError; end
20
+
21
# Errors during graph evaluation.
# Wraps the exception raised while evaluating a tensor together with that tensor.
class EvaluatorExcecutionException < RuntimeError
  attr_reader :tensor

  # @param exception the original error (anything responding to #message, or nil)
  # @param tensor the tensor that was being evaluated
  def initialize(exception, tensor)
    # Forward the wrapped error's message so #message is informative instead of
    # defaulting to this class's name. A nil exception keeps the default message.
    wrapped_message = if exception.respond_to?(:message)
                        exception.message
                      elsif !exception.nil?
                        exception.to_s
                      end
    super(wrapped_message)
    @exception = exception
    @tensor = tensor
  end

  # @return the original exception passed to the constructor
  def wrapped_exception
    @exception
  end
end
34
+
35
+ ## OpenCL-backed evaluator: runs graph operations through OpenCL kernels on a selected device
36
+ class OpenclEvaluator < BaseEvaluator
37
+ attr_accessor :retain
38
+ attr_reader :opencl_device
39
+
40
+ include TensorStream::OpHelper
41
+ include TensorStream::ArrayOpsHelper
42
+ include TensorStream::MathHelper
43
+ include TensorStream::OpenCLHelpers::MathOps
44
+ include TensorStream::OpenCLHelpers::NNOps
45
+
46
# Creates an OpenCL context and command queue for the device's native OpenCL handle.
def initialize(session, device, thread_pool: nil, log_intermediates: false)
  super
  native = device.native_device
  _create_opencl_context(native)
  @opencl_device = native
  create_command_queue
end
52
+
53
# Lists every available OpenCL device, highest score first.
#
# The original used `sort { |a| a[1] }`, whose block returns the score itself
# rather than a comparison result, so the ordering was arbitrary. `sort_by`
# orders by the score stored at index 1 of each entry.
#
# @return [Array] devices converted with opencl_to_device, best first
def self.query_supported_devices
  ranked = query_devices_with_score.sort_by { |entry| entry[1] }.reverse
  ranked.map { |entry| opencl_to_device(entry) }
end
59
+
60
# Looks up a device by platform name and index.
#
# @param query [Array] [platform_name, device_index]; the platform name is matched
#   as a case-insensitive substring of the platform string with spaces replaced
#   by underscores. A missing name matches every platform, a missing index is 0,
#   and an index past the end selects the last matching device.
# @raise [RuntimeError] when no available device matches the platform name
def self.fetch_device(query = [])
  # to_s keeps the default (empty) query usable instead of failing on nil.downcase
  wanted_platform = query[0].to_s.downcase

  # A literal substring test is used rather than building a regex from the name,
  # so names containing regex metacharacters such as "intel(r)_opencl" match themselves.
  platform_devices = query_devices_with_score.select do |entry|
    entry[0].platform.to_s.tr(' ', '_').downcase.include?(wanted_platform)
  end
  raise "no OpenCL device found for platform #{query[0].inspect}" if platform_devices.empty?

  opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
end
65
+
66
# Wraps a scored device entry (as produced by query_devices_with_score) in an
# OpenclDevice. The URI is "<platform name, lowercased, spaces as underscores>:<index>".
def self.opencl_to_device(d)
  native = d[0]
  position = d[3]
  platform_name = native.platform.name.tr(' ', '_').downcase
  uri = [platform_name, position].join(':')

  # anything that is not reported as a GPU is treated as a CPU device
  kind = native.type.to_s == 'GPU' ? :gpu : :cpu

  wrapped = OpenclDevice.new(uri, kind, self)
  wrapped.native_device = native
  wrapped
end
78
+
79
##
# Select the best device available in the system for this evaluator.
#
# The original sorted with `sort { |a| a[1] }`, which is not a valid comparator
# and therefore did not reliably pick the highest score; `max_by` selects the
# entry with the largest score (index 1) directly.
def self.default_device
  best = query_devices_with_score.max_by { |entry| entry[1] }
  opencl_to_device(best)
end
86
+
87
# opencl evaluator main entrypoint: fully evaluates the tensor, then converts
# the resulting buffer(s) to ruby values
def run(tensor, execution_context)
  evaluated = complete_eval(tensor, execution_context)
  read_final_result(evaluated)
end
91
+
92
# Evaluates a tensor (or an array of tensors) and wraps each ruby result in a
# plain Buffer. Also stores the context and makes sure its cache, when present,
# has a :_cl_buffers entry.
def run_with_buffer(tensor, context, execution_context)
  @context = context
  cache = context[:_cache]
  cache[:_cl_buffers] ||= {} if cache

  evaluate = lambda do |t|
    value = run(t, execution_context)
    Buffer.new(data_type: t.data_type, buffer: value)
  end

  tensor.is_a?(Array) ? tensor.map(&evaluate) : evaluate.call(tensor)
end
106
+
107
# buffer comes from non-opencl evaluator: converts its payload (a single value
# or every output of an OutputGroup) into opencl buffers
def convert_from_buffer(tensor, result)
  unless result.buffer.is_a?(TensorStream::Evaluator::OutputGroup)
    return convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer),
                             data_type: result.data_type, name: tensor.name)
  end

  pairs = result.buffer.outputs.zip(result.buffer.data_types)
  converted = pairs.map do |output, data_type|
    convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name)
  end
  TensorStream::Evaluator::OutputGroup.new(converted, result.buffer.data_types)
end
116
+
117
# Evaluates the tensor and enqueues a device-to-host read for each resulting
# non-empty buffer. The read is only enqueued here; callers still have to wait
# on the queue before using the host data.
def enqueue_buffer_read(tensor, context)
  schedule_read = lambda do |target|
    _opencl_queue.enqueue_read_buffer(target.cl_buffer, target.buffer,
                                      event_wait_list: build_event_wait_list([target]))
  end

  evaluated = _run(tensor, context)

  case evaluated
  when Array
    evaluated.map do |item|
      schedule_read.call(item) unless item.buffer.size.zero?
      item
    end
  when OutputGroup
    evaluated.outputs[0]
  when nil
    evaluated
  else
    return [] if evaluated.buffer.nil?

    schedule_read.call(evaluated) unless evaluated.buffer.size.zero?
    evaluated
  end
end
134
+
135
# Evaluates the tensor, enqueues the read-back and blocks until the command
# queue has finished; returns whatever enqueue_buffer_read returned.
def complete_eval(tensor, context)
  enqueue_buffer_read(tensor, context).tap { _opencl_queue.finish }
end
140
+
141
# Enumerates the available devices of every OpenCL platform and scores them.
#
# Score = type bonus (CPU 1, GPU 4) + 1000 on the 'NVIDIA CUDA' platform
#         + max compute units + max clock frequency.
#
# @return [Array<Array>] entries of [device, score, platform name, index within platform]
def self.query_devices_with_score
  OpenCL.platforms.flat_map do |platform|
    usable = platform.devices.select { |device| device.available > 0 }

    usable.each_with_index.map do |device, position|
      score = case device.type.to_s
              when 'CPU' then 1
              when 'GPU' then 4
              else 0
              end
      score += 1000 if device.platform.name == 'NVIDIA CUDA'
      score += device.max_compute_units
      score += device.max_clock_frequency

      [device, score, platform.name, position]
    end
  end
end
160
+
161
+ protected
162
+
163
# Resolves an op input according to the options the op was registered with:
#   :noop     - hand back the (placeholder-resolved) tensor unevaluated
#   :buffer   - evaluate and wait for the buffer to be read back
#   :complete - evaluate and convert to ruby values
#   otherwise - evaluate without waiting
def prepare_input(tensor, context, options = {})
  return nil unless tensor

  resolved = resolve_placeholder(tensor)
  return resolved if options[:noop]
  return complete_eval(resolved, context) if options[:buffer]
  return read_final_result(complete_eval(resolved, context)) if options[:complete]

  _run(resolved, context)
end
176
+
177
# read result from opencl and convert to ruby; arrays are converted element by
# element and nil is passed through
def read_final_result(buffer)
  case buffer
  when Array then buffer.map { |item| read_final_result(item) }
  when nil then nil
  else buffer.to_ruby
  end
end
184
+
185
# Creates the OpenCL context for the given native device and keeps it on the evaluator.
def _create_opencl_context(opencl_device)
  @opencl_context = OpenCL.create_context(opencl_device)
end
188
+
189
# Creates the command queue for this evaluator's device, enabling profiling and
# out-of-order execution whenever the device reports support for them.
def create_command_queue
  supported = opencl_device.queue_properties.names

  wanted = %w[PROFILING_ENABLE OUT_OF_ORDER_EXEC_MODE_ENABLE]
  properties = wanted.select { |name| supported.include?(name) }
                     .map { |name| OpenCL::CommandQueue.const_get(name) }

  @command_queue = _opencl_context.create_command_queue(opencl_device, properties: properties)
end
197
+
198
# @return the OpenCL context stored by _create_opencl_context
def _opencl_context
  @opencl_context
end
201
+
202
# @return the command queue stored by create_command_queue
def _opencl_queue
  @command_queue
end
205
+
206
# Path of a kernel source file inside the "kernels" directory next to this file.
def cl_template_path(kernel, extension)
  kernels_dir = File.join(File.dirname(__FILE__), 'kernels')
  File.join(kernels_dir, "#{kernel}.#{extension}")
end
209
+
210
# Returns the built OpenCL program for a kernel template, building it on first use.
#
# Programs are cached in @context[:_cache] under a key made of the kernel name,
# the template arguments and this evaluator's object_id. A "<kernel>.cl.erb"
# template takes precedence over "<kernel>.cl".
#
# Raises a RuntimeError when no template file exists; build failures print the
# build log and are re-raised.
def _cl_program(kernel, args = {})
  suffix = args.map { |key, value| "#{key}.#{escape_arg_content(value)}" }.join('.')
  cache = @context[:_cache]
  cache_key = "_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"

  cache[cache_key] ||= begin
    extension = %w[cl.erb cl].find { |ext| File.exist?(cl_template_path(kernel, ext)) }
    raise "opencl kernel template for #{kernel} has not yet been defined" if extension.nil?

    template = File.read(cl_template_path(kernel, extension))
    rendered = OpenclTemplateHelper.new(template).generate(args)
    # File.write("/tmp/#{kernel}.#{suffix}.cl", rendered)
    program = _opencl_context.create_program_with_source(rendered)
    program.build
  rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
    puts "OpenCL Compile error: #{program.build_log}"
    raise e
  end
end
225
+
226
# Normalizes a template argument for use inside a cache key: spaces in strings
# become underscores, arrays are joined with '-', anything else is returned as is.
def escape_arg_content(value)
  case value
  when String then value.tr(' ', '_')
  when Array then value.join('-')
  else value
  end
end
232
+
233
# Evaluates a tensor-like object and returns its buffer without waiting on the queue.
#
# * OpenCLBuffer instances are returned untouched.
# * A non-empty Array whose first element is a Tensor is evaluated element by element.
# * Procs are called first and their result is evaluated.
# * Operations not registered on this evaluator are delegated to the session and
#   the result is converted into an opencl buffer.
#
# The child context's :returns entry is merged back into execution_context.
def _run(tensor, execution_context)
  return tensor if tensor.is_a?(OpenCLBuffer)

  # Array#empty? is the intended check; the original called #empty? on the
  # Integer returned by #size, which raised NoMethodError for every Array input.
  if tensor.is_a?(Array) && !tensor.empty? && tensor[0].is_a?(Tensor)
    return tensor.map { |t| _run(t, execution_context) }
  end

  tensor = tensor.call if tensor.is_a?(Proc)

  child_context = execution_context.dup
  res = if tensor.is_a?(Operation)
          if self.class.ops.include?(tensor.operation.to_sym)
            eval_operation(tensor, child_context)
          else
            result = @session.delegate_to_evaluator(tensor, @context, execution_context)
            convert_from_buffer(tensor, result)
          end
        elsif tensor.is_a?(Variable)
          eval_variable(tensor, child_context)
        elsif tensor.is_a?(Placeholder)
          resolve_placeholder(tensor, child_context)
        else
          eval_tensor(tensor, child_context)
        end
  execution_context.deep_merge!(returns: child_context[:returns])
  res
end
257
+
258
# Returns the variable's buffer, creating it from the variable on first use.
# Raises when the variable has neither a value nor a dirty buffer.
def eval_variable(tensor, _child_context)
  existing = tensor.buffer
  uninitialized = tensor.value.nil? && (existing.nil? || !existing.dirty)
  raise "variable #{tensor.name} not initalized" if uninitialized

  tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
  tensor.buffer
end
263
+
264
# no_op does nothing and evaluates to nil
register_op(:no_op) { |_context, _tensor, _inputs| }
266
+
267
# Evaluates the predicate first, then only the selected branch:
# inputs[0] when every predicate element is true, inputs[1] otherwise.
register_op :cond, noop: true do |context, tensor, inputs|
  pred = complete_eval(tensor.options[:pred], context)
  chosen = all_true?(pred.buffer) ? inputs[0] : inputs[1]
  complete_eval(chosen, context)
end
276
+
277
# Passes inputs[0] through; any additional inputs of the tensor are fully
# evaluated first.
register_op :identity do |context, tensor, inputs|
  has_extra_inputs = tensor.inputs.size > 1
  tensor.inputs[1..inputs.size].each { |extra| complete_eval(extra, context) } if has_extra_inputs
  inputs[0]
end
283
+
284
# Stores inputs[1] into the target variable via assign_var
register_op(:assign, noop: true) do |context, tensor, inputs|
  assign_var(tensor, inputs[1], context)
end
287
+
288
# assign_add / assign_sub: combine inputs[0] and inputs[1] with the 'add' / 'sub'
# two-operand kernel, then assign the result to the variable
{ assign_add: 'add', assign_sub: 'sub' }.each do |op_name, kernel|
  register_op op_name do |context, tensor, inputs|
    updated = execute_2_operand_func(kernel, tensor, inputs[0], inputs[1], context)
    assign_var(tensor, updated, context)
  end
end
297
+
298
# Resolves the variable referenced by tensor.inputs[0] to its buffer, creating
# the buffer on first use. Raises when it has neither a value nor a dirty buffer.
register_op :variable, noop: true do |context, tensor, inputs|
  target = tensor.inputs[0]
  existing = target.buffer
  uninitialized = target.value.nil? && (existing.nil? || !existing.dirty)
  raise "variable #{tensor.name} not initalized" if uninitialized

  target.buffer = wrap_opencl(target, name: target.name) if target.buffer.nil?
  target.buffer
end
304
+
305
# Comparison and logical ops all go through the shared 'cond' kernel template,
# selected by the op's own name
[:less, :less_equal, :greater, :greater_equal, :equal, :not_equal, :logical_and].each do |op|
  register_op(op, noop: true) do |context, tensor, inputs|
    lhs = inputs[0]
    rhs = inputs[1]
    execute_2_operand_func(op.to_s, tensor, lhs, rhs, context, 'cond')
  end
end
310
+
311
# Inserts a dimension of size 1 at the axis given by inputs[1] and re-wraps the
# (unchanged) data of inputs[0] with the new shape.
register_op :expand_dims, buffer: true do |_context, tensor, inputs|
  axis = inputs[1].buffer[0]
  shape = inputs[0].shape.dup

  # Array#insert already appends when axis == shape.size and accepts negative
  # axes (-1 appends). The original negated the axis whenever it equaled the
  # rank, which for rank >= 2 put the new dimension one slot too early
  # (shape [2, 3], axis 2 produced [2, 1, 3] instead of [2, 3, 1]).
  # compact drops the nil padding insert adds for an axis beyond the rank.
  new_shape = shape.insert(axis, 1).compact

  # dimensions are reversed when reshaping the host buffer
  new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
  convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
end
319
+
320
# Creates a tensor of the shape given by inputs[0], filled with the scalar in
# inputs[1]. A cached buffer for this tensor name and shape is reused when available.
register_op :fill, buffer: true do |_context, tensor, inputs|
  dims_input = inputs[0]
  fill_input = inputs[1]

  element_count = dims_input.buffer.to_a.reduce(:*) || 1
  cached = get_cached_buffer(tensor.name, dims_input.buffer.to_a)
  storage = cached ? cached.buffer : allocate_narray_for_type(tensor.data_type, element_count)

  storage.fill!(fill_input.buffer[0])
  convert_to_opencl(storage, dims_input.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
end
336
+
337
# Element-wise selection between inputs[0] and inputs[1], driven by the :pred option
register_op(:where, noop: true) do |context, tensor, inputs|
  predicate = tensor.options[:pred]
  execute_cond_func('where', tensor, predicate, inputs[0], inputs[1], context)
end
341
+
342
# Converts inputs[0] to the tensor's data type using the "cast" kernel.
# When the types already match the input buffer is returned unchanged.
register_op :cast do |_context, tensor, inputs|
  source = inputs[0]
  next source if source.data_type == tensor.data_type

  casted = _create_result_buffer(tensor.data_type, source.shape, tensor.name)
  rows, cols = source.shape
  cl_m = OpenCL::Int1.new(rows || 1)
  cl_n = OpenCL::Int1.new(cols || 1)
  work_group = [rows || 1, cols || 1]
  wait_for = build_event_wait_list(inputs)

  program = _cl_program('cast', source_dt: source.data_type, target_dt: tensor.data_type)
  casted.op = program.cast(_opencl_queue, work_group, cl_m, cl_n,
                           source.cl_buffer, casted.cl_buffer, event_wait_list: wait_for)
  casted
end
357
+
358
# Stacks the inputs along a new axis (:axis option, default 0) using the "pack"
# kernel, launched once per input. The output buffer's op is the array of all
# resulting events.
register_op :stack do |_context, tensor, inputs|
  axis = tensor.options[:axis] || 0
  item_shape = inputs[0].shape
  rank = item_shape.size + 1
  elem_size = item_shape.empty? ? 1 : item_shape.reduce(:*)

  # running products of the trailing dimensions, largest first, ending in 1
  trailing_products = lambda do |dims|
    dims.drop(1).reverse.inject([1]) { |acc, dim| acc << dim * acc.last }.reverse
  end

  # shape with the new dimension in front: [input count, *item shape]
  new_shape = [inputs.size, *item_shape]
  divisors = trailing_products.call(new_shape)

  # move the new dimension from the front to the requested axis
  axis = rank + axis if axis < 0
  leading = Array.new(axis + 1) { new_shape.shift }
  new_shape = leading.rotate! + new_shape

  output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
  multipliers = trailing_products.call(new_shape)

  cl_n = OpenCL::Int1.new(elem_size)
  work_group = [elem_size]
  wait_for = build_event_wait_list(inputs)

  events = inputs.each_with_index.map do |input, position|
    cl_index = OpenCL::Int1.new(position)
    program = _cl_program('pack', data_type: tensor.data_type, divisors: divisors,
                                  multipliers: multipliers, axis: axis)
    program.pack(_opencl_queue, work_group, cl_n, cl_index,
                 input.cl_buffer, output_buffer.cl_buffer, event_wait_list: wait_for)
  end

  output_buffer.op = events
  output_buffer
end
390
+
391
# Fully evaluates inputs[0] and raises InvalidArgumentError as soon as an
# element is NaN or infinite; otherwise returns the evaluated buffer.
register_op :check_numerics, noop: true do |context, tensor, inputs|
  label = tensor.options[:name]

  complete_eval(inputs[0], context).tap do |evaluated|
    evaluated.buffer.each do |element|
      invalid = element.nan? || element.infinite?
      raise TensorStream::InvalidArgumentError, "#{label} Invalid Argument" if invalid
    end
  end
end
400
+
401
# Returns both inputs with compatible shapes. Equal shapes are passed through;
# otherwise both are read back to ruby, broadcast on the host and wrapped into
# new opencl buffers (both using the first input's data type).
register_op :broadcast_transform do |context, tensor, inputs|
  a, b = inputs
  next [a, b] if a.shape == b.shape

  host_a = read_final_result(complete_eval(a, context))
  host_b = read_final_result(complete_eval(b, context))
  expanded_a, expanded_b = broadcast(host_a, host_b)

  [
    wrap_opencl(expanded_a, data_type: a.data_type, name: "#{tensor.name}_a"),
    wrap_opencl(expanded_b, data_type: a.data_type, name: "#{tensor.name}_b")
  ]
end
414
+
415
# Prints the evaluated second input, prefixed by the :message option, and
# passes the first input through unchanged.
register_op :print do |context, tensor, inputs|
  passthrough, payload = inputs
  printable = read_final_result(complete_eval(payload, context))
  puts "#{tensor.options.fetch(:message, '')} #{printable}"
  passthrough
end
422
+
423
# Rank (number of dimensions) of inputs[0], wrapped as an opencl buffer
register_op(:rank) do |_context, tensor, inputs|
  dimension_count = inputs[0].shape.size
  wrap_opencl(dimension_count, data_type: tensor.data_type, name: tensor.name)
end
426
+
427
# stop_gradient hands back its first input unchanged
register_op(:stop_gradient) { |_context, _tensor, inputs| inputs[0] }
430
+
431
# Extracts a sub-region of inputs[0] on the host: inputs[1] holds the start
# offsets, the :size option the extent per dimension.
register_op :slice, noop: true do |context, tensor, inputs|
  source = complete_eval(inputs[0], context)
  begins = read_final_result(complete_eval(inputs[1], context))
  size = tensor.options[:size]

  # one inclusive range per dimension, reversed like the reshape dimensions below
  ranges = begins.zip(size).map { |start, length| start..(start + length - 1) }.reverse

  reshaped = source.buffer.reshape(*source.shape.reverse)
  sliced = reshaped.slice[*ranges]
  convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
end
442
+
443
# Transposes inputs[0] on the host. A rank-2 input without an explicit
# permutation is transposed directly on the host buffer; every other case goes
# through transpose_with_perm (reversed axes by default) and is then written to
# the device.
register_op :transpose, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  rank = source.shape.size

  if rank == 2 && inputs[1].nil?
    reversed_axes = Array.new(rank) { |index| index }.reverse
    flipped = source.buffer.reshape(*source.shape.reverse).transpose(*reversed_axes)
    convert_to_opencl(flipped.flatten, flipped.shape.reverse, data_type: source.data_type, name: tensor.name)
  else
    perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
    new_shape = perm.map { |p| source.shape[p] }.to_a
    output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
    transpose_with_perm(source.buffer, output_buffer.buffer, source.shape, new_shape, perm)

    output_buffer.op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
    output_buffer
  end
end
462
+
463
# Selects one element of inputs[0] by the index in inputs[1]: an output of an
# OutputGroup, an element of an Array, or - for a buffer - the sub-tensor at
# that position along the first dimension (read back and re-wrapped).
register_op :index, noop: true do |context, tensor, inputs|
  source = _run(inputs[0], context)
  position = read_final_result(_run(inputs[1], context))

  case source
  when OutputGroup
    source.outputs[position]
  when Array
    source[position]
  else
    remaining_shape = source.shape.drop(1)
    host_values = read_final_result(source)
    convert_to_opencl(host_values[position], remaining_shape, data_type: source.data_type, name: tensor.name)
  end
end
478
+
479
# Computes the broadcast gradient arguments for two shapes on the host and returns both
# results as int32 buffers inside an OutputGroup.
register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
  shape_x = inputs[0].buffer.to_a
  shape_y = inputs[1].buffer.to_a
  rx, ry = get_broadcast_gradient_args(shape_x, shape_y)

  outputs = [
    wrap_opencl(rx, data_type: :int32, name: tensor.name),
    wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")
  ]
  OutputGroup.new(outputs, tensor.inputs.map(&:data_type))
end
483
+
484
# Shape of inputs[0], wrapped as an opencl buffer of the tensor's data type
register_op(:shape) do |_context, tensor, inputs|
  dimensions = inputs[0].shape
  wrap_opencl(dimensions, name: tensor.name, data_type: tensor.data_type)
end
487
+
488
# Re-wraps the data of inputs[0] with the shape given by inputs[1]. An empty
# target shape is kept as is for single-element data; otherwise the requested
# shape is passed through TensorShape.fix_inferred_elements together with the
# element count.
register_op :reshape, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  requested = read_final_result(inputs[1])
  element_count = source.buffer.size

  to_scalar = requested.size.zero? && element_count == 1
  shape = to_scalar ? requested : TensorShape.fix_inferred_elements(requested, element_count)

  convert_to_opencl(source.buffer, shape, data_type: source.data_type, name: tensor.name)
end
500
+
501
# Waits for the command queue to drain; the op itself produces no value
register_op(:flow_group) do |_context, _tensor, _inputs|
  _opencl_queue.finish
  nil
end
505
+
506
# Number of elements in inputs[0]; the result type is the :out_type option, int32 by default
register_op(:size) do |_context, tensor, inputs|
  result_type = tensor.options[:out_type] || :int32
  wrap_opencl(inputs[0].buffer.size, name: tensor.name, data_type: result_type)
end
509
+
510
# Evaluates an operation tensor and memoises what invoke returns for it.
#
# The persistent cache (@context[:_cache]) is consulted first, then the
# per-run context; only on a miss is the op invoked. When the tensor has a
# breakpoint, its first two inputs and the result are read back and handed
# to the breakpoint callback. When @log_intermediates is set, an entry is
# appended to @context[:compute_history].
#
# Any failure finishes the OpenCL queue before being re-raised with the
# tensor name, its math representation and its source location in the
# message.
#
# @param tensor the operation to evaluate
# @param child_context execution context forwarded to invoke/complete_eval
# @return the cached value, or the object returned by invoke
# @raise [EvaluatorExcecutionException] for any StandardError raised while
#   evaluating (TensorStreamError and EvaluatorExcecutionException are
#   re-raised with the extended message)
def eval_operation(tensor, child_context)
  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
  return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
  return @context[cache_key] if @context.key?(cache_key)

  invoke(tensor, child_context).tap do |result|
    if tensor.breakpoint
      a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
      b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
      a = read_final_result(complete_eval(a, child_context))
      b = read_final_result(complete_eval(b, child_context))
      # Keep the read-back copy in its own local. Reassigning `result` here
      # would make the caches below store the read-back value instead of
      # the object returned by invoke.
      final_value = read_final_result(complete_eval(result, child_context))

      tensor.breakpoint.call(tensor, a, b, final_value)
    end

    if @log_intermediates
      @context[:compute_history] << {
        name: tensor.name,
        type: tensor.data_type,
        shape: shape_eval(result),
        source: tensor.source,
        description: tensor.to_math(true, 1),
        value: result
      }
    end

    @context[cache_key] = result
    @context[:_cache][cache_key] = result if tensor.is_const
  end
rescue EvaluatorExcecutionException, TensorStreamError => e
  _opencl_queue.finish # dump queue
  raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
rescue StandardError => e
  _opencl_queue.finish # dump queue
  puts e.message
  puts e.backtrace.join("\n")

  raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
end
565
+
566
# Resolves a plain tensor to its evaluated form, memoising the result.
#
# Anything that is not a Tensor is returned untouched. The per-run context
# is checked first; constants are additionally looked up in, and stored to,
# the persistent cache. A tensor whose value is itself a Tensor is evaluated
# through _run, any other tensor is wrapped with wrap_opencl.
def eval_tensor(tensor, child_context)
  return tensor unless tensor.is_a?(Tensor)

  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
  return @context[cache_key] if @context.key?(cache_key)

  if tensor.is_const
    persisted = @context[:_cache][cache_key]
    return persisted if persisted
  end

  evaluated = if tensor.value.is_a?(Tensor)
                _run(tensor.value, child_context)
              else
                wrap_opencl(tensor, name: tensor.name)
              end

  @context[cache_key] = evaluated
  @context[:_cache][cache_key] = evaluated if tensor.is_const
  evaluated
end
580
+
581
+ private
582
+
583
# Stores the evaluated value of +b+ into the variable behind +tensor+
# (tensor.inputs[0] when present, otherwise the tensor itself).
#
# If the variable already owns a buffer, a device-side copy is enqueued
# unless source and destination share the same cl_buffer, in which case the
# source's pending op is reused. Otherwise the value is read back and a new
# buffer is created for the variable. The variable's buffer is flagged dirty
# and returned.
def assign_var(tensor, b, child_context)
  target = tensor.inputs[0] || tensor
  source = complete_eval(b, child_context)

  if target.buffer
    wait_events = build_event_wait_list([source, target.buffer])
    needs_copy = target.buffer.cl_buffer != source.cl_buffer
    target.buffer.op = if needs_copy
                         _opencl_queue.enqueue_copy_buffer(source.cl_buffer, target.buffer.cl_buffer, event_wait_list: wait_events)
                       else
                         source.op
                       end
  else
    host_value = read_final_result(source)
    target.buffer = convert_to_opencl(host_value, source.shape, data_type: tensor.data_type, name: target.name)
    target.value = host_value
  end

  target.buffer.dirty = true
  target.buffer
end
603
+
604
# Runs a two-operand kernel over the evaluated inputs.
#
# Both inputs are evaluated and passed through auto_type_cast, and the
# output shape comes from TensorShape.infer_shape. A [0] result shape
# short-circuits to an empty result buffer. select_program picks the kernel
# variant and may swap the operands; the "<op>_b" variant additionally
# receives the two dimensions of b. The enqueued event is stored on the
# output buffer.
#
# @param prog_name program to load; defaults to op_name when nil
# @raise [RuntimeError] when the "<op>_b" variant is chosen and b has a rank
#   other than 1 or 2
def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
  a = _run(input_a, child_context)
  b = _run(input_b, child_context)
  a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  dtype = tensor.data_type
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
  return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]

  output_buffer = _create_result_buffer(dtype, result_shape, "out_#{tensor.name}")
  a, b, prog, switch_operands = select_program(a, b, op_name)

  rows, cols = result_shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  kernel_args = [OpenCL::Int1.new(rows), OpenCL::Int1.new(cols)]
  cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition

  wait_events = build_event_wait_list([a, b]) # add dependency wait list
  kernel = :"#{prog}_#{a.data_type}_#{b.data_type}"
  prog_name ||= op_name

  if prog == "#{op_name}_b"
    b_rows, b_cols = case b.shape.size
                     when 2 then b.shape
                     when 1 then [1, b.shape[0]]
                     else raise "rank > 2 not supported!"
                     end
    kernel_args.push(OpenCL::Int1.new(b_rows), OpenCL::Int1.new(b_cols))
  end
  kernel_args.push(cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)

  program = _cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype)
  output_buffer.op = program.send(kernel, _opencl_queue, work_group, *kernel_args, event_wait_list: wait_events)
  output_buffer
end
643
+
644
# Runs a predicate-driven kernel: evaluates the predicate and both value
# inputs, passes the values through auto_type_cast, and enqueues the
# "<op_name>_<dtype>" kernel with the predicate buffer followed by the two
# value buffers. The output buffer takes its shape from the predicate.
def execute_cond_func(op_name, tensor, pred, input_a, input_b, child_context)
  predicate = _run(pred, child_context)
  a = _run(input_a, child_context)
  b = _run(input_b, child_context)

  a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  dtype = tensor.data_type

  output_buffer = _create_result_buffer(dtype, predicate.shape, tensor.name)

  rows, cols = predicate.shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  cl_m = OpenCL::Int1.new(rows)
  cl_n = OpenCL::Int1.new(cols)

  wait_events = build_event_wait_list([a, b, predicate]) # add dependency wait list
  program = _cl_program(op_name.to_s, dtype: dtype)
  output_buffer.op = program.send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n,
                                  predicate.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer,
                                  event_wait_list: wait_events)
  output_buffer
end
663
+
664
# Runs a single-operand kernel: evaluates +a+, allocates an output buffer of
# the same shape and enqueues the "<op_name>_<dtype>" kernel on it. The
# enqueued event is stored on the returned buffer.
def execute_func(op_name, tensor, a, child_context)
  operand = _run(a, child_context)
  wait_events = build_event_wait_list([operand])
  dtype = tensor.data_type
  output_buffer = _create_result_buffer(dtype, operand.shape, tensor.name)

  rows, cols = operand.shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  cl_m = OpenCL::Int1.new(rows)
  cl_n = OpenCL::Int1.new(cols)

  program = _cl_program(op_name.to_s, dtype: dtype)
  output_buffer.op = program.send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n,
                                  operand.cl_buffer, output_buffer.cl_buffer, event_wait_list: wait_events)
  output_buffer
end
679
+
680
# Returns the pair unchanged when both buffers already share a data type.
# Otherwise enqueues the "cast" program over b's buffer into a fresh result
# buffer and returns [a, that_buffer].
#
# NOTE(review): the result buffer is allocated with b.data_type and the
# program is built with source_dt: a.data_type / target_dt: b.data_type even
# though the buffer being read is b's. This looks inverted; confirm against
# the cast kernel before relying on it.
def auto_type_cast(a, b, name: nil)
  return [a, b] if a.data_type == b.data_type

  rows, cols = b.shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  wait_events = build_event_wait_list([b])
  converted = _create_result_buffer(b.data_type, b.shape, name)

  cl_m = OpenCL::Int1.new(rows)
  cl_n = OpenCL::Int1.new(cols)

  program = _cl_program("cast", source_dt: a.data_type, target_dt: b.data_type)
  converted.op = program.cast(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, converted.cl_buffer, event_wait_list: wait_events)
  [a, converted]
end
693
+
694
# Converts +source+ to +data_type+ by enqueuing the "cast" program into a
# new result buffer of the same shape. A source that already has the
# requested type is returned as-is.
def type_cast(source, data_type, name: nil)
  return source if source.data_type == data_type

  rows, cols = source.shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  wait_events = [source.op].compact
  converted = _create_result_buffer(data_type, source.shape, name)

  cl_m = OpenCL::Int1.new(rows)
  cl_n = OpenCL::Int1.new(cols)

  program = _cl_program("cast", source_dt: source.data_type, target_dt: data_type)
  converted.op = program.cast(_opencl_queue, work_group, cl_m, cl_n, source.cl_buffer, converted.cl_buffer, event_wait_list: wait_events)
  converted
end
707
+
708
# Builds an OpenCL buffer from either a Tensor (using its value and declared
# shape) or a raw Ruby value (shape derived with shape_eval). When no
# data_type is given it is read from the argument itself.
def wrap_opencl(tensor, data_type: nil, name: nil)
  if tensor.is_a?(Tensor)
    value = tensor.value
    shape = tensor.shape.shape
  else
    value = tensor
    shape = shape_eval(tensor)
  end

  convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
end
717
+
718
# Looks up a previously converted buffer by name and shape in the persistent
# cache; returns nil when nothing is stored under that key.
def get_cached_buffer(name, shape)
  shape_tag = shape.join('_')
  @context[:_cache]["_cl_object_#{name}:#{shape_tag}:#{object_id}"]
end
722
+
723
# Copies a Ruby value (scalar, nested Array or NArray) into an OpenCLBuffer
# and enqueues the host-to-device write.
#
# Buffers are cached by name, shape and evaluator; a named buffer found in
# the cache is refilled instead of reallocated.
#
# @param value scalar, Array (may contain Tensors, which are evaluated) or NArray
# @param shape [Array<Integer>] target shape; [] denotes a scalar
# @param data_type element type used for allocation and element casting
# @param name cache name of the buffer; nil disables the cache lookup
# @return [OpenCLBuffer, nil] nil when no host buffer can be allocated
#   (allocate_narray_for_type returns nil for :unknown)
def convert_to_opencl(value, shape, data_type: nil, name: nil)
  # Scalars are wrapped, so from here on value is always an Array or NArray.
  value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)

  cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
  cl_object = if name && @context[:_cache][cache_key]
                @context[:_cache][cache_key]
              else
                narray_size = shape.reduce(:*) || 1

                buffer = if value.is_a?(NArray)
                           value
                         else
                           allocate_narray_for_type(data_type, narray_size)
                         end

                return nil if buffer.nil?

                cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)

                # No device memory is allocated for an empty value.
                cl_buffer = unless value.flatten.empty?
                              cl_buffer_size = 1 if cl_buffer_size.zero?
                              _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
                            end

                @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
              end

  if value.is_a?(Array)
    value.flatten.each_with_index do |element, index|
      cl_object.buffer[index] = if element.is_a?(Tensor)
                                  read_final_result(complete_eval(element, {}))
                                elsif data_type == :boolean
                                  element ? 1 : 0
                                else
                                  Tensor.cast_dtype(element, data_type)
                                end
    end
  else
    # value is an NArray here; the former scalar branches could never run
    # because scalars are wrapped into an Array at the top of the method.
    cl_object.buffer = value
  end

  write_op = nil
  if cl_object.cl_buffer && (!value.is_a?(Array) || !value.empty?)
    write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
  end
  cl_object.op = write_op
  cl_object
end
772
+
773
# Allocates a host-side NArray of +narray_size+ elements for +data_type+.
# Returns nil for :unknown and raises for any type not listed here.
def allocate_narray_for_type(data_type, narray_size)
  case data_type
  when :float, :float32 then NArray.sfloat(narray_size)
  when :float64 then NArray.float(narray_size)
  when :int, :int32, :int64 then NArray.int(narray_size)
  when :int16, :boolean then NArray.sint(narray_size)
  when :unknown then nil
  else raise "unsupported type #{data_type}"
  end
end
791
+
792
# Returns the output buffer for a kernel result, reusing a pooled one when a
# buffer with the same name and shape was created before. A [0] shape yields
# a fresh placeholder buffer without host or device memory.
def _create_result_buffer(data_type, shape, name)
  if shape == [0]
    return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil)
  end

  pool = @context[:_cache][:_cl_buffers]
  pool_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
  pooled = pool[pool_key]
  return pooled if pooled

  element_count = shape.empty? ? 1 : shape.reduce(:*)
  host = allocate_narray_for_type(data_type, element_count)
  device = _opencl_context.create_buffer(host.size * host.element_size)
  pool[pool_key] = OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: host, cl_buffer: device, name: name)
end
801
+
802
# Finds, along +target_axis+, the index of the element that wins under +op+
# (strictly greater by default, so the first of equal candidates is kept).
# Recurses through the nested arrays until +current_axis+ reaches the
# target. At the target, a nested array gives one index per column and a
# flat array gives a single index. Indices are cast with Tensor.cast_dtype.
def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
  unless target_axis == current_axis
    return a.collect { |row| get_op_with_axis(row, target_axis, current_axis + 1, output_type, op) }
  end

  winning_index = lambda do |values|
    best = nil
    best_at = 0
    values.each_with_index do |candidate, position|
      next unless best.nil? || op.call(candidate, best)

      best = candidate
      best_at = position
    end
    Tensor.cast_dtype(best_at, output_type)
  end

  return winning_index.call(a) unless a[0].is_a?(Array)

  (0...a[0].size).map do |column_index|
    winning_index.call(a.map { |row| row[column_index] })
  end
end
834
+
835
# Computes the keep-dims shape of a reduction: every reduced axis is set to 1.
#
# @param input_shape [Array<Integer>] shape of the tensor being reduced
# @param axes [Integer, Array<Integer>, nil] axes being reduced
# @return [Array<Integer>] [] when axes is nil, the input shape itself when
#   axes is an empty array, otherwise a new array with the reduced axes set
#   to 1 (the argument is never modified)
def _reduced_shape(input_shape, axes)
  return [] if axes.nil? # reduce to scalar

  axes = [axes] unless axes.is_a?(Array)
  return input_shape if axes.empty?

  # Work on a copy so the caller's shape array is left untouched.
  reduced = input_shape.dup
  axes.each { |dimen| reduced[dimen] = 1 }
  reduced
end
845
+
846
# Reduces the evaluated input on the host with the NArray method +func+.
#
# With a nil axis the whole buffer is reduced to a scalar. Otherwise the
# host buffer is reshaped with the reversed tensor shape, so tensor axis t
# corresponds to NArray dimension (rank - 1 - t). Negative axes are
# normalised by adding the rank before that mapping; the previous
# `rank - axis.abs` form picked the wrong dimension for negative axes on
# inputs of rank 3 or more.
#
# @param a input tensor or buffer
# @param b axis as Integer, Array, Tensor or nil
# @param func [Symbol] method sent to the host buffer
# @return the converted result buffer, or the input itself when it is a
#   scalar and an axis was given
def reduction(child_context, tensor, a, b, func)
  input = complete_eval(a, child_context)
  axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b

  if axis.nil?
    red = input.buffer.send(func)
    return convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
  end

  return input if input.shape.empty?

  value = input.buffer.reshape(*input.shape.reverse)
  dims = input.shape.size

  # Maps a tensor axis (possibly negative) to the matching NArray dimension.
  to_narray_dim = lambda do |tensor_axis|
    normalized = tensor_axis.to_i
    normalized += dims if normalized < 0
    dims - 1 - normalized
  end

  if axis.is_a?(Array)
    # Reduce the highest dimension first so the remaining indices stay valid.
    axis.map(&to_narray_dim).sort.reverse_each do |dim|
      value = value.send(func, dim)
    end
  else
    value = value.send(func, to_narray_dim.call(axis))
  end

  new_shape = if value.is_a?(NArray)
                value.shape.reverse
              else
                value = [value]
                []
              end

  new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]

  convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
end
877
+
878
# Selects the kernel variant for a two-operand op from the operand shapes.
#
# Returns [first_operand, second_operand, program_name, switched] where
# switched is 1 when the operands were swapped and 0 otherwise:
# * identical shapes              -> "<op>"
# * one operand has <= 1 element  -> "<op>_c", that operand placed second
# * anything else                 -> "<op>_b", the operand with lower rank,
#   or with a smaller dimension at equal rank, placed second
def select_program(input_a, input_b, op)
  return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape

  scalar_like = ->(operand) { operand.shape.empty? || operand.shape.reduce(:*) == 1 }

  return [input_b, input_a, "#{op}_c", 1] if scalar_like.call(input_a) # A is scalar?
  # This check used to look at input_a's element count, which the line
  # above had already ruled out.
  return [input_a, input_b, "#{op}_c", 0] if scalar_like.call(input_b) # B is scalar?

  return [input_b, input_a, "#{op}_b", 1] if input_a.shape.size < input_b.shape.size

  if input_a.shape.size == input_b.shape.size
    input_a.shape.zip(input_b.shape).each do |s1, s2|
      return [input_b, input_a, "#{op}_b", 1] if s1 < s2
    end
  end

  [input_a, input_b, "#{op}_b", 0]
end
895
+
896
# Rank implied by a shape: its length when it is an Array, otherwise 0.
def _rank_from_shape(shape)
  return 0 unless shape.is_a?(Array)

  shape.size
end
899
+
900
# Collects the pending ops of the given buffers into a flat wait list.
#
# Nil inputs are skipped, and so are inputs whose op is nil, so the list
# handed to the OpenCL enqueue calls never contains nil.
#
# @param inputs [Array] objects responding to #op; entries may be nil
# @return [Array] flattened list of non-nil ops
def build_event_wait_list(inputs)
  inputs.compact.map(&:op).flatten.compact
end
903
+
904
# Resolves a placeholder to the value supplied in the evaluator context.
#
# nil is passed through. A Placeholder is looked up in @context under its
# symbolised name, and a missing entry raises. A resolved value that is a
# Tensor goes through Tensor.cast_dtype; anything else is converted into an
# OpenCL buffer named after the placeholder.
def resolve_placeholder(placeholder, _execution_context = {})
  return nil if placeholder.nil?

  var = placeholder
  if placeholder.is_a?(Placeholder)
    var = @context[placeholder.name.to_sym]
    raise "missing placeholder #{placeholder.name}" if var.nil?
  end

  return Tensor.cast_dtype(var, placeholder.data_type) if var.is_a?(Tensor)

  convert_to_opencl(var, shape_eval(var), data_type: placeholder.data_type, name: placeholder.name)
end
918
+
919
# True when every leaf of a (possibly nested) Array/NArray differs from 0.
# A non-collection argument is compared against 0 directly.
def all_true?(arr)
  return arr != 0 unless arr.is_a?(Array) || arr.is_a?(NArray)

  arr.each { |a| return false unless all_true?(a) }
  true
end
929
+ end
930
+ end
931
+ end
932
+
933
# Registers OpenclEvaluator with TensorStream under the name 'opencl'.
# NOTE(review): the trailing 1 is presumably a priority/ordering value; confirm against register_evaluator.
+ TensorStream::Evaluator.register_evaluator(TensorStream::Evaluator::OpenclEvaluator, 'opencl', 1)