tensor_stream 0.8.1 → 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGELOG.md +8 -0
  4. data/README.md +12 -6
  5. data/lib/tensor_stream.rb +1 -0
  6. data/lib/tensor_stream/evaluator/base_evaluator.rb +1 -1
  7. data/lib/tensor_stream/evaluator/ruby/array_ops.rb +282 -0
  8. data/lib/tensor_stream/evaluator/ruby/images_ops.rb +61 -0
  9. data/lib/tensor_stream/evaluator/ruby/math_ops.rb +111 -0
  10. data/lib/tensor_stream/evaluator/ruby/nn_ops.rb +48 -9
  11. data/lib/tensor_stream/evaluator/ruby/random_ops.rb +51 -0
  12. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -433
  13. data/lib/tensor_stream/images.rb +16 -0
  14. data/lib/tensor_stream/ops.rb +5 -1
  15. data/lib/tensor_stream/session.rb +15 -15
  16. data/lib/tensor_stream/tensor.rb +1 -1
  17. data/lib/tensor_stream/train/adadelta_optimizer.rb +52 -0
  18. data/lib/tensor_stream/train/adam_optimizer.rb +17 -2
  19. data/lib/tensor_stream/train/gradient_descent_optimizer.rb +7 -1
  20. data/lib/tensor_stream/trainer.rb +1 -0
  21. data/lib/tensor_stream/types.rb +4 -0
  22. data/lib/tensor_stream/utils.rb +4 -0
  23. data/lib/tensor_stream/variable_scope.rb +1 -0
  24. data/lib/tensor_stream/version.rb +1 -1
  25. data/samples/linear_regression.rb +4 -1
  26. data/samples/mnist_data.rb +64 -0
  27. data/samples/nearest_neighbor.rb +1 -2
  28. data/samples/raw_neural_net_sample.rb +1 -1
  29. data/tensor_stream.gemspec +1 -0
  30. metadata +23 -57
  31. data/lib/tensor_stream/evaluator/opencl/kernels/_bool_operand.cl +0 -45
  32. data/lib/tensor_stream/evaluator/opencl/kernels/_operand.cl +0 -45
  33. data/lib/tensor_stream/evaluator/opencl/kernels/abs.cl +0 -20
  34. data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +0 -8
  35. data/lib/tensor_stream/evaluator/opencl/kernels/add.cl +0 -3
  36. data/lib/tensor_stream/evaluator/opencl/kernels/apply_adam.cl +0 -23
  37. data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +0 -9
  38. data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +0 -16
  39. data/lib/tensor_stream/evaluator/opencl/kernels/argmax.cl +0 -8
  40. data/lib/tensor_stream/evaluator/opencl/kernels/argmin.cl +0 -8
  41. data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +0 -9
  42. data/lib/tensor_stream/evaluator/opencl/kernels/cast.cl +0 -10
  43. data/lib/tensor_stream/evaluator/opencl/kernels/ceil.cl +0 -8
  44. data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +0 -6
  45. data/lib/tensor_stream/evaluator/opencl/kernels/cos.cl +0 -8
  46. data/lib/tensor_stream/evaluator/opencl/kernels/div.cl.erb +0 -3
  47. data/lib/tensor_stream/evaluator/opencl/kernels/exp.cl +0 -8
  48. data/lib/tensor_stream/evaluator/opencl/kernels/floor.cl +0 -8
  49. data/lib/tensor_stream/evaluator/opencl/kernels/floor_div.cl +0 -48
  50. data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +0 -3
  51. data/lib/tensor_stream/evaluator/opencl/kernels/gemm.cl +0 -32
  52. data/lib/tensor_stream/evaluator/opencl/kernels/log.cl +0 -8
  53. data/lib/tensor_stream/evaluator/opencl/kernels/log1p.cl +0 -8
  54. data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +0 -26
  55. data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +0 -46
  56. data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +0 -46
  57. data/lib/tensor_stream/evaluator/opencl/kernels/mod.cl +0 -3
  58. data/lib/tensor_stream/evaluator/opencl/kernels/mul.cl +0 -3
  59. data/lib/tensor_stream/evaluator/opencl/kernels/negate.cl +0 -8
  60. data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +0 -24
  61. data/lib/tensor_stream/evaluator/opencl/kernels/pow.cl +0 -46
  62. data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +0 -3
  63. data/lib/tensor_stream/evaluator/opencl/kernels/reciprocal.cl +0 -8
  64. data/lib/tensor_stream/evaluator/opencl/kernels/round.cl +0 -8
  65. data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid.cl +0 -9
  66. data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid_grad.cl +0 -55
  67. data/lib/tensor_stream/evaluator/opencl/kernels/sign.cl +0 -21
  68. data/lib/tensor_stream/evaluator/opencl/kernels/sin.cl +0 -9
  69. data/lib/tensor_stream/evaluator/opencl/kernels/softmax.cl +0 -26
  70. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +0 -32
  71. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +0 -28
  72. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_grad.cl +0 -46
  73. data/lib/tensor_stream/evaluator/opencl/kernels/sqrt.cl +0 -9
  74. data/lib/tensor_stream/evaluator/opencl/kernels/square.cl +0 -9
  75. data/lib/tensor_stream/evaluator/opencl/kernels/squared_difference.cl +0 -53
  76. data/lib/tensor_stream/evaluator/opencl/kernels/sub.cl +0 -3
  77. data/lib/tensor_stream/evaluator/opencl/kernels/tan.cl +0 -8
  78. data/lib/tensor_stream/evaluator/opencl/kernels/tanh.cl +0 -8
  79. data/lib/tensor_stream/evaluator/opencl/kernels/tanh_grad.cl +0 -7
  80. data/lib/tensor_stream/evaluator/opencl/kernels/where.cl +0 -8
  81. data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +0 -35
  82. data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +0 -5
  83. data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +0 -1230
  84. data/lib/tensor_stream/evaluator/opencl/opencl_template_helper.rb +0 -95
@@ -1,3 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- % op = operator_to_c('sub')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = tan(A[globalRow * N + globalCol]);
8
- }
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = tanh(A[globalRow * N + globalCol]);
8
- }
@@ -1,7 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
- C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
7
- }
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void where_<%= dtype %>(const int M, const int N, __global const short *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
8
- }
@@ -1,35 +0,0 @@
1
- module TensorStream
2
- # Buffer used by the OpenCL evaluator
3
- class OpenCLBuffer < Buffer
4
- include ArrayOpsHelper
5
-
6
- attr_accessor :shape, :buffer, :cl_buffer, :op
7
-
8
- def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
9
- @data_type = data_type
10
- @shape = shape
11
- @buffer = buffer
12
- @cl_buffer = cl_buffer
13
- @name = name
14
- @op = op
15
- end
16
-
17
- def to_ruby
18
- return [] if buffer.empty?
19
-
20
- if dirty
21
- op.command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
22
- op.command_queue.finish
23
- self.dirty = false
24
- end
25
-
26
- if shape.empty?
27
- return buffer[0] != 0 if data_type == :boolean
28
- return buffer[0]
29
- end
30
-
31
- result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
32
- data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
33
- end
34
- end
35
- end
@@ -1,5 +0,0 @@
1
- module TensorStream
2
- class OpenclDevice < TensorStream::Device
3
- attr_accessor :native_device
4
- end
5
- end
@@ -1,1230 +0,0 @@
1
- require 'tensor_stream/evaluator/operation_helpers/random_gaussian'
2
- require 'tensor_stream/evaluator/operation_helpers/array_ops_helper'
3
- require 'tensor_stream/evaluator/operation_helpers/math_helper'
4
- require 'tensor_stream/evaluator/opencl/opencl_buffer'
5
- require 'tensor_stream/evaluator/opencl/opencl_template_helper'
6
- require 'tensor_stream/evaluator/opencl/opencl_device'
7
- require 'opencl_ruby_ffi'
8
- require 'narray_ffi'
9
- require 'tensor_stream/evaluator/base_evaluator'
10
-
11
- module TensorStream
12
- module Evaluator
13
- class FullEvalNotPossible < RuntimeError
14
- end
15
-
16
- # Errors during graph evaluation
17
- class EvaluatorExcecutionException < RuntimeError
18
- attr_reader :tensor
19
-
20
- def initialize(exception, tensor)
21
- @exception = exception
22
- @tensor = tensor
23
- end
24
-
25
- def wrapped_exception
26
- @exception
27
- end
28
- end
29
-
30
- ## PURE ruby evaluator used for testing and development
31
- class OpenclEvaluator < BaseEvaluator
32
- attr_accessor :retain
33
- attr_reader :opencl_device
34
-
35
- include TensorStream::OpHelper
36
- include TensorStream::ArrayOpsHelper
37
- include TensorStream::MathHelper
38
-
39
- def initialize(session, device, thread_pool: nil, log_intermediates: false)
40
- super
41
- _create_opencl_context(device.native_device)
42
- @opencl_device = device.native_device
43
- create_command_queue
44
- end
45
-
46
- def self.query_supported_devices
47
- devices = query_devices_with_score
48
- devices.sort { |a| a[1] }.reverse.map do |d|
49
- opencl_to_device(d)
50
- end
51
- end
52
-
53
- def self.fetch_device(query = [])
54
- devices = query_devices_with_score
55
- platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
56
- opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
57
- end
58
-
59
- def self.opencl_to_device(d)
60
- device = d[0]
61
- index = d[3]
62
- platform_name = device.platform.name.tr(' ', '_').downcase
63
- uri = [platform_name, index].join(':')
64
-
65
- device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
66
-
67
- OpenclDevice.new(uri, device_type, self).tap do |devide|
68
- devide.native_device = device
69
- end
70
- end
71
-
72
- ##
73
- # Select the best device available in the system for this evaluator
74
- def self.default_device
75
- devices = OpenclEvaluator.query_devices_with_score
76
- device = devices.sort { |a| a[1] }.reverse.first
77
- opencl_to_device(device)
78
- end
79
-
80
- # opencl evaluator main entrypoint
81
- def run(tensor, execution_context)
82
- read_final_result(complete_eval(tensor, execution_context))
83
- end
84
-
85
- def run_with_buffer(tensor, context, execution_context)
86
- @context = context
87
- @context[:_cache][:_cl_buffers] ||= {} if context[:_cache]
88
-
89
- if tensor.is_a?(Array)
90
- tensor.collect do |t|
91
- value = run(t, execution_context)
92
- Buffer.new(data_type: t.data_type, buffer: value)
93
- end
94
- else
95
- value = run(tensor, execution_context)
96
- Buffer.new(data_type: tensor.data_type, buffer: value)
97
- end
98
- end
99
-
100
- # buffer comes from non-opencl evaluator
101
- def convert_from_buffer(tensor, result)
102
- if result.buffer.is_a?(TensorStream::Evaluator::OutputGroup)
103
- converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map { |output, data_type| convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name) }
104
- TensorStream::Evaluator::OutputGroup.new(converted_outputs, result.buffer.data_types)
105
- else
106
- convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
107
- end
108
- end
109
-
110
- def enqueue_buffer_read(tensor, context)
111
- buffer = _run(tensor, context)
112
- if buffer.is_a?(Array)
113
- buffer = buffer.collect do |b|
114
- next b if b.buffer.size.zero?
115
- _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
116
- b
117
- end
118
- else
119
- return buffer.outputs[0] if buffer.is_a?(OutputGroup)
120
- return buffer if buffer.nil?
121
- return [] if buffer.buffer.nil?
122
- return buffer if buffer.buffer.size.zero?
123
- _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
124
- buffer
125
- end
126
- end
127
-
128
- def complete_eval(tensor, context)
129
- buffer = enqueue_buffer_read(tensor, context)
130
- _opencl_queue.finish
131
- buffer
132
- end
133
-
134
- def self.query_devices_with_score
135
- OpenCL.platforms.flat_map do |p|
136
- p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
137
- score = 0
138
- if d.type.to_s == 'CPU'
139
- score += 1
140
- elsif d.type.to_s == 'GPU'
141
- score += 4
142
- end
143
-
144
- score += 1000 if d.platform.name == 'NVIDIA CUDA'
145
-
146
- score += d.max_compute_units
147
- score += d.max_clock_frequency
148
-
149
- [d, score, p.name, index]
150
- end
151
- end
152
- end
153
-
154
- protected
155
-
156
- def prepare_input(tensor, context, options = {})
157
- return nil unless tensor
158
- tensor = resolve_placeholder(tensor)
159
- if options[:noop]
160
- tensor
161
- elsif options[:buffer]
162
- complete_eval(tensor, context)
163
- elsif options[:complete]
164
- read_final_result(complete_eval(tensor, context))
165
- else
166
- _run(tensor, context)
167
- end
168
- end
169
-
170
- # read result from opencl and convert to ruby
171
- def read_final_result(buffer)
172
- return buffer.map { |b| read_final_result(b) } if buffer.is_a?(Array)
173
- return nil if buffer.nil?
174
-
175
- buffer.to_ruby
176
- end
177
-
178
- def _create_opencl_context(opencl_device)
179
- @opencl_context = OpenCL.create_context(opencl_device)
180
- end
181
-
182
- def create_command_queue
183
- supported_proprties = opencl_device.queue_properties.names
184
-
185
- properties = []
186
- properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
187
- properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
188
- @command_queue = _opencl_context.create_command_queue(opencl_device, properties: properties)
189
- end
190
-
191
- def _opencl_context
192
- @opencl_context
193
- end
194
-
195
- def _opencl_queue
196
- @command_queue
197
- end
198
-
199
- def cl_template_path(kernel, extension)
200
- File.join(File.dirname(__FILE__), 'kernels', "#{kernel}.#{extension}")
201
- end
202
-
203
- def _cl_program(kernel, args = {})
204
- suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
205
- @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
206
- filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
207
- raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
208
- source = File.read(filename)
209
- source = OpenclTemplateHelper.new(source).generate(args)
210
- # File.write("/tmp/#{kernel}.#{suffix}.cl", source)
211
- program = _opencl_context.create_program_with_source(source)
212
- program.build
213
- rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
214
- puts "OpenCL Compile error: #{program.build_log}"
215
- raise e
216
- end
217
- end
218
-
219
- def escape_arg_content(value)
220
- return value.tr(' ','_') if value.is_a?(String)
221
- return value.join('-') if value.is_a?(Array)
222
-
223
- value
224
- end
225
-
226
- def _run(tensor, execution_context)
227
- return tensor if tensor.is_a?(OpenCLBuffer)
228
- return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array) && !tensor.size.empty? && tensor[0].is_a?(Tensor)
229
-
230
- tensor = tensor.call if tensor.is_a?(Proc)
231
-
232
- child_context = execution_context.dup
233
- res = if tensor.is_a?(Operation)
234
- if !self.class.ops.include?(tensor.operation.to_sym)
235
- result = @session.delegate_to_evaluator(tensor, @context, execution_context)
236
- convert_from_buffer(tensor, result)
237
- else
238
- eval_operation(tensor, child_context)
239
- end
240
- elsif tensor.is_a?(Variable)
241
- eval_variable(tensor, child_context)
242
- elsif tensor.is_a?(Placeholder)
243
- resolve_placeholder(tensor, child_context)
244
- else
245
- eval_tensor(tensor, child_context)
246
- end
247
- execution_context.deep_merge!(returns: child_context[:returns])
248
- res
249
- end
250
-
251
- def eval_variable(tensor, _child_context)
252
- raise "variable #{tensor.name} not initalized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
253
- tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
254
- tensor.buffer
255
- end
256
-
257
- register_op :no_op do |_context, _tensor, _inputs|
258
- end
259
-
260
- register_op :log do |context, tensor, inputs|
261
- execute_func('log', tensor, inputs[0], context)
262
- end
263
-
264
- register_op :cond, noop: true do |context, tensor, inputs|
265
- pred = complete_eval(tensor.options[:pred], context)
266
-
267
- if all_true?(pred.buffer)
268
- complete_eval(inputs[0], context)
269
- else
270
- complete_eval(inputs[1], context)
271
- end
272
- end
273
-
274
- register_op :identity do |context, tensor, inputs|
275
- if tensor.inputs.size > 1
276
- tensor.inputs[1..inputs.size].each { |input| complete_eval(input, context) }
277
- end
278
- inputs[0]
279
- end
280
-
281
- register_op :assign, noop: true do |context, tensor, inputs|
282
- assign_var(tensor, inputs[1], context)
283
- end
284
-
285
- register_op :assign_add do |context, tensor, inputs|
286
- value = execute_2_operand_func('add', tensor, inputs[0], inputs[1], context)
287
- assign_var(tensor, value, context)
288
- end
289
-
290
- register_op :assign_sub do |context, tensor, inputs|
291
- value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1], context)
292
- assign_var(tensor, value, context)
293
- end
294
-
295
- register_op :variable, noop: true do |context, tensor, inputs|
296
- variable = tensor.inputs[0]
297
- raise "variable #{tensor.name} not initalized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
298
- variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
299
- variable.buffer
300
- end
301
-
302
- # Fast in place multiply subtract assign
303
- register_op :apply_gradient_descent do |_context, tensor, inputs|
304
- _target_var, learning_rate, delta = inputs
305
-
306
- assign = tensor.inputs[0] || tensor
307
-
308
- assign.buffer.dirty = true # force buffer copy when variable is read externally
309
- output_buffer = assign.buffer
310
-
311
- m, n = output_buffer.shape
312
- work_group = [m || 1, n || 1]
313
- cl_m = OpenCL::Int1.new(m || 1)
314
- cl_n = OpenCL::Int1.new(n || 1)
315
-
316
- event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
317
- method_call = :"apply_gradient_#{output_buffer.data_type}"
318
- event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
319
- output_buffer.op = event
320
- output_buffer
321
- end
322
-
323
- # Fast in place multiply subtract assign
324
- register_op :apply_momentum do |_context, tensor, inputs|
325
- target_var, momentum_var, learning_rate, grad, momentum = inputs
326
-
327
- assign = tensor.inputs[0] || tensor
328
- assign_acc = tensor.inputs[1]
329
- assign.buffer.dirty = true # force buffer copy when variable is read externally
330
- assign_acc.buffer.dirty = true # force buffer copy when variable is read externally
331
-
332
- output_buffer = assign.buffer
333
-
334
- m, n = output_buffer.shape
335
- work_group = [m || 1, n || 1]
336
- cl_m = OpenCL::Int1.new(m || 1)
337
- cl_n = OpenCL::Int1.new(n || 1)
338
-
339
- event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
340
- method_call = :"apply_momentum_#{output_buffer.data_type}"
341
- event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
342
- send(method_call, _opencl_queue, work_group, cl_m, cl_n, grad.cl_buffer,
343
- learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
344
- assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
345
- output_buffer.op = event
346
- assign_acc.buffer.op = event
347
- output_buffer
348
- end
349
-
350
- # Adam optimization algorithm
351
- register_op :apply_adam do |_context, tensor, inputs|
352
- _target_var, _m, _v, beta1_power, beta2_power, lr_t, beta1_t, beta2_t, epsilon_t, grad = inputs
353
-
354
- assign = tensor.inputs[0] || tensor
355
- assign_m = tensor.inputs[1]
356
- assign_v = tensor.inputs[2]
357
-
358
- # mark variable buffers as dirty
359
- assign.buffer.dirty = true # force buffer copy when variable is read externally
360
- assign_m.buffer.dirty = true # force buffer copy when variable is read externally
361
- assign_v.buffer.dirty = true # force buffer copy when variable is read externally
362
-
363
- output_buffer = assign.buffer
364
-
365
- m, n = output_buffer.shape
366
- work_group = [m || 1, n || 1]
367
- cl_m = OpenCL::Int1.new(m || 1)
368
- cl_n = OpenCL::Int1.new(n || 1)
369
-
370
- event_wait_list = build_event_wait_list(inputs)
371
- method_call = :"apply_adam_#{output_buffer.data_type}"
372
- event = _cl_program("apply_adam", dtype: output_buffer.data_type)
373
- .send(method_call, _opencl_queue, work_group, cl_m, cl_n,
374
- grad.cl_buffer,
375
- lr_t.cl_buffer,
376
- beta1_power.cl_buffer,
377
- beta2_power.cl_buffer,
378
- beta1_t.cl_buffer,
379
- beta2_t.cl_buffer,
380
- epsilon_t.cl_buffer,
381
- assign_m.buffer.cl_buffer,
382
- assign.buffer.cl_buffer,
383
- assign_v.buffer.cl_buffer,
384
- event_wait_list: event_wait_list)
385
- output_buffer.op = event
386
- assign_m.buffer.op = event
387
- assign_v.buffer.op = event
388
- output_buffer
389
- end
390
-
391
- %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
392
- register_op op, noop: true do |context, tensor, inputs|
393
- execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
394
- end
395
- end
396
-
397
- %i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
398
- register_op op, noop: true do |context, tensor, inputs|
399
- execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
400
- end
401
- end
402
-
403
- register_op :add_n do |_context, tensor, inputs|
404
- if inputs.size == 1
405
- inputs[0]
406
- else
407
- m, n = inputs[0].shape
408
- work_group = [m || 1, n || 1]
409
- cl_m = OpenCL::Int1.new(m || 1)
410
- cl_n = OpenCL::Int1.new(n || 1)
411
- cl_switch = OpenCL::Int1.new(0)
412
- dtype = tensor.data_type
413
-
414
- output_buffer = _create_result_buffer(tensor.data_type, inputs[0].shape, "out_#{tensor.name}")
415
- inputs_queue = inputs.dup
416
- a = inputs_queue.pop
417
- until inputs_queue.empty?
418
- b = inputs_queue.pop
419
- event_wait_list = build_event_wait_list([a, b])
420
- method_call = :"add_#{a.data_type}_#{b.data_type}"
421
- event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
422
- a = output_buffer
423
- a.op = event
424
- end
425
-
426
- output_buffer.op = a.op
427
- output_buffer
428
- end
429
- end
430
-
431
- register_op :expand_dims, buffer: true do |_context, tensor, inputs|
432
- axis = inputs[1].buffer[0]
433
- shape = inputs[0].shape.dup
434
- axis = -axis if axis == shape.size
435
- new_shape = shape.insert(axis, 1).compact
436
- new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
437
- convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
438
- end
439
-
440
- register_op :fill, buffer: true do |_context, tensor, inputs|
441
- shape = inputs[0]
442
- value = inputs[1]
443
-
444
- narray_size = shape.buffer.to_a.reduce(:*) || 1
445
- cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
446
-
447
- buffer = if cl_buffer
448
- cl_buffer.buffer
449
- else
450
- allocate_narray_for_type(tensor.data_type, narray_size)
451
- end
452
-
453
- buffer.fill!(value.buffer[0])
454
- convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
455
- end
456
-
457
- register_op :floor_div, noop: true do |context, tensor, inputs|
458
- if fp_type?(tensor.data_type)
459
- execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
460
- else
461
- execute_2_operand_func('div', tensor, inputs[0], inputs[1], context)
462
- end
463
- end
464
-
465
- register_op :where, noop: true do |context, tensor, inputs|
466
- pred = tensor.options[:pred]
467
- execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
468
- end
469
-
470
- register_op :mat_mul do |_context, tensor, inputs|
471
- a, b = inputs
472
-
473
- m = a.shape[0]
474
- n = b.shape[1]
475
- v = b.shape[0]
476
- k = a.shape[1]
477
-
478
- if tensor.options[:transpose_a]
479
- m = a.shape[1]
480
- k = a.shape[0]
481
- end
482
-
483
- if tensor.options[:transpose_b]
484
- n = b.shape[0]
485
- v = b.shape[1]
486
- end
487
-
488
- result_shape = [m, n]
489
-
490
- raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
491
- raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
492
- raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
493
-
494
- dtype = tensor.data_type
495
- a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
496
- output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
497
-
498
- cl_m = OpenCL::Int1.new(m)
499
- cl_n = OpenCL::Int1.new(n)
500
- cl_k = OpenCL::Int1.new(k)
501
-
502
- transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
503
- transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
504
- event_wait_list = build_event_wait_list(inputs)
505
- output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
506
- output_buffer
507
- end
508
-
509
- register_op :cast do |_context, tensor, inputs|
510
- a = inputs[0]
511
- if a.data_type != tensor.data_type
512
- buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
513
- m, n = a.shape
514
- cl_m = OpenCL::Int1.new(m || 1)
515
- cl_n = OpenCL::Int1.new(n || 1)
516
- work_group = [m || 1, n || 1]
517
- event_wait_list = build_event_wait_list(inputs)
518
- buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
519
- buffer
520
- else
521
- a
522
- end
523
- end
524
-
525
- register_op :stack do |_context, tensor, inputs|
526
- axis = tensor.options[:axis] || 0
527
- shape = inputs[0].shape
528
- rank = shape.size + 1
529
- elem_size = shape.empty? ? 1 : shape.reduce(:*)
530
-
531
- new_shape = [inputs.size]
532
- shape.inject(new_shape) { |ns, s| ns << s }
533
-
534
- divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
535
- a << s * a.last
536
- end.reverse
537
-
538
- axis = rank + axis if axis < 0
539
- rotated_shape = Array.new(axis + 1) { new_shape.shift }
540
- new_shape = rotated_shape.rotate! + new_shape
541
-
542
- output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
543
- multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
544
- a << s * a.last
545
- end.reverse
546
-
547
- cl_n = OpenCL::Int1.new(elem_size)
548
- work_group = [elem_size]
549
- event_wait_list = build_event_wait_list(inputs)
550
- ops = inputs.each_with_index.map do |input, index|
551
- cl_index = OpenCL::Int1.new(index)
552
- _cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
553
- end
554
- output_buffer.op = ops
555
- output_buffer
556
- end
557
-
558
- %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil].each do |op|
559
- register_op op, noop: true do |context, tensor, inputs|
560
- execute_func(op.to_s, tensor, inputs[0], context)
561
- end
562
- end
563
-
564
- register_op :softmax do |_context, tensor, inputs|
565
- a = inputs[0]
566
- event_wait_list = build_event_wait_list(inputs)
567
- dtype = tensor.data_type
568
- output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
569
-
570
- m, n = a.shape
571
- work_group = [m]
572
- n = m if n.nil?
573
- cl_n = OpenCL::Int1.new(n || 1)
574
-
575
- event = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
576
- output_buffer.op = event
577
- output_buffer
578
- end
579
-
580
- register_op :log_softmax do |_context, tensor, inputs|
581
- a = inputs[0] # logits
582
- event_wait_list = build_event_wait_list(inputs)
583
- dtype = tensor.data_type
584
- output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
585
-
586
- m, n = a.shape
587
- work_group = [m]
588
- n = m if n.nil?
589
- cl_n = OpenCL::Int1.new(n || 1)
590
-
591
- event = _cl_program("log_softmax", dtype: dtype).send(:"log_softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
592
- output_buffer.op = event
593
- output_buffer
594
- end
595
-
596
# :softmax_cross_entropy_with_logits_v2 — a single "softmax_cross" kernel launch fills
# two buffers (loss terms and backprop); the loss terms are then reduced with :sum and
# both results are returned as an OutputGroup.
register_op :softmax_cross_entropy_with_logits_v2 do |context, tensor, inputs|
  logits = inputs[0]
  labels = inputs[1]
  wait_events = build_event_wait_list(inputs)
  dtype = tensor.data_type
  loss_terms = _create_result_buffer(tensor.data_type, logits.shape, tensor.name)
  backprop = _create_result_buffer(tensor.data_type, logits.shape, "#{tensor.name}_2")
  reduce_axis = logits.shape.size - 1

  rows, cols = logits.shape
  cols ||= rows # shape has a single dimension: reuse it as the row length
  cl_cols = OpenCL::Int1.new(cols || 1)

  program = _cl_program("softmax_cross", dtype: dtype)
  launch = program.send(:"softmax_cross_#{dtype}", _opencl_queue, [rows], cl_cols,
                        logits.cl_buffer, labels.cl_buffer,
                        loss_terms.cl_buffer, backprop.cl_buffer, event_wait_list: wait_events)
  # Both buffers are produced by the same launch, so they share its event.
  loss_terms.op = launch
  backprop.op = launch

  loss = reduction(context, tensor, loss_terms, reduce_axis, :sum)
  OutputGroup.new([loss, backprop], [tensor.inputs[0].data_type, tensor.inputs[0].data_type])
end
617
-
618
# :softmax_cross_entropy_with_logits_v2_grad — launches the "softmax_cross_grad" program
# with the logits, labels and incoming gradient buffers.
register_op :softmax_cross_entropy_with_logits_v2_grad do |_context, tensor, inputs|
  logits = inputs[0]
  labels = inputs[1]
  grads = inputs[2]
  wait_events = build_event_wait_list(inputs)
  dtype = tensor.data_type
  result = _create_result_buffer(tensor.data_type, logits.shape, tensor.name)

  rows, cols = logits.shape
  cols ||= rows # shape has a single dimension: reuse it as the row length
  cl_cols = OpenCL::Int1.new(cols || 1)

  program = _cl_program("softmax_cross_grad", dtype: dtype)
  result.op = program.send(:"softmax_cross_grad_#{dtype}", _opencl_queue, [rows], cl_cols,
                           logits.cl_buffer, labels.cl_buffer, grads.cl_buffer,
                           result.cl_buffer, event_wait_list: wait_events)
  result
end
635
-
636
# :softmax_grad — launches the "softmax_grad" program; the row length is also passed
# to the program lookup as its :size option.
register_op :softmax_grad do |_context, tensor, inputs|
  activations, grad = inputs

  wait_events = build_event_wait_list(inputs)
  dtype = tensor.data_type
  result = _create_result_buffer(tensor.data_type, activations.shape, tensor.name)

  rows, cols = activations.shape
  cols ||= rows # shape has a single dimension: reuse it as the row length
  cl_cols = OpenCL::Int1.new(cols || 1)

  program = _cl_program('softmax_grad', dtype: dtype, size: cols)
  result.op = program.send(:"softmax_grad_#{dtype}", _opencl_queue, [rows], cl_cols,
                           activations.cl_buffer, grad.cl_buffer, result.cl_buffer,
                           event_wait_list: wait_events)
  result
end
651
-
652
# :check_numerics — fully evaluates the input and raises InvalidArgumentError when any
# element of its host buffer is NaN or infinite; otherwise returns the evaluated input.
register_op :check_numerics, noop: true do |context, tensor, inputs|
  evaluated = complete_eval(inputs[0], context)
  label = tensor.options[:name]

  evaluated.buffer.each do |element|
    next unless element.nan? || element.infinite?

    raise TensorStream::InvalidArgumentError, "#{label} Invalid Argument"
  end

  evaluated
end
661
-
662
# :broadcast_transform — returns the inputs untouched when their shapes already match;
# otherwise reads both back, broadcasts them and wraps the results in new buffers.
register_op :broadcast_transform do |context, tensor, inputs|
  lhs, rhs = inputs
  next [lhs, rhs] if lhs.shape == rhs.shape

  lhs_values = read_final_result(complete_eval(lhs, context))
  rhs_values = read_final_result(complete_eval(rhs, context))
  expanded_lhs, expanded_rhs = broadcast(lhs_values, rhs_values)

  # Both wrapped results use the data type of the first input.
  wrapped_lhs = wrap_opencl(expanded_lhs, data_type: lhs.data_type, name: "#{tensor.name}_a")
  wrapped_rhs = wrap_opencl(expanded_rhs, data_type: lhs.data_type, name: "#{tensor.name}_b")
  [wrapped_lhs, wrapped_rhs]
end
675
-
676
# :print — writes the optional message followed by the evaluated second input to
# stdout and passes the first input through unchanged.
register_op :print do |context, tensor, inputs|
  passthrough, data = inputs
  printable = read_final_result(complete_eval(data, context))
  puts "#{tensor.options.fetch(:message, '')} #{printable}"
  passthrough
end
683
-
684
# :rank — wraps the number of dimensions of the first input.
register_op :rank do |_context, tensor, inputs|
  dimension_count = inputs[0].shape.size
  wrap_opencl(dimension_count, data_type: tensor.data_type, name: tensor.name)
end
687
-
688
# :stop_gradient — forward pass is the identity: hand back the first input as is.
register_op :stop_gradient do |_context, _tensor, inputs|
  passthrough = inputs[0]
  passthrough
end
691
-
692
# :slice — slices on the host: begin offsets come from the second input, extents from
# the :size option. Shapes and ranges are reversed around the NArray calls.
register_op :slice, noop: true do |context, tensor, inputs|
  source = complete_eval(inputs[0], context)
  begins = read_final_result(complete_eval(inputs[1], context))
  extents = tensor.options[:size]

  # One inclusive range per dimension, in reversed dimension order.
  ranges = begins.zip(extents).map { |start, length| start..(start + length - 1) }.reverse

  reshaped = source.buffer.reshape(*source.shape.reverse)
  sliced = reshaped.slice[*ranges]
  convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
end
703
-
704
# :transpose — a rank-2 input without an explicit permutation is transposed through
# NArray; every other case goes through transpose_with_perm and the result is then
# written to the device buffer.
register_op :transpose, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  perm_input = inputs[1]
  rank = source.shape.size
  reversed_axes = (0...rank).to_a.reverse

  if rank == 2 && perm_input.nil?
    transposed = source.buffer.reshape(*source.shape.reverse).transpose(*reversed_axes)
    convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: source.data_type, name: tensor.name)
  else
    perm = perm_input.nil? ? (0...rank).to_a.reverse : perm_input.buffer
    new_shape = perm.map { |p| source.shape[p] }.to_a
    result = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
    transpose_with_perm(source.buffer, result.buffer, source.shape, new_shape, perm)

    result.op = _opencl_queue.enqueue_write_buffer(result.cl_buffer, result.buffer)
    result
  end
end
723
-
724
# :index — picks one entry out of an OutputGroup, an Array, or (for anything else)
# the read-back contents of a buffer, dropping the leading dimension of its shape.
register_op :index, noop: true do |context, tensor, inputs|
  source = _run(inputs[0], context)
  position = read_final_result(_run(inputs[1], context))

  case source
  when OutputGroup
    source.outputs[position]
  when Array
    source[position]
  else
    remaining_shape = source.shape.drop(1)
    host_values = read_final_result(source)
    convert_to_opencl(host_values[position], remaining_shape, data_type: source.data_type, name: tensor.name)
  end
end
739
-
740
# :broadcast_gradient_args — computes the two reduction index lists on the host and
# returns them as int32 buffers grouped in an OutputGroup.
register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
  shape_x = inputs[0].buffer.to_a
  shape_y = inputs[1].buffer.to_a
  rx, ry = get_broadcast_gradient_args(shape_x, shape_y)

  wrapped_rx = wrap_opencl(rx, data_type: :int32, name: tensor.name)
  wrapped_ry = wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")
  OutputGroup.new([wrapped_rx, wrapped_ry], tensor.inputs.map(&:data_type))
end
744
-
745
# :shape — wraps the shape array of the first input.
register_op :shape do |_context, tensor, inputs|
  dimensions = inputs[0].shape
  wrap_opencl(dimensions, name: tensor.name, data_type: tensor.data_type)
end
748
-
749
# :reshape — re-wraps the same host data under a new shape. An empty target shape is
# used as given when the source holds exactly one element; anything else is passed
# through TensorShape.fix_inferred_elements together with the element count.
register_op :reshape, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  requested = read_final_result(inputs[1])

  to_scalar = requested.size.zero? && source.buffer.size == 1
  target_shape = to_scalar ? requested : TensorShape.fix_inferred_elements(requested, source.buffer.size)

  convert_to_opencl(source.buffer, target_shape, data_type: source.data_type, name: tensor.name)
end
761
-
762
# :flow_group — calls finish on the OpenCL command queue; the op itself yields nil.
register_op :flow_group do |_context, _tensor, _inputs|
  _opencl_queue.finish
  nil
end
766
-
767
# :size — wraps the element count of the first input's host buffer; the result type
# is the :out_type option, falling back to :int32.
register_op :size do |_context, tensor, inputs|
  element_count = inputs[0].buffer.size
  result_type = tensor.options[:out_type] || :int32
  wrap_opencl(element_count, name: tensor.name, data_type: result_type)
end
770
-
771
# :sum and :mean share one implementation: both delegate to `reduction`, passing the
# op name as the reducing function.
[:sum, :mean].each do |reducer|
  register_op(reducer, noop: true) do |context, tensor, inputs|
    reduction(context, tensor, inputs[0], inputs[1], reducer.to_sym)
  end
end
776
-
777
# :prod — an empty input buffer yields the constant 1.0 with an empty shape;
# everything else goes through `reduction` with :prod.
register_op :prod, noop: true do |context, tensor, inputs|
  evaluated = complete_eval(inputs[0], context)
  next convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name) if evaluated.buffer.empty?

  reduction(context, tensor, inputs[0], inputs[1], :prod)
end
786
-
787
# :argmin — validates the :axis option (default 0) against the input rank, then
# searches on the host with a strict "less than" comparison.
register_op :argmin, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  axis = tensor.options[:axis] || 0
  rank = source.shape.size
  out_of_range = axis < -rank || axis >= rank
  raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if out_of_range

  nested = source.buffer.reshape(*source.shape.reverse).to_a
  smaller = ->(a, b) { a < b }
  indexes = get_op_with_axis(nested, axis, 0, source.data_type, smaller)
  convert_to_opencl(indexes, shape_eval(indexes), data_type: tensor.data_type, name: tensor.name)
end
796
-
797
# :argmax — validates the :axis option (default 0) against the input rank, then
# searches on the host with a strict "greater than" comparison.
register_op :argmax, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  axis = tensor.options[:axis] || 0
  rank = source.shape.size
  out_of_range = axis < -rank || axis >= rank
  raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if out_of_range

  nested = source.buffer.reshape(*source.shape.reverse).to_a
  larger = ->(a, b) { a > b }
  indexes = get_op_with_axis(nested, axis, 0, source.data_type, larger)
  convert_to_opencl(indexes, shape_eval(indexes), data_type: tensor.data_type, name: tensor.name)
end
806
-
807
# Evaluates an operation tensor, with two levels of caching keyed by graph, tensor
# name and this evaluator's object_id: @context[:_cache] (written only for constant
# tensors) and @context itself.
#
# When the tensor has a breakpoint, the first two inputs and the result are read back
# and handed to it; in that case the read-back result is what gets cached and logged,
# while the method still returns what `invoke` produced.
#
# Errors are re-raised with the tensor name, math description and source location
# appended; the command queue is finished first in every error path.
def eval_operation(tensor, child_context)
  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
  return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
  return @context[cache_key] if @context.key?(cache_key)

  computed = invoke(tensor, child_context)
  recorded = computed

  if tensor.breakpoint
    first = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
    second = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
    first = read_final_result(complete_eval(first, child_context))
    second = read_final_result(complete_eval(second, child_context))
    recorded = read_final_result(complete_eval(computed, child_context))

    tensor.breakpoint.call(tensor, first, second, recorded)
  end

  if @log_intermediates
    @context[:compute_history] << {
      name: tensor.name,
      type: tensor.data_type,
      shape: shape_eval(recorded),
      source: tensor.source,
      description: tensor.to_math(true, 1),
      value: recorded
    }
  end

  @context[cache_key] = recorded
  @context[:_cache][cache_key] = recorded if tensor.is_const
  computed
rescue EvaluatorExcecutionException, TensorStreamError => e
  _opencl_queue.finish # dump queue
  raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
rescue StandardError => e
  _opencl_queue.finish # dump queue
  puts e.message
  puts e.backtrace.join("\n")

  # Any other failure is wrapped so callers see a single evaluator exception type.
  raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
end
862
-
863
# Evaluates a plain tensor. Non-Tensor arguments are returned unchanged. Results are
# cached in @context, and additionally in @context[:_cache] for constant tensors.
# A tensor whose value is itself a Tensor is run; otherwise it is wrapped directly.
def eval_tensor(tensor, child_context)
  return tensor unless tensor.is_a?(Tensor)

  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
  return @context[cache_key] if @context.key?(cache_key)
  return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]

  evaluated = tensor.value.is_a?(Tensor) ? _run(tensor.value, child_context) : wrap_opencl(tensor, name: tensor.name)
  @context[cache_key] = evaluated
  @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
  @context[cache_key]
end
877
-
878
- private
879
-
880
# Assigns the evaluated value of `b` to the variable behind `tensor` (its first input,
# or the tensor itself when there is none).
#
# If the variable already owns a buffer, the new contents are copied on the device
# (unless both already share the same cl_buffer, in which case only the pending event
# is carried over). Otherwise the value is read back and a fresh buffer is created.
# The buffer is flagged dirty and returned.
def assign_var(tensor, b, child_context)
  target = tensor.inputs[0] || tensor
  incoming = complete_eval(b, child_context)

  if target.buffer
    wait_events = build_event_wait_list([incoming, target.buffer])
    pending = if target.buffer.cl_buffer != incoming.cl_buffer
                _opencl_queue.enqueue_copy_buffer(incoming.cl_buffer, target.buffer.cl_buffer, event_wait_list: wait_events)
              else
                incoming.op
              end
    target.buffer.op = pending
  else
    host_value = read_final_result(incoming)
    target.buffer = convert_to_opencl(host_value, incoming.shape, data_type: tensor.data_type, name: target.name)
    target.value = host_value
  end

  target.buffer.dirty = true
  target.buffer
end
900
-
901
# Runs a two-operand OpenCL program.
#
# Both inputs are evaluated and type-aligned, the result shape is inferred, and
# `select_program` picks the program variant and operand order. The "<op>_b" variant
# additionally receives the dimensions of the second operand (rank 1 or 2 only).
# `prog_name` overrides the program source name and defaults to `op_name`.
# Returns the output buffer with the kernel event attached.
def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
  lhs = _run(input_a, child_context)
  rhs = _run(input_b, child_context)
  lhs, rhs = auto_type_cast(lhs, rhs, name: "#{tensor.name}/cast_#{lhs.name}_#{rhs.data_type}")
  dtype = tensor.data_type
  result_shape = TensorShape.infer_shape(lhs.shape, rhs.shape)
  return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]

  output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
  lhs, rhs, prog, switch_operands = select_program(lhs, rhs, op_name)
  m, n = result_shape
  work_group = [m || 1, n || 1]
  cl_m = OpenCL::Int1.new(m || 1)
  cl_n = OpenCL::Int1.new(n || 1)
  cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition

  event_wait_list = build_event_wait_list([lhs, rhs]) # add dependency wait list

  method_call = :"#{prog}_#{lhs.data_type}_#{rhs.data_type}"
  prog_name ||= op_name

  # Extra kernel arguments, only needed by the "<op>_b" variant.
  rhs_dims = if prog == "#{op_name}_b"
               case rhs.shape.size
               when 2
                 [OpenCL::Int1.new(rhs.shape[0]), OpenCL::Int1.new(rhs.shape[1])]
               when 1
                 [OpenCL::Int1.new(1), OpenCL::Int1.new(rhs.shape[0])]
               else
                 raise "rank > 2 not supported!"
               end
             else
               []
             end

  program = _cl_program(prog_name, a: lhs.data_type, b: rhs.data_type, dtype: dtype)
  output_buffer.op = program.send(method_call, _opencl_queue, work_group, cl_m, cl_n, *rhs_dims,
                                  cl_switch, lhs.cl_buffer, rhs.cl_buffer, output_buffer.cl_buffer,
                                  event_wait_list: event_wait_list)
  output_buffer
end
940
-
941
# Runs a conditional (predicate, a, b) OpenCL program. The output buffer takes the
# predicate's shape; the kernel waits on the events of all three inputs.
def execute_cond_func(op_name, tensor, pred, input_a, input_b, child_context)
  condition = _run(pred, child_context)
  on_true = _run(input_a, child_context)
  on_false = _run(input_b, child_context)

  on_true, on_false = auto_type_cast(on_true, on_false, name: "#{tensor.name}/cast_#{on_true.name}_#{on_false.data_type}")
  dtype = tensor.data_type

  output_buffer = _create_result_buffer(tensor.data_type, condition.shape, tensor.name)

  rows, cols = condition.shape
  work_group = [rows || 1, cols || 1]
  cl_rows = OpenCL::Int1.new(rows || 1)
  cl_cols = OpenCL::Int1.new(cols || 1)

  wait_events = build_event_wait_list([on_true, on_false, condition]) # add dependency wait list
  program = _cl_program(op_name.to_s, dtype: dtype)
  output_buffer.op = program.send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_rows, cl_cols,
                                  condition.cl_buffer, on_true.cl_buffer, on_false.cl_buffer,
                                  output_buffer.cl_buffer, event_wait_list: wait_events)
  output_buffer
end
960
-
961
# Runs a single-operand OpenCL program named after `op_name` over the evaluated input
# and returns an output buffer of the same shape.
def execute_func(op_name, tensor, a, child_context)
  operand = _run(a, child_context)
  wait_events = build_event_wait_list([operand])
  dtype = tensor.data_type
  output_buffer = _create_result_buffer(tensor.data_type, operand.shape, tensor.name)

  rows, cols = operand.shape
  work_group = [rows || 1, cols || 1]
  cl_rows = OpenCL::Int1.new(rows || 1)
  cl_cols = OpenCL::Int1.new(cols || 1)

  program = _cl_program(op_name.to_s, dtype: dtype)
  output_buffer.op = program.send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_rows, cl_cols,
                                  operand.cl_buffer, output_buffer.cl_buffer, event_wait_list: wait_events)
  output_buffer
end
976
-
977
# Returns [a, b] untouched when their data types match; otherwise runs the "cast"
# program on b and returns [a, cast_buffer].
#
# NOTE(review): the result buffer is allocated with b's data type and the program is
# requested with source_dt = a's type / target_dt = b's type. Whether that is the
# intended direction depends on the cast kernel template — confirm before changing.
def auto_type_cast(a, b, name: nil)
  return [a, b] if a.data_type == b.data_type

  rows, cols = b.shape
  work_group = [rows || 1, cols || 1]
  wait_events = build_event_wait_list([b])
  cast_buffer = _create_result_buffer(b.data_type, b.shape, name)

  cl_rows = OpenCL::Int1.new(rows || 1)
  cl_cols = OpenCL::Int1.new(cols || 1)

  program = _cl_program("cast", source_dt: a.data_type, target_dt: b.data_type)
  cast_buffer.op = program.cast(_opencl_queue, work_group, cl_rows, cl_cols,
                                b.cl_buffer, cast_buffer.cl_buffer, event_wait_list: wait_events)
  [a, cast_buffer]
end
990
-
991
# Casts `source` to `data_type` with the "cast" program. Returns `source` itself when
# it already has that type, otherwise a new buffer carrying the kernel event.
def type_cast(source, data_type, name: nil)
  return source if source.data_type == data_type

  rows, cols = source.shape
  work_group = [rows || 1, cols || 1]
  wait_events = [source.op].compact # a source without a pending event waits on nothing
  cast_buffer = _create_result_buffer(data_type, source.shape, name)

  cl_rows = OpenCL::Int1.new(rows || 1)
  cl_cols = OpenCL::Int1.new(cols || 1)

  program = _cl_program("cast", source_dt: source.data_type, target_dt: data_type)
  cast_buffer.op = program.cast(_opencl_queue, work_group, cl_rows, cl_cols,
                                source.cl_buffer, cast_buffer.cl_buffer, event_wait_list: wait_events)
  cast_buffer
end
1004
-
1005
# Wraps either a Tensor (using its value and declared shape) or a raw Ruby value
# (shape derived with shape_eval) into an OpenCL buffer. `data_type` falls back to
# the argument's own data_type when not given.
def wrap_opencl(tensor, data_type: nil, name: nil)
  if tensor.is_a?(Tensor)
    value = tensor.value
    shape = tensor.shape.shape
  else
    value = tensor
    shape = shape_eval(tensor)
  end

  convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
end
1014
-
1015
# Looks up a previously cached buffer object for the given name and shape in
# @context[:_cache]; returns nil when there is no entry.
def get_cached_buffer(name, shape)
  shape_tag = shape.join('_')
  @context[:_cache]["_cl_object_#{name}:#{shape_tag}:#{object_id}"]
end
1019
-
1020
# Turns a Ruby value, (nested) Array or NArray into an OpenCLBuffer and enqueues a
# write of its host data to the device.
#
# value     - scalar, Array or NArray; scalars are wrapped into a one-element Array
# shape     - Array of dimensions ([] for a scalar)
# data_type - element type used for allocation and for casting Array elements
# name      - when given, an existing cache entry for name + shape is reused instead
#             of allocating a new buffer object
#
# Returns the OpenCLBuffer, or nil when no host array can be allocated for
# `data_type` (allocate_narray_for_type returned nil).
def convert_to_opencl(value, shape, data_type: nil, name: nil)
  # From here on `value` is always an Array or an NArray.
  value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)

  cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
  cl_object = if name && @context[:_cache][cache_key]
                @context[:_cache][cache_key]
              else
                narray_size = shape.reduce(:*) || 1
                buffer = value.is_a?(NArray) ? value : allocate_narray_for_type(data_type, narray_size)
                return nil if buffer.nil?

                cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)

                # No device buffer is created for empty data.
                cl_buffer = unless value.flatten.empty?
                              cl_buffer_size = 1 if cl_buffer_size.zero?
                              _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
                            end

                @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
              end

  if value.is_a?(Array)
    value.flatten.each_with_index do |element, index|
      cl_object.buffer[index] = if element.is_a?(Tensor)
                                  read_final_result(complete_eval(element, {}))
                                elsif data_type == :boolean
                                  element ? 1 : 0
                                else
                                  Tensor.cast_dtype(element, data_type)
                                end
    end
  else
    # NArray input: use it directly as the host buffer.
    cl_object.buffer = value
  end

  # Only enqueue a write when there is a device buffer and the data is non-empty;
  # otherwise the buffer's op is reset to nil.
  has_data = !value.is_a?(Array) || !value.empty?
  write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer) if cl_object.cl_buffer && has_data
  cl_object.op = write_op
  cl_object
end
1069
-
1070
# Allocates a host NArray of `narray_size` elements for the given data type.
# Returns nil for :unknown and raises for any type not listed below.
def allocate_narray_for_type(data_type, narray_size)
  constructor = case data_type
                when :float, :float32 then :sfloat
                when :float64 then :float
                when :int, :int32, :int64 then :int
                when :int16, :boolean then :sint
                when :unknown then return nil
                else raise "unsupported type #{data_type}"
                end

  NArray.public_send(constructor, narray_size)
end
1088
-
1089
# Returns a result buffer for the given type, shape and name. Shape [0] yields a
# buffer object without host or device storage; every other shape is allocated once
# and memoized in @context[:_cache][:_cl_buffers].
def _create_result_buffer(data_type, shape, name)
  return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]

  pool = @context[:_cache][:_cl_buffers]
  pool_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
  pool[pool_key] ||= begin
    element_count = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
    host_buffer = allocate_narray_for_type(data_type, element_count)
    device_buffer = _opencl_context.create_buffer(host_buffer.size * host_buffer.element_size)
    OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: host_buffer, cl_buffer: device_buffer, name: name)
  end
end
1098
-
1099
# Finds, along `target_axis` of the nested array `a`, the index of the element that
# wins under `op` (default: greater-than, i.e. argmax). Ties keep the earliest index.
# Levels above the target axis are recursed into; at the target axis a nested level
# is scanned column by column. Indexes are cast with Tensor.cast_dtype(_, output_type).
def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
  unless target_axis == current_axis
    return a.map { |row| get_op_with_axis(row, target_axis, current_axis + 1, output_type, op) }
  end

  # Index of the winning element in a flat sequence of values.
  winner_index = lambda do |values|
    best = nil
    best_index = 0
    values.each_with_index do |candidate, position|
      next unless best.nil? || op.call(candidate, best)

      best = candidate
      best_index = position
    end
    Tensor.cast_dtype(best_index, output_type)
  end

  return winner_index.call(a) unless a[0].is_a?(Array)

  (0...a[0].size).map do |column_index|
    winner_index.call(a.map { |row| row[column_index] })
  end
end
1131
-
1132
# Shape after a keepdims reduction: every reduced axis becomes 1. A nil `axes` means
# "reduce to scalar" and yields []. `input_shape` is modified in place and returned,
# so callers that need the original must pass a copy.
def _reduced_shape(input_shape, axes)
  return [] if axes.nil? # reduce to scalar

  axis_list = axes.is_a?(Array) ? axes : [axes]
  axis_list.each { |axis| input_shape[axis] = 1 }
  input_shape
end
1142
-
1143
# Host-side reduction of `a` with the NArray method named by `func`.
#
# `b` is the axis specification (a Tensor is evaluated first). With no axis the whole
# buffer is reduced to a scalar. Otherwise the buffer is reshaped with reversed
# dimensions and reduced axis by axis; tensor axes are mapped to NArray axes as
# `last_index - axis.abs`. A rank-0 input is returned unchanged. With the :keepdims
# option the result shape comes from _reduced_shape.
def reduction(child_context, tensor, a, b, func)
  input = complete_eval(a, child_context)
  axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b

  if axis.nil?
    total = input.buffer.send(func)
    return convert_to_opencl(total, [], data_type: tensor.data_type, name: tensor.name)
  end

  return input if input.shape.empty?

  reduced = input.buffer.reshape(*input.shape.reverse)
  last_index = input.shape.size - 1

  if axis.is_a?(Array)
    # Reduce the highest NArray axis first so the remaining indexes stay valid.
    narray_axes = axis.map { |x| last_index - x.abs }.sort.reverse
    narray_axes.each { |x| reduced = reduced.send(func, x.to_i) }
  else
    reduced = reduced.send(func, last_index - axis.abs)
  end

  if reduced.is_a?(NArray)
    new_shape = reduced.shape.reverse
  else
    # The reduction collapsed to a plain number.
    reduced = [reduced]
    new_shape = []
  end

  new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]

  convert_to_opencl(reduced.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
end
1174
-
1175
# Selects the variant of a cl program depending on the input shapes.
#
# Returns [first_operand, second_operand, program_name, switched] where `switched`
# is 1 when the operands were swapped relative to the call and 0 otherwise:
#   * identical shapes                 -> "<op>"
#   * one operand has a single element -> "<op>_c", with that operand placed second
#   * otherwise                        -> "<op>_b", with the operand of lower rank
#     (or, at equal rank, with a smaller dimension) placed second
def select_program(input_a, input_b, op)
  shape_a = input_a.shape
  shape_b = input_b.shape
  return [input_a, input_b, op.to_s, 0] if shape_a == shape_b

  single_element = ->(shape) { shape.empty? || shape.reduce(:*) == 1 }
  return [input_b, input_a, "#{op}_c", 1] if single_element.call(shape_a) # A is scalar?
  # The original tested input_a here a second time, which could never be true at
  # this point, so a one-element B such as shape [1] missed the "_c" variant.
  return [input_a, input_b, "#{op}_c", 0] if single_element.call(shape_b) # B is scalar?

  return [input_b, input_a, "#{op}_b", 1] if shape_a.size < shape_b.size

  same_rank = shape_a.size == shape_b.size
  return [input_b, input_a, "#{op}_b", 1] if same_rank && shape_a.zip(shape_b).any? { |s1, s2| s1 < s2 }

  [input_a, input_b, "#{op}_b", 0]
end
1192
-
1193
# Rank implied by a shape: the number of entries of an Array shape, 0 for anything else.
def _rank_from_shape(shape)
  return 0 unless shape.is_a?(Array)

  shape.size
end
1196
-
1197
# Collects the `op` of every non-nil input into one flat list (nested lists of
# events are flattened).
def build_event_wait_list(inputs)
  pending = inputs.each_with_object([]) do |input, ops|
    ops << input.op unless input.nil?
  end
  pending.flatten
end
1200
-
1201
# Resolves a Placeholder to the value stored in @context under its name (raising when
# it is missing); any other argument stands for itself. Non-Tensor values are
# converted to an OpenCL buffer, Tensor values go through Tensor.cast_dtype.
# Returns nil for a nil argument.
def resolve_placeholder(placeholder, _execution_context = {})
  return nil if placeholder.nil?

  resolved = placeholder
  if placeholder.is_a?(Placeholder)
    resolved = @context[placeholder.name.to_sym]
    raise "missing placeholder #{placeholder.name}" if resolved.nil?
  end

  if resolved.is_a?(Tensor)
    Tensor.cast_dtype(resolved, placeholder.data_type)
  else
    convert_to_opencl(resolved, shape_eval(resolved), data_type: placeholder.data_type, name: placeholder.name)
  end
end
1215
-
1216
# True when every leaf of a (possibly nested) Array/NArray differs from 0; a
# non-collection argument is compared against 0 directly. Empty collections are true.
def all_true?(arr)
  return arr != 0 unless arr.is_a?(Array) || arr.is_a?(NArray)

  arr.each { |item| return false unless all_true?(item) }
  true
end
1226
- end
1227
- end
1228
- end
1229
-
1230
# Registers the OpenCL evaluator with TensorStream under the name 'opencl'.
# NOTE(review): the trailing 1 is presumably a priority/index — confirm against
# Evaluator.register_evaluator.
TensorStream::Evaluator.register_evaluator(
  TensorStream::Evaluator::OpenclEvaluator,
  'opencl',
  1
)