tensor_stream 0.8.1 → 0.8.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (84) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGELOG.md +8 -0
  4. data/README.md +12 -6
  5. data/lib/tensor_stream.rb +1 -0
  6. data/lib/tensor_stream/evaluator/base_evaluator.rb +1 -1
  7. data/lib/tensor_stream/evaluator/ruby/array_ops.rb +282 -0
  8. data/lib/tensor_stream/evaluator/ruby/images_ops.rb +61 -0
  9. data/lib/tensor_stream/evaluator/ruby/math_ops.rb +111 -0
  10. data/lib/tensor_stream/evaluator/ruby/nn_ops.rb +48 -9
  11. data/lib/tensor_stream/evaluator/ruby/random_ops.rb +51 -0
  12. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -433
  13. data/lib/tensor_stream/images.rb +16 -0
  14. data/lib/tensor_stream/ops.rb +5 -1
  15. data/lib/tensor_stream/session.rb +15 -15
  16. data/lib/tensor_stream/tensor.rb +1 -1
  17. data/lib/tensor_stream/train/adadelta_optimizer.rb +52 -0
  18. data/lib/tensor_stream/train/adam_optimizer.rb +17 -2
  19. data/lib/tensor_stream/train/gradient_descent_optimizer.rb +7 -1
  20. data/lib/tensor_stream/trainer.rb +1 -0
  21. data/lib/tensor_stream/types.rb +4 -0
  22. data/lib/tensor_stream/utils.rb +4 -0
  23. data/lib/tensor_stream/variable_scope.rb +1 -0
  24. data/lib/tensor_stream/version.rb +1 -1
  25. data/samples/linear_regression.rb +4 -1
  26. data/samples/mnist_data.rb +64 -0
  27. data/samples/nearest_neighbor.rb +1 -2
  28. data/samples/raw_neural_net_sample.rb +1 -1
  29. data/tensor_stream.gemspec +1 -0
  30. metadata +23 -57
  31. data/lib/tensor_stream/evaluator/opencl/kernels/_bool_operand.cl +0 -45
  32. data/lib/tensor_stream/evaluator/opencl/kernels/_operand.cl +0 -45
  33. data/lib/tensor_stream/evaluator/opencl/kernels/abs.cl +0 -20
  34. data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +0 -8
  35. data/lib/tensor_stream/evaluator/opencl/kernels/add.cl +0 -3
  36. data/lib/tensor_stream/evaluator/opencl/kernels/apply_adam.cl +0 -23
  37. data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +0 -9
  38. data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +0 -16
  39. data/lib/tensor_stream/evaluator/opencl/kernels/argmax.cl +0 -8
  40. data/lib/tensor_stream/evaluator/opencl/kernels/argmin.cl +0 -8
  41. data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +0 -9
  42. data/lib/tensor_stream/evaluator/opencl/kernels/cast.cl +0 -10
  43. data/lib/tensor_stream/evaluator/opencl/kernels/ceil.cl +0 -8
  44. data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +0 -6
  45. data/lib/tensor_stream/evaluator/opencl/kernels/cos.cl +0 -8
  46. data/lib/tensor_stream/evaluator/opencl/kernels/div.cl.erb +0 -3
  47. data/lib/tensor_stream/evaluator/opencl/kernels/exp.cl +0 -8
  48. data/lib/tensor_stream/evaluator/opencl/kernels/floor.cl +0 -8
  49. data/lib/tensor_stream/evaluator/opencl/kernels/floor_div.cl +0 -48
  50. data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +0 -3
  51. data/lib/tensor_stream/evaluator/opencl/kernels/gemm.cl +0 -32
  52. data/lib/tensor_stream/evaluator/opencl/kernels/log.cl +0 -8
  53. data/lib/tensor_stream/evaluator/opencl/kernels/log1p.cl +0 -8
  54. data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +0 -26
  55. data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +0 -46
  56. data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +0 -46
  57. data/lib/tensor_stream/evaluator/opencl/kernels/mod.cl +0 -3
  58. data/lib/tensor_stream/evaluator/opencl/kernels/mul.cl +0 -3
  59. data/lib/tensor_stream/evaluator/opencl/kernels/negate.cl +0 -8
  60. data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +0 -24
  61. data/lib/tensor_stream/evaluator/opencl/kernels/pow.cl +0 -46
  62. data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +0 -3
  63. data/lib/tensor_stream/evaluator/opencl/kernels/reciprocal.cl +0 -8
  64. data/lib/tensor_stream/evaluator/opencl/kernels/round.cl +0 -8
  65. data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid.cl +0 -9
  66. data/lib/tensor_stream/evaluator/opencl/kernels/sigmoid_grad.cl +0 -55
  67. data/lib/tensor_stream/evaluator/opencl/kernels/sign.cl +0 -21
  68. data/lib/tensor_stream/evaluator/opencl/kernels/sin.cl +0 -9
  69. data/lib/tensor_stream/evaluator/opencl/kernels/softmax.cl +0 -26
  70. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +0 -32
  71. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +0 -28
  72. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_grad.cl +0 -46
  73. data/lib/tensor_stream/evaluator/opencl/kernels/sqrt.cl +0 -9
  74. data/lib/tensor_stream/evaluator/opencl/kernels/square.cl +0 -9
  75. data/lib/tensor_stream/evaluator/opencl/kernels/squared_difference.cl +0 -53
  76. data/lib/tensor_stream/evaluator/opencl/kernels/sub.cl +0 -3
  77. data/lib/tensor_stream/evaluator/opencl/kernels/tan.cl +0 -8
  78. data/lib/tensor_stream/evaluator/opencl/kernels/tanh.cl +0 -8
  79. data/lib/tensor_stream/evaluator/opencl/kernels/tanh_grad.cl +0 -7
  80. data/lib/tensor_stream/evaluator/opencl/kernels/where.cl +0 -8
  81. data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +0 -35
  82. data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +0 -5
  83. data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +0 -1230
  84. data/lib/tensor_stream/evaluator/opencl/opencl_template_helper.rb +0 -95
@@ -1,3 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- % op = operator_to_c('sub')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = tan(A[globalRow * N + globalCol]);
8
- }
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = tanh(A[globalRow * N + globalCol]);
8
- }
@@ -1,7 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
- C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
7
- }
@@ -1,8 +0,0 @@
1
- % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void where_<%= dtype %>(const int M, const int N, __global const short *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
3
- // Get the index of the current element to be processed
4
- const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
- const int globalCol = get_global_id(1); // Col ID of C (0..N)
6
-
7
- C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
8
- }
@@ -1,35 +0,0 @@
1
- module TensorStream
2
- # Buffer used by the OpenCL evaluator
3
- class OpenCLBuffer < Buffer
4
- include ArrayOpsHelper
5
-
6
- attr_accessor :shape, :buffer, :cl_buffer, :op
7
-
8
- def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
9
- @data_type = data_type
10
- @shape = shape
11
- @buffer = buffer
12
- @cl_buffer = cl_buffer
13
- @name = name
14
- @op = op
15
- end
16
-
17
- def to_ruby
18
- return [] if buffer.empty?
19
-
20
- if dirty
21
- op.command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
22
- op.command_queue.finish
23
- self.dirty = false
24
- end
25
-
26
- if shape.empty?
27
- return buffer[0] != 0 if data_type == :boolean
28
- return buffer[0]
29
- end
30
-
31
- result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
32
- data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
33
- end
34
- end
35
- end
@@ -1,5 +0,0 @@
1
- module TensorStream
2
- class OpenclDevice < TensorStream::Device
3
- attr_accessor :native_device
4
- end
5
- end
@@ -1,1230 +0,0 @@
1
- require 'tensor_stream/evaluator/operation_helpers/random_gaussian'
2
- require 'tensor_stream/evaluator/operation_helpers/array_ops_helper'
3
- require 'tensor_stream/evaluator/operation_helpers/math_helper'
4
- require 'tensor_stream/evaluator/opencl/opencl_buffer'
5
- require 'tensor_stream/evaluator/opencl/opencl_template_helper'
6
- require 'tensor_stream/evaluator/opencl/opencl_device'
7
- require 'opencl_ruby_ffi'
8
- require 'narray_ffi'
9
- require 'tensor_stream/evaluator/base_evaluator'
10
-
11
- module TensorStream
12
- module Evaluator
13
- class FullEvalNotPossible < RuntimeError
14
- end
15
-
16
- # Errors during graph evaluation
17
- class EvaluatorExcecutionException < RuntimeError
18
- attr_reader :tensor
19
-
20
- def initialize(exception, tensor)
21
- @exception = exception
22
- @tensor = tensor
23
- end
24
-
25
- def wrapped_exception
26
- @exception
27
- end
28
- end
29
-
30
- ## OpenCL evaluator: runs graph operations as kernels on an OpenCL device
31
- class OpenclEvaluator < BaseEvaluator
32
- attr_accessor :retain
33
- attr_reader :opencl_device
34
-
35
- include TensorStream::OpHelper
36
- include TensorStream::ArrayOpsHelper
37
- include TensorStream::MathHelper
38
-
39
- def initialize(session, device, thread_pool: nil, log_intermediates: false)
40
- super
41
- _create_opencl_context(device.native_device)
42
- @opencl_device = device.native_device
43
- create_command_queue
44
- end
45
-
46
- def self.query_supported_devices
47
- devices = query_devices_with_score
48
- devices.sort { |a| a[1] }.reverse.map do |d|
49
- opencl_to_device(d)
50
- end
51
- end
52
-
53
- def self.fetch_device(query = [])
54
- devices = query_devices_with_score
55
- platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
56
- opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
57
- end
58
-
59
- def self.opencl_to_device(d)
60
- device = d[0]
61
- index = d[3]
62
- platform_name = device.platform.name.tr(' ', '_').downcase
63
- uri = [platform_name, index].join(':')
64
-
65
- device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
66
-
67
- OpenclDevice.new(uri, device_type, self).tap do |devide|
68
- devide.native_device = device
69
- end
70
- end
71
-
72
- ##
73
- # Select the best device available in the system for this evaluator
74
- def self.default_device
75
- devices = OpenclEvaluator.query_devices_with_score
76
- device = devices.sort { |a| a[1] }.reverse.first
77
- opencl_to_device(device)
78
- end
79
-
80
- # opencl evaluator main entrypoint
81
- def run(tensor, execution_context)
82
- read_final_result(complete_eval(tensor, execution_context))
83
- end
84
-
85
- def run_with_buffer(tensor, context, execution_context)
86
- @context = context
87
- @context[:_cache][:_cl_buffers] ||= {} if context[:_cache]
88
-
89
- if tensor.is_a?(Array)
90
- tensor.collect do |t|
91
- value = run(t, execution_context)
92
- Buffer.new(data_type: t.data_type, buffer: value)
93
- end
94
- else
95
- value = run(tensor, execution_context)
96
- Buffer.new(data_type: tensor.data_type, buffer: value)
97
- end
98
- end
99
-
100
- # buffer comes from non-opencl evaluator
101
- def convert_from_buffer(tensor, result)
102
- if result.buffer.is_a?(TensorStream::Evaluator::OutputGroup)
103
- converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map { |output, data_type| convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name) }
104
- TensorStream::Evaluator::OutputGroup.new(converted_outputs, result.buffer.data_types)
105
- else
106
- convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
107
- end
108
- end
109
-
110
- def enqueue_buffer_read(tensor, context)
111
- buffer = _run(tensor, context)
112
- if buffer.is_a?(Array)
113
- buffer = buffer.collect do |b|
114
- next b if b.buffer.size.zero?
115
- _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
116
- b
117
- end
118
- else
119
- return buffer.outputs[0] if buffer.is_a?(OutputGroup)
120
- return buffer if buffer.nil?
121
- return [] if buffer.buffer.nil?
122
- return buffer if buffer.buffer.size.zero?
123
- _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
124
- buffer
125
- end
126
- end
127
-
128
- def complete_eval(tensor, context)
129
- buffer = enqueue_buffer_read(tensor, context)
130
- _opencl_queue.finish
131
- buffer
132
- end
133
-
134
- def self.query_devices_with_score
135
- OpenCL.platforms.flat_map do |p|
136
- p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
137
- score = 0
138
- if d.type.to_s == 'CPU'
139
- score += 1
140
- elsif d.type.to_s == 'GPU'
141
- score += 4
142
- end
143
-
144
- score += 1000 if d.platform.name == 'NVIDIA CUDA'
145
-
146
- score += d.max_compute_units
147
- score += d.max_clock_frequency
148
-
149
- [d, score, p.name, index]
150
- end
151
- end
152
- end
153
-
154
- protected
155
-
156
- def prepare_input(tensor, context, options = {})
157
- return nil unless tensor
158
- tensor = resolve_placeholder(tensor)
159
- if options[:noop]
160
- tensor
161
- elsif options[:buffer]
162
- complete_eval(tensor, context)
163
- elsif options[:complete]
164
- read_final_result(complete_eval(tensor, context))
165
- else
166
- _run(tensor, context)
167
- end
168
- end
169
-
170
- # read result from opencl and convert to ruby
171
- def read_final_result(buffer)
172
- return buffer.map { |b| read_final_result(b) } if buffer.is_a?(Array)
173
- return nil if buffer.nil?
174
-
175
- buffer.to_ruby
176
- end
177
-
178
- def _create_opencl_context(opencl_device)
179
- @opencl_context = OpenCL.create_context(opencl_device)
180
- end
181
-
182
- def create_command_queue
183
- supported_proprties = opencl_device.queue_properties.names
184
-
185
- properties = []
186
- properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
187
- properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
188
- @command_queue = _opencl_context.create_command_queue(opencl_device, properties: properties)
189
- end
190
-
191
- def _opencl_context
192
- @opencl_context
193
- end
194
-
195
- def _opencl_queue
196
- @command_queue
197
- end
198
-
199
- def cl_template_path(kernel, extension)
200
- File.join(File.dirname(__FILE__), 'kernels', "#{kernel}.#{extension}")
201
- end
202
-
203
- def _cl_program(kernel, args = {})
204
- suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
205
- @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
206
- filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
207
- raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
208
- source = File.read(filename)
209
- source = OpenclTemplateHelper.new(source).generate(args)
210
- # File.write("/tmp/#{kernel}.#{suffix}.cl", source)
211
- program = _opencl_context.create_program_with_source(source)
212
- program.build
213
- rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
214
- puts "OpenCL Compile error: #{program.build_log}"
215
- raise e
216
- end
217
- end
218
-
219
- def escape_arg_content(value)
220
- return value.tr(' ','_') if value.is_a?(String)
221
- return value.join('-') if value.is_a?(Array)
222
-
223
- value
224
- end
225
-
226
- def _run(tensor, execution_context)
227
- return tensor if tensor.is_a?(OpenCLBuffer)
228
- return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array) && !tensor.size.empty? && tensor[0].is_a?(Tensor)
229
-
230
- tensor = tensor.call if tensor.is_a?(Proc)
231
-
232
- child_context = execution_context.dup
233
- res = if tensor.is_a?(Operation)
234
- if !self.class.ops.include?(tensor.operation.to_sym)
235
- result = @session.delegate_to_evaluator(tensor, @context, execution_context)
236
- convert_from_buffer(tensor, result)
237
- else
238
- eval_operation(tensor, child_context)
239
- end
240
- elsif tensor.is_a?(Variable)
241
- eval_variable(tensor, child_context)
242
- elsif tensor.is_a?(Placeholder)
243
- resolve_placeholder(tensor, child_context)
244
- else
245
- eval_tensor(tensor, child_context)
246
- end
247
- execution_context.deep_merge!(returns: child_context[:returns])
248
- res
249
- end
250
-
251
- def eval_variable(tensor, _child_context)
252
- raise "variable #{tensor.name} not initalized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
253
- tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
254
- tensor.buffer
255
- end
256
-
257
- register_op :no_op do |_context, _tensor, _inputs|
258
- end
259
-
260
- register_op :log do |context, tensor, inputs|
261
- execute_func('log', tensor, inputs[0], context)
262
- end
263
-
264
- register_op :cond, noop: true do |context, tensor, inputs|
265
- pred = complete_eval(tensor.options[:pred], context)
266
-
267
- if all_true?(pred.buffer)
268
- complete_eval(inputs[0], context)
269
- else
270
- complete_eval(inputs[1], context)
271
- end
272
- end
273
-
274
- register_op :identity do |context, tensor, inputs|
275
- if tensor.inputs.size > 1
276
- tensor.inputs[1..inputs.size].each { |input| complete_eval(input, context) }
277
- end
278
- inputs[0]
279
- end
280
-
281
- register_op :assign, noop: true do |context, tensor, inputs|
282
- assign_var(tensor, inputs[1], context)
283
- end
284
-
285
- register_op :assign_add do |context, tensor, inputs|
286
- value = execute_2_operand_func('add', tensor, inputs[0], inputs[1], context)
287
- assign_var(tensor, value, context)
288
- end
289
-
290
- register_op :assign_sub do |context, tensor, inputs|
291
- value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1], context)
292
- assign_var(tensor, value, context)
293
- end
294
-
295
- register_op :variable, noop: true do |context, tensor, inputs|
296
- variable = tensor.inputs[0]
297
- raise "variable #{tensor.name} not initalized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
298
- variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
299
- variable.buffer
300
- end
301
-
302
- # Fast in place multiply subtract assign
303
- register_op :apply_gradient_descent do |_context, tensor, inputs|
304
- _target_var, learning_rate, delta = inputs
305
-
306
- assign = tensor.inputs[0] || tensor
307
-
308
- assign.buffer.dirty = true # force buffer copy when variable is read externally
309
- output_buffer = assign.buffer
310
-
311
- m, n = output_buffer.shape
312
- work_group = [m || 1, n || 1]
313
- cl_m = OpenCL::Int1.new(m || 1)
314
- cl_n = OpenCL::Int1.new(n || 1)
315
-
316
- event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
317
- method_call = :"apply_gradient_#{output_buffer.data_type}"
318
- event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
319
- output_buffer.op = event
320
- output_buffer
321
- end
322
-
323
- # Fast in place momentum update (writes both the variable and its accumulator buffer)
324
- register_op :apply_momentum do |_context, tensor, inputs|
325
- target_var, momentum_var, learning_rate, grad, momentum = inputs
326
-
327
- assign = tensor.inputs[0] || tensor
328
- assign_acc = tensor.inputs[1]
329
- assign.buffer.dirty = true # force buffer copy when variable is read externally
330
- assign_acc.buffer.dirty = true # force buffer copy when variable is read externally
331
-
332
- output_buffer = assign.buffer
333
-
334
- m, n = output_buffer.shape
335
- work_group = [m || 1, n || 1]
336
- cl_m = OpenCL::Int1.new(m || 1)
337
- cl_n = OpenCL::Int1.new(n || 1)
338
-
339
- event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
340
- method_call = :"apply_momentum_#{output_buffer.data_type}"
341
- event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
342
- send(method_call, _opencl_queue, work_group, cl_m, cl_n, grad.cl_buffer,
343
- learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
344
- assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
345
- output_buffer.op = event
346
- assign_acc.buffer.op = event
347
- output_buffer
348
- end
349
-
350
- # Adam optimization algorithm
351
- register_op :apply_adam do |_context, tensor, inputs|
352
- _target_var, _m, _v, beta1_power, beta2_power, lr_t, beta1_t, beta2_t, epsilon_t, grad = inputs
353
-
354
- assign = tensor.inputs[0] || tensor
355
- assign_m = tensor.inputs[1]
356
- assign_v = tensor.inputs[2]
357
-
358
- # mark variable buffers as dirty
359
- assign.buffer.dirty = true # force buffer copy when variable is read externally
360
- assign_m.buffer.dirty = true # force buffer copy when variable is read externally
361
- assign_v.buffer.dirty = true # force buffer copy when variable is read externally
362
-
363
- output_buffer = assign.buffer
364
-
365
- m, n = output_buffer.shape
366
- work_group = [m || 1, n || 1]
367
- cl_m = OpenCL::Int1.new(m || 1)
368
- cl_n = OpenCL::Int1.new(n || 1)
369
-
370
- event_wait_list = build_event_wait_list(inputs)
371
- method_call = :"apply_adam_#{output_buffer.data_type}"
372
- event = _cl_program("apply_adam", dtype: output_buffer.data_type)
373
- .send(method_call, _opencl_queue, work_group, cl_m, cl_n,
374
- grad.cl_buffer,
375
- lr_t.cl_buffer,
376
- beta1_power.cl_buffer,
377
- beta2_power.cl_buffer,
378
- beta1_t.cl_buffer,
379
- beta2_t.cl_buffer,
380
- epsilon_t.cl_buffer,
381
- assign_m.buffer.cl_buffer,
382
- assign.buffer.cl_buffer,
383
- assign_v.buffer.cl_buffer,
384
- event_wait_list: event_wait_list)
385
- output_buffer.op = event
386
- assign_m.buffer.op = event
387
- assign_v.buffer.op = event
388
- output_buffer
389
- end
390
-
391
- %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
392
- register_op op, noop: true do |context, tensor, inputs|
393
- execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
394
- end
395
- end
396
-
397
- %i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
398
- register_op op, noop: true do |context, tensor, inputs|
399
- execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
400
- end
401
- end
402
-
403
- register_op :add_n do |_context, tensor, inputs|
404
- if inputs.size == 1
405
- inputs[0]
406
- else
407
- m, n = inputs[0].shape
408
- work_group = [m || 1, n || 1]
409
- cl_m = OpenCL::Int1.new(m || 1)
410
- cl_n = OpenCL::Int1.new(n || 1)
411
- cl_switch = OpenCL::Int1.new(0)
412
- dtype = tensor.data_type
413
-
414
- output_buffer = _create_result_buffer(tensor.data_type, inputs[0].shape, "out_#{tensor.name}")
415
- inputs_queue = inputs.dup
416
- a = inputs_queue.pop
417
- until inputs_queue.empty?
418
- b = inputs_queue.pop
419
- event_wait_list = build_event_wait_list([a, b])
420
- method_call = :"add_#{a.data_type}_#{b.data_type}"
421
- event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
422
- a = output_buffer
423
- a.op = event
424
- end
425
-
426
- output_buffer.op = a.op
427
- output_buffer
428
- end
429
- end
430
-
431
- register_op :expand_dims, buffer: true do |_context, tensor, inputs|
432
- axis = inputs[1].buffer[0]
433
- shape = inputs[0].shape.dup
434
- axis = -axis if axis == shape.size
435
- new_shape = shape.insert(axis, 1).compact
436
- new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
437
- convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
438
- end
439
-
440
- register_op :fill, buffer: true do |_context, tensor, inputs|
441
- shape = inputs[0]
442
- value = inputs[1]
443
-
444
- narray_size = shape.buffer.to_a.reduce(:*) || 1
445
- cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
446
-
447
- buffer = if cl_buffer
448
- cl_buffer.buffer
449
- else
450
- allocate_narray_for_type(tensor.data_type, narray_size)
451
- end
452
-
453
- buffer.fill!(value.buffer[0])
454
- convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
455
- end
456
-
457
- register_op :floor_div, noop: true do |context, tensor, inputs|
458
- if fp_type?(tensor.data_type)
459
- execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
460
- else
461
- execute_2_operand_func('div', tensor, inputs[0], inputs[1], context)
462
- end
463
- end
464
-
465
- register_op :where, noop: true do |context, tensor, inputs|
466
- pred = tensor.options[:pred]
467
- execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
468
- end
469
-
470
- register_op :mat_mul do |_context, tensor, inputs|
471
- a, b = inputs
472
-
473
- m = a.shape[0]
474
- n = b.shape[1]
475
- v = b.shape[0]
476
- k = a.shape[1]
477
-
478
- if tensor.options[:transpose_a]
479
- m = a.shape[1]
480
- k = a.shape[0]
481
- end
482
-
483
- if tensor.options[:transpose_b]
484
- n = b.shape[0]
485
- v = b.shape[1]
486
- end
487
-
488
- result_shape = [m, n]
489
-
490
- raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
491
- raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
492
- raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
493
-
494
- dtype = tensor.data_type
495
- a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
496
- output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
497
-
498
- cl_m = OpenCL::Int1.new(m)
499
- cl_n = OpenCL::Int1.new(n)
500
- cl_k = OpenCL::Int1.new(k)
501
-
502
- transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
503
- transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
504
- event_wait_list = build_event_wait_list(inputs)
505
- output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
506
- output_buffer
507
- end
508
-
509
- register_op :cast do |_context, tensor, inputs|
510
- a = inputs[0]
511
- if a.data_type != tensor.data_type
512
- buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
513
- m, n = a.shape
514
- cl_m = OpenCL::Int1.new(m || 1)
515
- cl_n = OpenCL::Int1.new(n || 1)
516
- work_group = [m || 1, n || 1]
517
- event_wait_list = build_event_wait_list(inputs)
518
- buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
519
- buffer
520
- else
521
- a
522
- end
523
- end
524
-
525
- register_op :stack do |_context, tensor, inputs|
526
- axis = tensor.options[:axis] || 0
527
- shape = inputs[0].shape
528
- rank = shape.size + 1
529
- elem_size = shape.empty? ? 1 : shape.reduce(:*)
530
-
531
- new_shape = [inputs.size]
532
- shape.inject(new_shape) { |ns, s| ns << s }
533
-
534
- divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
535
- a << s * a.last
536
- end.reverse
537
-
538
- axis = rank + axis if axis < 0
539
- rotated_shape = Array.new(axis + 1) { new_shape.shift }
540
- new_shape = rotated_shape.rotate! + new_shape
541
-
542
- output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
543
- multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
544
- a << s * a.last
545
- end.reverse
546
-
547
- cl_n = OpenCL::Int1.new(elem_size)
548
- work_group = [elem_size]
549
- event_wait_list = build_event_wait_list(inputs)
550
- ops = inputs.each_with_index.map do |input, index|
551
- cl_index = OpenCL::Int1.new(index)
552
- _cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
553
- end
554
- output_buffer.op = ops
555
- output_buffer
556
- end
557
-
558
- %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil].each do |op|
559
- register_op op, noop: true do |context, tensor, inputs|
560
- execute_func(op.to_s, tensor, inputs[0], context)
561
- end
562
- end
563
-
564
- register_op :softmax do |_context, tensor, inputs|
565
- a = inputs[0]
566
- event_wait_list = build_event_wait_list(inputs)
567
- dtype = tensor.data_type
568
- output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
569
-
570
- m, n = a.shape
571
- work_group = [m]
572
- n = m if n.nil?
573
- cl_n = OpenCL::Int1.new(n || 1)
574
-
575
- event = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
576
- output_buffer.op = event
577
- output_buffer
578
- end
579
-
580
- register_op :log_softmax do |_context, tensor, inputs|
581
- a = inputs[0] # logits
582
- event_wait_list = build_event_wait_list(inputs)
583
- dtype = tensor.data_type
584
- output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
585
-
586
- m, n = a.shape
587
- work_group = [m]
588
- n = m if n.nil?
589
- cl_n = OpenCL::Int1.new(n || 1)
590
-
591
- event = _cl_program("log_softmax", dtype: dtype).send(:"log_softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
592
- output_buffer.op = event
593
- output_buffer
594
- end
595
-
596
- register_op :softmax_cross_entropy_with_logits_v2 do |context, tensor, inputs|
597
- a = inputs[0] # logits
598
- b = inputs[1] # labels
599
- event_wait_list = build_event_wait_list(inputs)
600
- dtype = tensor.data_type
601
- output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
602
- output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
603
- rank = a.shape.size - 1
604
- m, n = a.shape
605
- work_group = [m]
606
- n = m if n.nil?
607
- cl_n = OpenCL::Int1.new(n || 1)
608
-
609
- event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer,
610
- output_buffer.cl_buffer, output_buffer_backprop.cl_buffer, event_wait_list: event_wait_list)
611
- output_buffer.op = event
612
- output_buffer_backprop.op = event
613
-
614
- loss = reduction(context, tensor, output_buffer, rank, :sum)
615
- OutputGroup.new([loss, output_buffer_backprop], [tensor.inputs[0].data_type, tensor.inputs[0].data_type])
616
- end
617
-
618
# Registers the :softmax_cross_entropy_with_logits_v2_grad op. Passes logits,
# labels and incoming grads to the "softmax_cross_grad" OpenCL program and
# returns the output buffer (shaped like the logits) with the kernel event set.
register_op :softmax_cross_entropy_with_logits_v2_grad do |_context, tensor, inputs|
  logits, labels, grads = inputs[0], inputs[1], inputs[2]
  pending_events = build_event_wait_list(inputs)
  dtype = tensor.data_type
  result = _create_result_buffer(dtype, logits.shape, tensor.name)

  rows, cols = logits.shape
  cols = rows if cols.nil?
  cl_cols = OpenCL::Int1.new(cols || 1)

  result.op = _cl_program("softmax_cross_grad", dtype: dtype).send(:"softmax_cross_grad_#{dtype}", _opencl_queue, [rows], cl_cols, logits.cl_buffer, labels.cl_buffer, grads.cl_buffer, result.cl_buffer, event_wait_list: pending_events)
  result
end
635
-
636
# Registers the :softmax_grad op. Runs the "softmax_grad" OpenCL program
# (built with the row length passed as the :size option) on the input and the
# incoming gradient, returning the output buffer with the kernel event set.
register_op :softmax_grad do |_context, tensor, inputs|
  source, grad = inputs

  pending_events = build_event_wait_list(inputs)
  dtype = tensor.data_type
  result = _create_result_buffer(dtype, source.shape, tensor.name)

  rows, cols = source.shape
  cols = rows if cols.nil?
  cl_cols = OpenCL::Int1.new(cols || 1)

  result.op = _cl_program('softmax_grad', dtype: dtype, size: cols).send(:"softmax_grad_#{dtype}", _opencl_queue, [rows], cl_cols, source.cl_buffer, grad.cl_buffer, result.cl_buffer, event_wait_list: pending_events)
  result
end
651
-
652
# Registers the :check_numerics op. Fully evaluates the first input and raises
# TensorStream::InvalidArgumentError if any element of its host buffer is NaN
# or infinite; otherwise returns the evaluated input unchanged.
register_op :check_numerics, noop: true do |context, tensor, inputs|
  evaluated = complete_eval(inputs[0], context)
  label = tensor.options[:name]

  evaluated.buffer.each do |element|
    next unless element.nan? || element.infinite?

    raise TensorStream::InvalidArgumentError, "#{label} Invalid Argument"
  end
  evaluated
end
661
-
662
# Registers the :broadcast_transform op. Returns the two inputs untouched when
# their shapes already match; otherwise reads both back to the host, broadcasts
# them and wraps the results in new OpenCL buffers named "<tensor>_a"/"<tensor>_b".
register_op :broadcast_transform do |context, tensor, inputs|
  left, right = inputs

  next [left, right] if left.shape == right.shape

  host_left = read_final_result(complete_eval(left, context))
  host_right = read_final_result(complete_eval(right, context))
  expanded_left, expanded_right = broadcast(host_left, host_right)
  # Both wrapped outputs use the first input's data type.
  [
    wrap_opencl(expanded_left, data_type: left.data_type, name: "#{tensor.name}_a"),
    wrap_opencl(expanded_right, data_type: left.data_type, name: "#{tensor.name}_b")
  ]
end
675
-
676
# Registers the :print op. Evaluates the second input, prints it to stdout
# prefixed by the tensor's :message option (empty string when absent), and
# passes the first input through as the result.
register_op :print do |context, tensor, inputs|
  passthrough, printable = inputs
  host_value = read_final_result(complete_eval(printable, context))
  puts "#{tensor.options.fetch(:message, '')} #{host_value}"
  passthrough
end
683
-
684
# Registers the :rank op: wraps the number of dimensions of the first input
# in an OpenCL buffer of the tensor's data type.
register_op :rank do |_context, tensor, inputs|
  dimension_count = inputs[0].shape.size
  wrap_opencl(dimension_count, data_type: tensor.data_type, name: tensor.name)
end
687
-
688
# Registers the :stop_gradient op: a pass-through that returns its first input.
register_op(:stop_gradient) { |_context, _tensor, inputs| inputs[0] }
691
-
692
# Registers the :slice op. Evaluates the source and the start offsets on the
# host, builds one inclusive range per dimension from (start, size) pairs, and
# slices the host buffer. Shapes and ranges are reversed when talking to the
# NArray buffer and reversed back for the resulting OpenCL buffer.
register_op :slice, noop: true do |context, tensor, inputs|
  source = complete_eval(inputs[0], context)
  starts = read_final_result(complete_eval(inputs[1], context))
  lengths = tensor.options[:size]

  ranges = starts.zip(lengths).collect { |start, length| (start..(start + length - 1)) }.reverse

  reshaped = source.buffer.reshape(*source.shape.reverse)
  sliced = reshaped.slice[*ranges]
  convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
end
703
-
704
# Registers the :transpose op.
# * Rank-2 input without an explicit permutation: transposes the host NArray
#   directly and converts the result to an OpenCL buffer.
# * Otherwise: permutes on the host via transpose_with_perm (default
#   permutation is the reversed axis order) and enqueues a write of the host
#   result to the device buffer.
register_op :transpose, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  rank = source.shape.size
  reversed_axes = (0...rank).to_a.reverse

  if rank == 2 && inputs[1].nil?
    transposed = source.buffer.reshape(*source.shape.reverse).transpose(*reversed_axes)
    next convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: source.data_type, name: tensor.name)
  end

  perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
  target_shape = perm.map { |axis| source.shape[axis] }.to_a
  result = _create_result_buffer(tensor.data_type, target_shape, tensor.name)
  transpose_with_perm(source.buffer, result.buffer, source.shape, target_shape, perm)

  result.op = _opencl_queue.enqueue_write_buffer(result.cl_buffer, result.buffer)
  result
end
723
-
724
# Registers the :index op. Evaluates the container and the index, then:
# * OutputGroup -> returns the indexed output,
# * Array       -> returns the indexed element,
# * otherwise   -> reads the buffer to the host, picks the indexed entry and
#                  converts it to an OpenCL buffer whose shape drops the
#                  leading dimension.
register_op :index, noop: true do |context, tensor, inputs|
  container = _run(inputs[0], context)
  position = read_final_result(_run(inputs[1], context))

  case container
  when OutputGroup
    container.outputs[position]
  when Array
    container[position]
  else
    remaining_shape = container.shape.dup
    remaining_shape.shift
    host_values = read_final_result(container)
    convert_to_opencl(host_values[position], remaining_shape, data_type: container.data_type, name: tensor.name)
  end
end
739
-
740
# Registers the :broadcast_gradient_args op. Feeds both inputs' host buffers
# (as plain arrays) to get_broadcast_gradient_args and returns the two results
# as int32 OpenCL buffers inside an OutputGroup.
register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
  first_values = inputs[0].buffer.to_a
  second_values = inputs[1].buffer.to_a
  rx, ry = get_broadcast_gradient_args(first_values, second_values)

  wrapped = [
    wrap_opencl(rx, data_type: :int32, name: tensor.name),
    wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")
  ]
  OutputGroup.new(wrapped, tensor.inputs.map(&:data_type))
end
744
-
745
# Registers the :shape op: wraps the first input's shape array in an OpenCL
# buffer of the tensor's data type.
register_op :shape do |_context, tensor, inputs|
  dimensions = inputs[0].shape
  wrap_opencl(dimensions, name: tensor.name, data_type: tensor.data_type)
end
748
-
749
# Registers the :reshape op. Reads the requested shape from the second input
# and re-wraps the first input's host buffer under that shape. An empty target
# shape is used as-is when the buffer holds exactly one element; any other
# shape goes through TensorShape.fix_inferred_elements first.
register_op :reshape, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  requested = read_final_result(inputs[1])

  to_scalar = requested.size.zero? && source.buffer.size == 1
  final_shape = to_scalar ? requested : TensorShape.fix_inferred_elements(requested, source.buffer.size)

  convert_to_opencl(source.buffer, final_shape, data_type: source.data_type, name: tensor.name)
end
761
-
762
# Registers the :flow_group op: calls finish on the OpenCL queue and yields no
# value (nil).
register_op :flow_group do |_context, _tensor, _inputs|
  _opencl_queue.finish
  nil
end
766
-
767
# Registers the :size op: wraps the element count of the first input's host
# buffer, typed by the :out_type option (defaulting to :int32).
register_op :size do |_context, tensor, inputs|
  element_count = inputs[0].buffer.size
  result_type = tensor.options[:out_type] || :int32
  wrap_opencl(element_count, name: tensor.name, data_type: result_type)
end
770
-
771
# Registers the :sum and :mean ops; both delegate to #reduction with the
# op name as the reducing function and the second input as the axis.
[:sum, :mean].each do |reducer|
  register_op(reducer, noop: true) do |context, tensor, inputs|
    source, axis = inputs[0], inputs[1]
    reduction(context, tensor, source, axis, reducer.to_sym)
  end
end
776
-
777
# Registers the :prod op. An input whose host buffer is empty yields the
# scalar 1.0; anything else is delegated to #reduction with :prod.
register_op :prod, noop: true do |context, tensor, inputs|
  evaluated = complete_eval(inputs[0], context)

  next convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name) if evaluated.buffer.empty?

  reduction(context, tensor, inputs[0], inputs[1], :prod)
end
786
-
787
# Registers the :argmin op. Computes, on the host, the index of the smallest
# element along the :axis option (default 0) and returns it as an OpenCL buffer.
#
# Raises TensorStream::InvalidArgumentError when the axis is outside
# [-rank, rank).
register_op :argmin, buffer: true do |_context, tensor, inputs|
  axis = tensor.options[:axis] || 0
  rank = inputs[0].shape.size
  raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank

  # Negative axes pass the range check above, but get_op_with_axis counts
  # axes upward from 0 and would never match one; normalize to the
  # equivalent non-negative axis first.
  axis += rank if axis < 0

  arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
  op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
  convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
end
796
-
797
# Registers the :argmax op. Computes, on the host, the index of the largest
# element along the :axis option (default 0) and returns it as an OpenCL buffer.
#
# Raises TensorStream::InvalidArgumentError when the axis is outside
# [-rank, rank).
register_op :argmax, buffer: true do |_context, tensor, inputs|
  axis = tensor.options[:axis] || 0
  rank = inputs[0].shape.size
  raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank

  # Negative axes pass the range check above, but get_op_with_axis counts
  # axes upward from 0 and would never match one; normalize to the
  # equivalent non-negative axis first.
  axis += rank if axis < 0

  arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
  op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
  convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
end
806
-
807
# Evaluates an operation tensor, with per-run and constant caching.
#
# Returns a cached value when one exists under the tensor's cache key
# (constant cache first, then the run context). Otherwise invokes the op,
# optionally calls the tensor's breakpoint hook and records the computation
# in the compute history, stores the result in the caches and returns what
# `invoke` produced.
#
# On any error the OpenCL queue is finished first. Evaluator/TensorStream
# errors are re-raised with added context; other StandardErrors are printed
# and wrapped in EvaluatorExcecutionException.
def eval_operation(tensor, child_context)
  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
  return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
  return @context[cache_key] if @context.key?(cache_key)

  outcome = invoke(tensor, child_context)
  # `stored` is what gets logged and cached; when a breakpoint is set it is the
  # host-side value read back from the device rather than the raw outcome.
  stored = outcome

  if tensor.breakpoint
    a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
    b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
    a = read_final_result(complete_eval(a, child_context))
    b = read_final_result(complete_eval(b, child_context))
    stored = read_final_result(complete_eval(outcome, child_context))

    tensor.breakpoint.call(tensor, a, b, stored)
  end

  if @log_intermediates
    @context[:compute_history] << {
      name: tensor.name,
      type: tensor.data_type,
      shape: shape_eval(stored),
      source: tensor.source,
      description: tensor.to_math(true, 1),
      value: stored
    }
  end

  @context[cache_key] = stored
  @context[:_cache][cache_key] = stored if tensor.is_const
  outcome
rescue EvaluatorExcecutionException, TensorStreamError => e
  _opencl_queue.finish # dump queue
  raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
rescue StandardError => e
  _opencl_queue.finish # dump queue
  puts e.message
  puts e.backtrace.join("\n")

  raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
end
862
-
863
# Evaluates a non-operation tensor into an OpenCL buffer, with caching.
#
# Non-Tensor arguments are returned unchanged. A tensor whose value is itself
# a Tensor is evaluated via _run; any other tensor is wrapped directly. The
# result is stored in the run context and, for constants, in the constant cache.
def eval_tensor(tensor, child_context)
  return tensor unless tensor.is_a?(Tensor)

  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
  return @context[cache_key] if @context.key?(cache_key)
  return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]

  evaluated = tensor.value.is_a?(Tensor) ? _run(tensor.value, child_context) : wrap_opencl(tensor, name: tensor.name)

  @context[cache_key] = evaluated
  @context[:_cache][cache_key] = evaluated if tensor.is_const
  evaluated
end
877
-
878
- private
879
-
880
# Assigns the evaluated value of `b` to a variable.
#
# The target is the tensor's first input, or the tensor itself when it has
# none. If the target already owns a buffer, the new value is copied on the
# device (unless both already share the same cl_buffer, in which case the
# source's pending op is reused). Otherwise the value is read to the host and
# a fresh buffer is created. The target buffer is flagged dirty and returned.
def assign_var(tensor, b, child_context)
  target = tensor.inputs[0] || tensor
  source = complete_eval(b, child_context)

  if target.buffer
    pending_events = build_event_wait_list([source, target.buffer])
    same_device_buffer = target.buffer.cl_buffer == source.cl_buffer
    target.buffer.op = if same_device_buffer
                         source.op
                       else
                         _opencl_queue.enqueue_copy_buffer(source.cl_buffer, target.buffer.cl_buffer, event_wait_list: pending_events)
                       end
  else
    host_value = read_final_result(source)
    target.buffer = convert_to_opencl(host_value, source.shape, data_type: tensor.data_type, name: target.name)
    target.value = host_value
  end

  target.buffer.dirty = true
  target.buffer
end
900
-
901
# Runs a two-operand OpenCL program (e.g. an element-wise binary op).
#
# Evaluates both inputs, type-casts via auto_type_cast, infers the result
# shape and lets select_program pick the kernel variant (possibly swapping the
# operands). The "<op>_b" variant additionally receives the second operand's
# dimensions and supports only rank 1 or 2 for it.
#
# prog_name - optional program name to load; defaults to op_name.
#
# Returns the output buffer with the kernel event as its pending op, or an
# empty [0]-shaped buffer when the inferred result shape is [0].
# Raises RuntimeError ("rank > 2 not supported!") for unsupported ranks.
def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
  a = _run(input_a, child_context)
  b = _run(input_b, child_context)
  a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  dtype = tensor.data_type
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
  return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]

  output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
  a, b, prog, switch_operands = select_program(a, b, op_name)

  m, n = result_shape
  work_group = [m || 1, n || 1]
  cl_m = OpenCL::Int1.new(m || 1)
  cl_n = OpenCL::Int1.new(n || 1)
  cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition

  event_wait_list = build_event_wait_list([a, b]) # add dependency wait list

  method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
  prog_name ||= op_name

  # Only the "<op>_b" kernels take the second operand's dimensions.
  operand_b_dims = if prog == "#{op_name}_b"
                     case b.shape.size
                     when 2
                       [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
                     when 1
                       [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
                     else
                       raise "rank > 2 not supported!"
                     end
                   else
                     []
                   end

  output_buffer.op = _cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype).
                     send(method_call, _opencl_queue, work_group, cl_m, cl_n, *operand_b_dims,
                          cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer
end
940
-
941
# Runs a conditional (predicate-driven) OpenCL program.
#
# Evaluates the predicate and both branches, type-casts the branches via
# auto_type_cast, and launches "<op_name>_<dtype>" with the predicate buffer
# followed by both branch buffers. The output buffer takes the predicate's shape.
#
# Returns the output buffer with the kernel event as its pending op.
def execute_cond_func(op_name, tensor, pred, input_a, input_b, child_context)
  predicate = _run(pred, child_context)
  when_true = _run(input_a, child_context)
  when_false = _run(input_b, child_context)

  when_true, when_false = auto_type_cast(when_true, when_false, name: "#{tensor.name}/cast_#{when_true.name}_#{when_false.data_type}")
  dtype = tensor.data_type

  result = _create_result_buffer(dtype, predicate.shape, tensor.name)

  rows, cols = predicate.shape
  work_group = [rows || 1, cols || 1]
  cl_rows = OpenCL::Int1.new(rows || 1)
  cl_cols = OpenCL::Int1.new(cols || 1)

  pending_events = build_event_wait_list([when_true, when_false, predicate]) # add dependency wait list
  result.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_rows, cl_cols, predicate.cl_buffer, when_true.cl_buffer, when_false.cl_buffer, result.cl_buffer, event_wait_list: pending_events)
  result
end
960
-
961
# Runs a single-operand OpenCL program.
#
# Evaluates `a`, allocates an output buffer of the same shape and launches
# "<op_name>_<dtype>" over a work group built from the first two dimensions
# (missing dimensions default to 1).
#
# Returns the output buffer with the kernel event as its pending op.
def execute_func(op_name, tensor, a, child_context)
  operand = _run(a, child_context)
  pending_events = build_event_wait_list([operand])
  dtype = tensor.data_type
  result = _create_result_buffer(dtype, operand.shape, tensor.name)

  rows, cols = operand.shape
  work_group = [rows || 1, cols || 1]
  cl_rows = OpenCL::Int1.new(rows || 1)
  cl_cols = OpenCL::Int1.new(cols || 1)

  result.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_rows, cl_cols, operand.cl_buffer, result.cl_buffer, event_wait_list: pending_events)
  result
end
976
-
977
# Returns the pair [a, b] unchanged when both share a data type; otherwise
# runs the "cast" OpenCL program on b's buffer and returns [a, cast buffer].
#
# NOTE(review): the result buffer is allocated with b's data type and the
# program is requested with source_dt: a / target_dt: b while reading from b.
# That looks inverted for "cast b to a's type" - confirm against the cast kernel.
def auto_type_cast(a, b, name: nil)
  return [a, b] if a.data_type == b.data_type

  rows, cols = b.shape
  work_group = [rows || 1, cols || 1]
  pending_events = build_event_wait_list([b])
  cast_buffer = _create_result_buffer(b.data_type, b.shape, name)

  cl_rows = OpenCL::Int1.new(rows || 1)
  cl_cols = OpenCL::Int1.new(cols || 1)

  cast_program = _cl_program("cast", source_dt: a.data_type, target_dt: b.data_type)
  cast_buffer.op = cast_program.cast(_opencl_queue, work_group, cl_rows, cl_cols, b.cl_buffer, cast_buffer.cl_buffer, event_wait_list: pending_events)
  [a, cast_buffer]
end
990
-
991
# Casts `source` to `data_type` with the "cast" OpenCL program.
#
# Returns `source` itself when it already has the requested type; otherwise a
# new buffer of the same shape whose pending op is the cast kernel event.
def type_cast(source, data_type, name: nil)
  return source if source.data_type == data_type

  rows, cols = source.shape
  work_group = [rows || 1, cols || 1]
  pending_events = [source.op].compact
  target = _create_result_buffer(data_type, source.shape, name)

  cl_rows = OpenCL::Int1.new(rows || 1)
  cl_cols = OpenCL::Int1.new(cols || 1)

  cast_program = _cl_program("cast", source_dt: source.data_type, target_dt: data_type)
  target.op = cast_program.cast(_opencl_queue, work_group, cl_rows, cl_cols, source.cl_buffer, target.cl_buffer, event_wait_list: pending_events)
  target
end
1004
-
1005
# Converts a Tensor or a raw Ruby value into an OpenCL buffer.
#
# For a Tensor, its value and declared shape are used; for anything else the
# shape is derived with shape_eval. When data_type is not given, the
# argument's own data_type is used.
def wrap_opencl(tensor, data_type: nil, name: nil)
  if tensor.is_a?(Tensor)
    value = tensor.value
    shape = tensor.shape.shape
  else
    value = tensor
    shape = shape_eval(tensor)
  end

  convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
end
1014
-
1015
# Looks up a previously cached OpenCL object by name and shape.
# Returns whatever is stored under the key, or nil when nothing is cached.
def get_cached_buffer(name, shape)
  shape_part = shape.join('_')
  @context[:_cache]["_cl_object_#{name}:#{shape_part}:#{object_id}"]
end
1019
-
1020
# Wraps a Ruby value or NArray in an OpenCLBuffer and enqueues its upload.
#
# value     - scalar, (nested) Array or NArray; scalars are wrapped in an Array.
# shape     - Array of dimensions ([] for a scalar).
# data_type - element type used to allocate and cast the host buffer.
# name      - when given, an already cached buffer for the same name/shape is reused.
#
# Returns the OpenCLBuffer (its op is the write event, or nil when nothing was
# uploaded), or nil when no host buffer can be allocated for the data type.
def convert_to_opencl(value, shape, data_type: nil, name: nil)
  # From here on `value` is always an Array or an NArray.
  value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)

  cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
  cl_object = if name && @context[:_cache][cache_key]
                @context[:_cache][cache_key]
              else
                narray_size = shape.reduce(:*) || 1

                buffer = if value.is_a?(NArray)
                           value
                         else
                           allocate_narray_for_type(data_type, narray_size)
                         end

                return nil if buffer.nil?

                cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)

                # No device buffer is created for an empty value.
                cl_buffer = unless value.flatten.empty?
                              cl_buffer_size = 1 if cl_buffer_size.zero?
                              _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
                            end

                @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
              end

  if value.is_a?(Array)
    value.flatten.each_with_index do |element, index|
      cl_object.buffer[index] = if element.is_a?(Tensor)
                                  read_final_result(complete_eval(element, {}))
                                elsif data_type == :boolean
                                  element ? 1 : 0
                                else
                                  Tensor.cast_dtype(element, data_type)
                                end
    end
  elsif value.is_a?(NArray)
    cl_object.buffer = value
  end
  # The former scalar branches were unreachable (scalars are wrapped above)
  # and one of them referenced an undefined local, so they have been removed.

  upload = cl_object.cl_buffer && (!value.is_a?(Array) || !value.empty?)
  cl_object.op = (_opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer) if upload)
  cl_object
end
1069
-
1070
# Allocates a host NArray of `narray_size` elements for the given data type.
#
# Returns nil for :unknown. Raises RuntimeError for any type without a mapping.
def allocate_narray_for_type(data_type, narray_size)
  return nil if data_type == :unknown

  # Maps each supported data type to the NArray constructor used for it.
  constructors = {
    float: :sfloat, float32: :sfloat,
    float64: :float,
    int: :int, int32: :int, int64: :int,
    int16: :sint,
    boolean: :sint
  }
  constructor = constructors[data_type]
  raise "unsupported type #{data_type}" if constructor.nil?

  NArray.public_send(constructor, narray_size)
end
1088
-
1089
# Returns an output buffer for an op result, reusing a cached one when a
# buffer with the same name and shape was created before.
#
# A [0] shape yields an OpenCLBuffer without host or device storage and is
# never cached.
def _create_result_buffer(data_type, shape, name)
  return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]

  pool = @context[:_cache][:_cl_buffers]
  pool_key = "_result_#{name}_#{shape.join('_')}:#{object_id}"
  pool[pool_key] ||= begin
    # A scalar (empty shape) still needs one element of storage.
    element_count = shape.empty? ? 1 : shape.reduce(:*)
    host_buffer = allocate_narray_for_type(data_type, element_count)
    device_buffer = _opencl_context.create_buffer(host_buffer.size * host_buffer.element_size)
    OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: host_buffer, cl_buffer: device_buffer, name: name)
  end
end
1098
-
1099
# Finds, along `target_axis` of nested array `a`, the index of the element
# that "wins" under `op` (by default the largest).
#
# Recurses through the nesting until current_axis reaches target_axis. There,
# a list of rows is scanned column by column (one winning row index per
# column), and a flat list yields a single winning index. Ties keep the first
# occurrence because `op` is a strict comparison by default. Indices are
# converted with Tensor.cast_dtype(index, output_type).
def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
  unless target_axis == current_axis
    return a.collect { |row| get_op_with_axis(row, target_axis, current_axis + 1, output_type, op) }
  end

  # Scans a flat list and returns the (cast) position of the winning element.
  winner_index = lambda do |values|
    best = nil
    best_position = 0
    values.each_with_index do |candidate, position|
      next unless best.nil? || op.call(candidate, best)

      best = candidate
      best_position = position
    end
    Tensor.cast_dtype(best_position, output_type)
  end

  return winner_index.call(a) unless a[0].is_a?(Array)

  (0...a[0].size).map do |column_index|
    winner_index.call(a.map { |row| row[column_index] })
  end
end
1131
-
1132
# Computes the keep-dims shape of a reduction: every reduced axis becomes 1.
#
# input_shape - Array of dimensions; it is not modified.
# axes        - nil (reduce everything), a single axis index, or an Array of them.
#
# Returns [] when axes is nil, input_shape itself when axes is an empty Array,
# otherwise a new Array with the reduced dimensions set to 1.
def _reduced_shape(input_shape, axes)
  return [] if axes.nil? # reduce to scalar

  axes = [axes] unless axes.is_a?(Array)
  return input_shape if axes.empty?

  # Work on a copy so the caller's shape array is never mutated.
  reduced = input_shape.dup
  axes.each { |dimen| reduced[dimen] = 1 }
  reduced
end
1142
-
1143
# Reduces a buffer on the host with the NArray method named by `func`
# (e.g. :sum, :mean, :prod) and converts the result to an OpenCL buffer.
#
# a - the input to evaluate; b - the axis: nil (reduce everything), an
# Integer, an Array of Integers, or a Tensor that evaluates to one of those.
# With the :keepdims option the output shape keeps reduced axes as size 1.
# A scalar input with a non-nil axis is returned unchanged.
#
# NOTE(review): axes are mapped with `rank - axis.abs`, so a negative axis is
# treated like its absolute value rather than counted from the end - confirm
# whether negative axes are expected here.
def reduction(child_context, tensor, a, b, func)
  input = complete_eval(a, child_context)
  axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b

  if axis.nil?
    total = input.buffer.send(func)
    return convert_to_opencl(total, [], data_type: tensor.data_type, name: tensor.name)
  end

  return input if input.shape.empty?

  # The host buffer is reshaped with reversed dimensions, so tensor axis k
  # corresponds to NArray dimension (last_axis - k).
  reduced = input.buffer.reshape(*input.shape.reverse)
  last_axis = input.shape.size - 1

  if axis.is_a?(Array)
    narray_dims = axis.map { |x| last_axis - x.abs }.sort
    # Highest dimension first, so earlier reductions do not shift later ones.
    narray_dims.reverse_each { |dim| reduced = reduced.send(func, dim.to_i) }
  else
    reduced = reduced.send(func, last_axis - axis.abs)
  end

  if reduced.is_a?(NArray)
    new_shape = reduced.shape.reverse
  else
    reduced = [reduced]
    new_shape = []
  end

  new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]

  convert_to_opencl(reduced.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
end
1174
-
1175
# Selects the variant of a two-operand cl program depending on the input shapes.
#
# Returns [first_operand, second_operand, program_name, switch_flag] where:
# * equal shapes          -> "<op>",   operands in the given order, flag 0
# * one operand is scalar -> "<op>_c", the scalar placed second
# * otherwise             -> "<op>_b", the operand with the lower rank (or a
#                            smaller dimension at equal rank) placed second
# switch_flag is 1 when the operands were swapped, 0 otherwise.
def select_program(input_a, input_b, op)
  return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape

  return [input_b, input_a, "#{op}_c", 1] if input_a.shape.empty? || input_a.shape.reduce(:*) == 1 # A is scalar?
  # Check B's own element count here (the original re-tested A, so a
  # one-element B with a non-empty shape never took the scalar variant).
  return [input_a, input_b, "#{op}_c", 0] if input_b.shape.empty? || input_b.shape.reduce(:*) == 1 # B is scalar?

  return [input_b, input_a, "#{op}_b", 1] if input_a.shape.size < input_b.shape.size

  if input_a.shape.size == input_b.shape.size
    input_a.shape.zip(input_b.shape).each do |s1, s2|
      return [input_b, input_a, "#{op}_b", 1] if s1 < s2
    end
  end

  [input_a, input_b, "#{op}_b", 0]
end
1192
-
1193
# Returns the number of dimensions in `shape`, or 0 when it is not an Array.
def _rank_from_shape(shape)
  return 0 unless shape.is_a?(Array)

  shape.size
end
1196
-
1197
# Collects the pending `op` of every non-nil input into one flat list.
# Inputs whose op is nil contribute a nil entry.
def build_event_wait_list(inputs)
  pending = inputs.compact.map { |buffer| buffer.op }
  pending.flatten
end
1200
-
1201
# Resolves a placeholder to a usable value.
#
# A Placeholder is looked up in the run context by its name (as a symbol);
# any other argument is used as-is. A resolved Tensor is passed through
# Tensor.cast_dtype; any other value is converted to an OpenCL buffer with the
# placeholder's data type and name.
#
# Returns nil for a nil argument.
# Raises RuntimeError when a Placeholder has no value in the context.
def resolve_placeholder(placeholder, _execution_context = {})
  return nil if placeholder.nil?

  resolved = placeholder
  if placeholder.is_a?(Placeholder)
    resolved = @context[placeholder.name.to_sym]
    raise "missing placeholder #{placeholder.name}" if resolved.nil?
  end

  if resolved.is_a?(Tensor)
    Tensor.cast_dtype(resolved, placeholder.data_type)
  else
    convert_to_opencl(resolved, shape_eval(resolved), data_type: placeholder.data_type, name: placeholder.name)
  end
end
1215
-
1216
# Returns true when every leaf of a (possibly nested) Array/NArray is
# non-zero; a bare value is true when it differs from 0. Empty collections
# count as true.
def all_true?(arr)
  return arr != 0 unless arr.is_a?(Array) || arr.is_a?(NArray)

  arr.each { |item| return false unless all_true?(item) }
  true
end
1226
- end
1227
- end
1228
- end
1229
-
1230
- TensorStream::Evaluator.register_evaluator(TensorStream::Evaluator::OpenclEvaluator, 'opencl', 1)