tensor_stream 0.2.0 → 0.3.0

Files changed (58)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +2 -1
  3. data/CHANGELOG.md +5 -0
  4. data/README.md +28 -1
  5. data/benchmark/benchmark.rb +129 -0
  6. data/lib/tensor_stream.rb +7 -4
  7. data/lib/tensor_stream/evaluator/buffer.rb +10 -0
  8. data/lib/tensor_stream/evaluator/evaluator.rb +1 -0
  9. data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl +45 -0
  10. data/lib/tensor_stream/evaluator/kernels/_operand.cl +45 -0
  11. data/lib/tensor_stream/evaluator/kernels/abs.cl +16 -0
  12. data/lib/tensor_stream/evaluator/kernels/add.cl +5 -0
  13. data/lib/tensor_stream/evaluator/kernels/argmax.cl +15 -0
  14. data/lib/tensor_stream/evaluator/kernels/argmin.cl +15 -0
  15. data/lib/tensor_stream/evaluator/kernels/cast.cl +15 -0
  16. data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +5 -0
  17. data/lib/tensor_stream/evaluator/kernels/cos.cl +7 -0
  18. data/lib/tensor_stream/evaluator/kernels/div.cl.erb +5 -0
  19. data/lib/tensor_stream/evaluator/kernels/exp.cl +7 -0
  20. data/lib/tensor_stream/evaluator/kernels/gemm.cl +63 -0
  21. data/lib/tensor_stream/evaluator/kernels/log.cl +7 -0
  22. data/lib/tensor_stream/evaluator/kernels/log1p.cl +7 -0
  23. data/lib/tensor_stream/evaluator/kernels/max.cl +91 -0
  24. data/lib/tensor_stream/evaluator/kernels/mul.cl +5 -0
  25. data/lib/tensor_stream/evaluator/kernels/negate.cl +15 -0
  26. data/lib/tensor_stream/evaluator/kernels/pow.cl +130 -0
  27. data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +15 -0
  28. data/lib/tensor_stream/evaluator/kernels/round.cl +7 -0
  29. data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +8 -0
  30. data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +54 -0
  31. data/lib/tensor_stream/evaluator/kernels/sign.cl +23 -0
  32. data/lib/tensor_stream/evaluator/kernels/sin.cl +8 -0
  33. data/lib/tensor_stream/evaluator/kernels/sqrt.cl +8 -0
  34. data/lib/tensor_stream/evaluator/kernels/square.cl +15 -0
  35. data/lib/tensor_stream/evaluator/kernels/sub.cl +5 -0
  36. data/lib/tensor_stream/evaluator/kernels/tan.cl +7 -0
  37. data/lib/tensor_stream/evaluator/kernels/tanh.cl +7 -0
  38. data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +6 -0
  39. data/lib/tensor_stream/evaluator/kernels/where.cl +15 -0
  40. data/lib/tensor_stream/evaluator/opencl_buffer.rb +30 -0
  41. data/lib/tensor_stream/evaluator/opencl_evaluator.rb +1095 -0
  42. data/lib/tensor_stream/evaluator/opencl_template_helper.rb +58 -0
  43. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +27 -0
  44. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -31
  45. data/lib/tensor_stream/graph.rb +4 -2
  46. data/lib/tensor_stream/math_gradients.rb +3 -0
  47. data/lib/tensor_stream/operation.rb +29 -2
  48. data/lib/tensor_stream/ops.rb +14 -2
  49. data/lib/tensor_stream/placeholder.rb +1 -1
  50. data/lib/tensor_stream/session.rb +10 -3
  51. data/lib/tensor_stream/tensor_shape.rb +1 -1
  52. data/lib/tensor_stream/train/saver.rb +1 -1
  53. data/lib/tensor_stream/variable.rb +7 -1
  54. data/lib/tensor_stream/version.rb +1 -1
  55. data/samples/logistic_regression.rb +2 -1
  56. data/samples/nearest_neighbor.rb +54 -0
  57. data/tensor_stream.gemspec +3 -1
  58. metadata +107 -28
data/lib/tensor_stream/evaluator/kernels/sign.cl
@@ -0,0 +1,23 @@
+__kernel void sign_fp(const int M, const int N, __global const float *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    float value = A[globalRow * N + globalCol];
+    if (isnan(value) || value == 0.0f) {
+      C[globalRow * N + globalCol] = 0.0f;
+    } else {
+      C[globalRow * N + globalCol] = value < 0 ? -1.0f : 1.0f;
+    }
+}
+
+__kernel void sign_int(const int M, const int N, __global const int *A, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    int value = A[globalRow * N + globalCol];
+    if (value == 0) {
+      C[globalRow * N + globalCol] = 0;
+    } else {
+      C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
+    }
+}
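
For orientation, not part of the release diff: a minimal sketch of driving an element-wise kernel such as sign_fp from Ruby through opencl_ruby_ffi, mirroring what execute_func in opencl_evaluator.rb further down does. The device choice, kernel path, and sizes here are illustrative.

require 'opencl_ruby_ffi'
require 'narray_ffi'

device  = OpenCL.platforms.first.devices.first
context = OpenCL.create_context(device)
queue   = context.create_command_queue(device)

# assumes sign.cl is in the working directory
program = context.create_program_with_source(File.read('sign.cl'))
program.build

m = n = 2
input  = NArray.sfloat(m * n).indgen! - 1.0 # [-1.0, 0.0, 1.0, 2.0]
output = NArray.sfloat(m * n)
a_buf  = context.create_buffer(input.size * input.element_size)
c_buf  = context.create_buffer(output.size * output.element_size)

queue.enqueue_write_buffer(a_buf, input)
# opencl_ruby_ffi exposes each __kernel as a method on the built program
program.sign_fp(queue, [m, n], OpenCL::Int1.new(m), OpenCL::Int1.new(n), a_buf, c_buf)
queue.enqueue_read_buffer(c_buf, output)
queue.finish
p output.to_a # => [-1.0, 0.0, 1.0, 1.0]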
data/lib/tensor_stream/evaluator/kernels/sin.cl
@@ -0,0 +1,8 @@
+
+__kernel void sin_fp(const int M, const int N, __global const float *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = sin(A[globalRow * N + globalCol]);
+}
data/lib/tensor_stream/evaluator/kernels/sqrt.cl
@@ -0,0 +1,8 @@
+
+__kernel void sqrt_fp(const int M, const int N, __global const float *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = sqrt(A[globalRow * N + globalCol]);
+}
data/lib/tensor_stream/evaluator/kernels/square.cl
@@ -0,0 +1,15 @@
+__kernel void square_fp(const int M, const int N, __global const float *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
+}
+
+__kernel void square_int(const int M, const int N, __global const int *A, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
+}
data/lib/tensor_stream/evaluator/kernels/sub.cl.erb
@@ -0,0 +1,5 @@
+% %w[fp int].product(%w[sub]).each do |dtype, fname|
+%   c_dtype = dtype_to_c_type(dtype)
+%   op = operator_to_c(fname)
+<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: c_dtype %>
+% end
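
The leading % lines are ERB's percent-line mode: each such line executes as Ruby, so the fp/int product renders the shared _operand.cl partial once per element type. dtype_to_c_type, operator_to_c, and render come from opencl_template_helper.rb (listed above, contents not shown here), so their exact output is not reproduced. A self-contained demo of just the templating mechanism, presumably how OpenclTemplateHelper#generate processes these files:

require 'erb'

# '%' line mode: lines starting with % execute as Ruby (ERB trim mode '%')
template = <<~TMPL
  % %w[fp int].each do |dtype|
  // kernel variant for <%= dtype %>
  % end
TMPL

puts ERB.new(template, trim_mode: '%').result(binding)
# => // kernel variant for fp
#    // kernel variant for int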
data/lib/tensor_stream/evaluator/kernels/tan.cl
@@ -0,0 +1,7 @@
+__kernel void tan_fp(const int M, const int N, __global const float *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = tan(A[globalRow * N + globalCol]);
+}
data/lib/tensor_stream/evaluator/kernels/tanh.cl
@@ -0,0 +1,7 @@
+__kernel void tanh_fp(const int M, const int N, __global const float *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = tanh(A[globalRow * N + globalCol]);
+}
data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl
@@ -0,0 +1,6 @@
+__kernel void tanh_grad_fp(const int M, const int N, __global const float *A, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+    C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
+}
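
tanh_grad_fp evaluates the analytic derivative of tanh, 1 - tanh(x)^2, presumably consumed by the gradient code (math_gradients.rb also changes in this release). A quick host-side check of the identity:

x = 0.5
h = 1e-6
analytic = 1 - Math.tanh(x)**2
numeric  = (Math.tanh(x + h) - Math.tanh(x - h)) / (2 * h) # central difference
puts (analytic - numeric).abs < 1e-8 # => true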
data/lib/tensor_stream/evaluator/kernels/where.cl
@@ -0,0 +1,15 @@
+__kernel void where_fp(const int M, const int N, __global const int *PRED, __global const float *A, __global const float *B, __global float *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+}
+
+__kernel void where_int(const int M, const int N, __global const int *PRED, __global const int *A, __global const int *B, __global int *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+}
data/lib/tensor_stream/evaluator/opencl_buffer.rb
@@ -0,0 +1,30 @@
+module TensorStream
+  class OpenCLBuffer < Buffer
+    include ArrayOpsHelper
+
+    attr_accessor :data_type, :shape, :buffer, :cl_buffer, :op
+
+    def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
+      @data_type = data_type
+      @shape = shape
+      @buffer = buffer
+      @cl_buffer = cl_buffer
+      @name = name
+      @op = op
+    end
+
+    def to_ruby
+      return [] if buffer.empty?
+      if shape.empty?
+        return buffer[0] != 0 if data_type == :boolean
+        return buffer[0]
+      end
+
+      result = buffer.reshape(*shape.reverse).to_a
+      if data_type == :boolean
+        result = process_function_op(result, ->(a, _b) { a != 0 })
+      end
+      result
+    end
+  end
+end
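
A short sketch of what OpenCLBuffer#to_ruby does with the host-side NArray: the flat buffer is reshaped through the reversed shape (NArray's first dimension varies fastest) and then converted to nested Ruby arrays. The requires and the nil cl_buffer are illustrative; in the evaluator the device buffer is always real.

require 'narray'
require 'tensor_stream' # assumed to set up Buffer and ArrayOpsHelper
require 'tensor_stream/evaluator/opencl_buffer'

flat = NArray.sfloat(6).indgen! # [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
buf = TensorStream::OpenCLBuffer.new(
  data_type: :float32, shape: [2, 3],
  buffer: flat, cl_buffer: nil # device buffer elided for illustration
)
p buf.to_ruby # => [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]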
data/lib/tensor_stream/evaluator/opencl_evaluator.rb
@@ -0,0 +1,1095 @@
+require 'tensor_stream/evaluator/operation_helpers/random_gaussian'
+require 'tensor_stream/evaluator/operation_helpers/array_ops_helper'
+require 'tensor_stream/evaluator/operation_helpers/math_helper'
+require 'tensor_stream/evaluator/opencl_buffer'
+require 'tensor_stream/evaluator/opencl_template_helper'
+require 'distribution'
+require 'opencl_ruby_ffi'
+require 'narray_ffi'
+
+module TensorStream
+  module Evaluator
+    class FullEvalNotPossible < RuntimeError
+    end
+
+    # Errors during graph evaluation
+    class EvaluatorExcecutionException < RuntimeError
+      attr_reader :tensor
+
+      def initialize(exception, tensor)
+        @exception = exception
+        @tensor = tensor
+      end
+
+      def wrapped_exception
+        @exception
+      end
+    end
+
+    ## OpenCL evaluator - runs graphs on OpenCL-capable devices (GPU/CPU)
+    class OpenclEvaluator
+      attr_accessor :retain
+
+      include TensorStream::OpHelper
+      include TensorStream::ArrayOpsHelper
+      include TensorStream::MathHelper
+
+      def initialize(session, context, thread_pool: nil, log_intermediates: false, preferred_device: nil)
+        @session = session
+        @context = context
+        @log_intermediates = log_intermediates
+        @preferred_device = preferred_device
+        @retain = context[:retain] || []
+        @thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
+
+        @context[:compute_history] = [] if log_intermediates
+      end
+
+      # opencl evaluator main entrypoint
+      def run(tensor, execution_context)
+        _create_opencl_context
+        # _prepare_kernels
+
+        read_final_result(complete_eval(tensor, execution_context))
+      end
+
+      def complete_eval(tensor, context)
+        create_command_queue
+        buffer = _run(tensor, context)
+        if buffer.is_a?(Array)
+          buffer = buffer.collect do |b|
+            next b if b.buffer.size.zero?
+            _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: [b.op].compact)
+            b
+          end
+        else
+          return buffer if buffer.nil? || buffer.buffer.size.zero?
+          _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
+        end
+
+        _opencl_queue.finish
+        buffer
+      end
+
+      def opencl_device
+        @context[:_cache][:_opencl_device]
+      end
+
+      protected
+
+      # read result from opencl and convert to ruby
+      def read_final_result(buffer)
+        return buffer.map { |b| read_final_result(b) } if buffer.is_a?(Array)
+        return nil if buffer.nil?
+
+        buffer.to_ruby
+      end
+
+      def _create_opencl_context
+        @context[:_cache][:_opencl_device] ||= begin
+          if @preferred_device
+            @preferred_device
+          else
+            device, _score, _platform, _index = choose_best_device
+            device
+          end
+        end
+        @context[:_cache][:_opencl_context] ||= OpenCL.create_context(opencl_device)
+      end
+
+      def choose_best_device
+        @best_device ||= begin
+          devices = OpenCL.platforms.flat_map do |p|
+            p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
+              score = 0
+              if d.type.to_s == 'CPU'
+                score += 1
+              elsif d.type.to_s == 'GPU'
+                score += 4
+              end
+
+              score += d.max_compute_units
+
+              [d, score, p.name, index]
+            end
+          end
+          devices.max_by { |candidate| candidate[1] }
+        end
+      end
+
+      def create_command_queue
+        supported_properties = opencl_device.queue_properties.names
+        properties = []
+        properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_properties.include?('PROFILING_ENABLE')
+        properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_properties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
+        @context[:_cache][:_opencl_queue] ||= _opencl_context.create_command_queue(opencl_device, properties: properties)
+      end
+
+      def _opencl_context
+        @context[:_cache][:_opencl_context]
+      end
+
+      def _opencl_queue
+        @context[:_cache][:_opencl_queue]
+      end
+
+      def cl_template_path(kernel, extension)
+        File.join(File.dirname(__FILE__), 'kernels', "#{kernel}.#{extension}")
+      end
+
+      def _cl_program(kernel)
+        @context[:_cache]["_opencl_kernel_#{kernel}"] ||= begin
+          filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+          source = File.read(filename)
+          source = OpenclTemplateHelper.new(source).generate
+          program = _opencl_context.create_program_with_source(source)
+          program.build
+        rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
+          puts "OpenCL Compile error: #{program.build_log}"
+          raise e
+        end
+      end
+
+      def _run(tensor, execution_context)
+        return tensor if tensor.is_a?(OpenCLBuffer)
+        return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array)
+
+        return tensor if retain.include?(tensor) # if var is in retain don't eval to value
+
+        tensor = tensor.call if tensor.is_a?(Proc)
+
+        child_context = execution_context.dup
+        res = if tensor.is_a?(Operation)
+                eval_operation(tensor, child_context)
+              elsif tensor.is_a?(Variable)
+                eval_variable(tensor, child_context)
+              elsif tensor.is_a?(Placeholder)
+                resolve_placeholder(tensor, child_context)
+              else
+                eval_tensor(tensor, child_context)
+              end
+        execution_context.deep_merge!(returns: child_context[:returns])
+        res
+      end
+
+      def eval_variable(tensor, child_context)
+        raise "variable #{tensor.name} not initialized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
+        tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
+        tensor.buffer
+      end
+
+      def eval_operation(tensor, child_context)
+        return @context[tensor.name] if @context.key?(tensor.name)
+
+        a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
+        b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]
+
+        case tensor.operation
+        when :concat
+          input_a = read_final_result(complete_eval(a, child_context))
+          arr = concat_array(input_a, tensor.options[:axis])
+          convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
+        when :cond
+          pred = complete_eval(tensor.options[:pred], child_context)
+          a = _run(a, child_context)
+          b = _run(b, child_context)
+
+          if all_true?(pred.buffer)
+            a
+          else
+            b
+          end
+        when :identity
+          _run(a, child_context)
+        when :eye
+          rows = complete_eval(a, child_context)
+          columns = complete_eval(b, child_context)
+          shape = [rows.buffer[0], columns.buffer[0]]
+          eye_arr = Array.new(rows.buffer[0]) do |i|
+            Array.new(columns.buffer[0]) do |col|
+              if fp_type?(tensor.data_type)
+                i == col ? 1.0 : 0.0
+              else
+                i == col ? 1 : 0
+              end
+            end
+          end
+
+          convert_to_opencl(eye_arr.flatten, shape, data_type: tensor.data_type, name: tensor.name)
+        when :pad
+          a = read_final_result(complete_eval(a, child_context))
+          p = read_final_result(complete_eval(tensor.options[:paddings], child_context))
+
+          padding = arr_pad(a, p, tensor.data_type)
+          convert_to_opencl(padding.flatten, shape_eval(padding), data_type: tensor.data_type, name: tensor.name)
+        when :tile
+          input = read_final_result(complete_eval(a, child_context))
+          multiples = read_final_result(complete_eval(b, child_context))
+
+          rank = get_rank(input)
+          raise '1D or higher tensor required' if rank.zero?
+          raise "invalid multiple size passed #{rank} != #{multiples.size}" if rank != multiples.size
+
+          tile = tile_arr(input, 0, multiples)
+          arr = tile.nil? ? [] : tile
+          convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
+        when :assign
+          assign_var(tensor, b, child_context)
+        when :assign_add
+          a = _run(a, child_context)
+          b = _run(b, child_context)
+
+          value = execute_2_operand_func('add', tensor, a, b, child_context)
+          assign_var(tensor, value, child_context)
+        when :assign_sub
+          a = _run(a, child_context)
+          b = _run(b, child_context)
+
+          value = execute_2_operand_func('sub', tensor, a, b, child_context)
+          assign_var(tensor, value, child_context)
+        when :less
+          execute_2_operand_func('less', tensor, a, b, child_context, 'cond')
+        when :less_equal
+          execute_2_operand_func('less_equal', tensor, a, b, child_context, 'cond')
+        when :greater
+          execute_2_operand_func('greater', tensor, a, b, child_context, 'cond')
+        when :greater_equal
+          execute_2_operand_func('greater_equal', tensor, a, b, child_context, 'cond')
+        when :equal
+          execute_2_operand_func('equal', tensor, a, b, child_context, 'cond')
+        when :not_equal
+          execute_2_operand_func('not_equal', tensor, a, b, child_context, 'cond')
+        when :logical_and
+          execute_2_operand_func('logical_and', tensor, a, b, child_context, 'cond')
+        when :where
+          pred = tensor.options[:pred]
+          execute_cond_func('where', tensor, pred, a, b, child_context)
+        when :max
+          execute_2_operand_func('max', tensor, a, b, child_context)
+        when :add
+          execute_2_operand_func('add', tensor, a, b, child_context)
+        when :div
+          execute_2_operand_func('div', tensor, a, b, child_context)
+        when :sub
+          execute_2_operand_func('sub', tensor, a, b, child_context)
+        when :matmul
+          a = _run(a, child_context)
+          b = _run(b, child_context)
+
+          m = a.shape[0]
+          n = b.shape[1]
+          v = b.shape[0]
+          k = a.shape[1]
+
+          m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
+          n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]
+
+          result_shape = [m, n]
+
+          raise "#{tensor.items[0].name} rank must be greater than 1" if a.shape.size < 2
+          raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
+          raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
+
+          dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+          a, b = type_cast(a, b)
+          output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
+
+          cl_m = OpenCL::Int1.new(m)
+          cl_n = OpenCL::Int1.new(n)
+          cl_k = OpenCL::Int1.new(k)
+
+          transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
+          transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
+
+          output_buffer.op = _cl_program('gemm').send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
+          output_buffer
+        when :mul
+          execute_2_operand_func('mul', tensor, a, b, child_context)
+        when :pow
+          execute_2_operand_func('pow', tensor, a, b, child_context)
+        when :cast
+          a = _run(a, child_context)
+          if a.data_type != tensor.data_type
+            buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+            s_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type) ? 'fp' : 'int'
+            t_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+            m, n = a.shape
+            cl_m = OpenCL::Int1.new(m || 1)
+            cl_n = OpenCL::Int1.new(n || 1)
+            work_group = [m || 1, n || 1]
+
+            buffer.op = _cl_program('cast').send(:"cast_#{s_dtype}_#{t_dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
+            buffer
+          else
+            a
+          end
+        when :sign
+          execute_func('sign', tensor, a, child_context)
+        when :exp
+          execute_func('exp', tensor, a, child_context)
+        when :log
+          execute_func('log', tensor, a, child_context)
+        when :sin
+          execute_func('sin', tensor, a, child_context)
+        when :tan
+          execute_func('tan', tensor, a, child_context)
+        when :cos
+          execute_func('cos', tensor, a, child_context)
+        when :abs
+          execute_func('abs', tensor, a, child_context)
+        when :sqrt
+          execute_func('sqrt', tensor, a, child_context)
+        when :negate
+          execute_func('negate', tensor, a, child_context)
+        when :square
+          execute_func('square', tensor, a, child_context)
+        when :reciprocal
+          execute_func('reciprocal', tensor, a, child_context)
+        when :tanh
+          execute_func('tanh', tensor, a, child_context)
+        when :tanh_grad
+          execute_func('tanh_grad', tensor, a, child_context)
+        when :sigmoid
+          execute_func('sigmoid', tensor, a, child_context)
+        when :log1p
+          execute_func('log1p', tensor, a, child_context)
+        when :round
+          execute_func('round', tensor, a, child_context)
+        when :sigmoid_grad
+          execute_2_operand_func('sigmoid_grad', tensor, a, b, child_context)
+        when :truncate
+          a = _run(a, child_context)
+          b = _run(b, child_context)
+
+          if a.shape.size.zero?
+            a
+          else
+            input_b = read_final_result(b)
+            if a.shape == input_b
+              a
+            else
+              input_a = read_final_result(a)
+              if input_b == []
+                if a.buffer.size == 1
+                  a.shape = input_b
+                  a
+                else
+                  wrap_opencl(a.buffer[0], data_type: a.data_type, name: tensor.name)
+                end
+              else
+                wrap_opencl(truncate(input_a, input_b), data_type: a.data_type, name: tensor.name)
+              end
+            end
+          end
+        when :zeros, :ones, :zeros_like, :ones_like
+          shape = if %i[zeros_like ones_like].include?(tensor.operation)
+                    _run(a, child_context).shape
+                  else
+                    read_final_result(complete_eval(a, child_context)) || tensor.shape.shape
+                  end
+
+          func = if %i[zeros zeros_like].include?(tensor.operation)
+                   -> { tensor.data_type == :int32 ? 0 : 0.0 }
+                 else
+                   -> { tensor.data_type == :int32 ? 1 : 1.0 }
+                 end
+
+          size = shape.empty? ? 1 : shape.reduce(:*)
+
+          buffer = if TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type)
+                     NArray.sfloat(size)
+                   elsif TensorStream::Ops::INTEGER_TYPES.include?(tensor.data_type)
+                     NArray.int(size)
+                   else
+                     raise "unsupported type #{tensor.data_type}"
+                   end
+
+          data = if !shape.empty?
+                   Array.new(size) { func.call }
+                 else
+                   func.call
+                 end
+
+          convert_to_opencl(data, shape, data_type: tensor.data_type, name: tensor.name)
+        when :broadcast_transform
+          a = _run(a, child_context)
+          b = _run(b, child_context)
+
+          if a.shape == b.shape
+            [a, b]
+          else
+            input_a = read_final_result(complete_eval(a, child_context))
+            input_b = read_final_result(complete_eval(b, child_context))
+            b_a, b_b = broadcast(input_a, input_b)
+            [wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
+             wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
+          end
+        when :print
+          a = _run(a, child_context)
+          b = _run(b, child_context)
+          input_b = complete_eval(b, child_context)
+          input_b = read_final_result(input_b)
+          puts "#{tensor.options.fetch(:message, '')} #{input_b}"
+          a
+        when :rank
+          a = _run(a, child_context)
+          wrap_opencl(a.shape.size, data_type: tensor.data_type, name: tensor.name)
+        when :stop_gradient
+          _run(a, child_context)
+        when :slice
+          input_a = complete_eval(a, child_context)
+          input_b = read_final_result(complete_eval(b, child_context))
+          size = tensor.options[:size]
+
+          slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse
+
+          new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
+          sliced = new_buf.slice[*slice_param]
+          convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: a.data_type, name: tensor.name)
+        when :transpose
+          input_a = complete_eval(a, child_context)
+          t_param = Array.new(input_a.shape.size) { |index| index }.reverse
+          transposed = input_a.buffer.reshape(*input_a.shape.reverse).transpose(*t_param)
+          convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: a.data_type, name: tensor.name)
+        when :index
+          a = complete_eval(a, child_context)
+          input_a = read_final_result(a)
+          index = read_final_result(complete_eval(b, child_context))
+
+          if a.is_a?(Array)
+            a[index]
+          else
+            new_shape = a.shape.dup
+            new_shape.shift
+            convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
+          end
+        when :broadcast_gradient_args
+          a = complete_eval(a, child_context)
+          b = complete_eval(b, child_context)
+
+          wrap_opencl(get_broadcast_gradient_args(a.buffer.to_a, b.buffer.to_a), data_type: a.data_type, name: tensor.name)
+        when :shape
+          a = _run(a, child_context)
+
+          wrap_opencl(a.shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
+        when :reshape
+          arr = complete_eval(a, child_context)
+          new_shape = read_final_result(complete_eval(b, child_context))
+
+          if new_shape.size.zero? && arr.buffer.size == 1
+            arr.shape = new_shape
+            arr
+          else
+            new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
+            arr.shape = new_shape
+            arr
+          end
+        when :random_uniform
+          maxval = tensor.options.fetch(:maxval, 1)
+          minval = tensor.options.fetch(:minval, 0)
+          seed = tensor.options[:seed]
+
+          random = _get_randomizer(tensor, seed)
+          generator = -> { random.rand * (maxval - minval) + minval }
+          shape = tensor.options[:shape] || tensor.shape.shape
+
+          convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
+        when :random_normal
+          seed = tensor.options[:seed]
+          random = _get_randomizer(tensor, seed)
+          r = RandomGaussian.new(tensor.options.fetch(:mean), tensor.options.fetch(:stddev), -> { random.rand })
+          generator = -> { r.rand }
+          shape = tensor.options[:shape] || tensor.shape.shape
+
+          convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
+        when :glorot_uniform
+          seed = tensor.options[:seed]
+          random = _get_randomizer(tensor, seed)
+
+          shape = tensor.options[:shape] || tensor.shape.shape
+          fan_in, fan_out = if shape.size.zero?
+                              [1, 1]
+                            elsif shape.size == 1
+                              [1, shape[0]]
+                            else
+                              [shape[0], shape.last]
+                            end
+
+          limit = Math.sqrt(6.0 / (fan_in + fan_out))
+
+          minval = -limit
+          maxval = limit
+
+          generator = -> { random.rand * (maxval - minval) + minval }
+          convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
+        when :flow_group
+          tensor.items.collect { |item| _run(item, child_context) }
+        when :sum
+          reduction(child_context, tensor, a, b, :sum)
+        when :mean
+          reduction(child_context, tensor, a, b, :mean)
+        when :prod
+          input_a = complete_eval(a, child_context)
+          if input_a.buffer.empty?
+            convert_to_opencl([1.0], [], data_type: a.data_type, name: tensor.name)
+          else
+            reduction(child_context, tensor, a, b, :prod)
+          end
+        when :argmin
+          a = complete_eval(a, child_context)
+          axis = tensor.options[:axis] || 0
+          arr = a.buffer.reshape(*a.shape.reverse).to_a
+          op = get_op_with_axis(arr, axis, 0, a.data_type, ->(t, u) { t < u })
+          convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+        when :argmax
+          a = complete_eval(a, child_context)
+          axis = tensor.options[:axis] || 0
+          arr = a.buffer.reshape(*a.shape.reverse).to_a
+          op = get_op_with_axis(arr, axis, 0, a.data_type, ->(t, u) { t > u })
+          convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+        else
+          raise "unknown op #{tensor.operation}"
+        end.tap do |result|
+          if tensor.breakpoint
+            a = read_final_result(complete_eval(a, child_context))
+            b = read_final_result(complete_eval(b, child_context))
+            result = read_final_result(complete_eval(result, child_context))
+
+            tensor.breakpoint.call(tensor, a, b, result)
+          end
+          if @log_intermediates
+            @context[:compute_history] << {
+              name: tensor.name,
+              type: tensor.data_type,
+              shape: shape_eval(result),
+              source: tensor.source,
+              description: tensor.to_math(true, 1),
+              value: result
+            }
+          end
+          @context[tensor.name] = result
+        end
+      rescue EvaluatorExcecutionException => e
+        raise e
+      rescue StandardError => e
+        puts e.message
+        puts e.backtrace.join("\n")
+
+        # shape_a = a.shape.shape if a
+        # shape_b = b.shape.shape if b
+        # dtype_a = a.data_type if a
+        # dtype_b = b.data_type if b
+        # a = complete_eval(a, child_context)
+        # b = complete_eval(b, child_context)
+        # puts "name: #{tensor.given_name}"
+        # # puts "op: #{tensor.to_math(true, 1)}"
+        # puts "A #{shape_a} #{dtype_a}: #{a}" if a
+        # puts "B #{shape_b} #{dtype_b}: #{b}" if b
+        # dump_intermediates if @log_intermediates
+        # File.write('/home/jedld/workspace/tensor_stream/samples/error.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
+
+        # File.write('/Users/josephemmanueldayo/workspace/gradients.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
+        raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
+      end
+
+      def eval_tensor(tensor, child_context)
+        return tensor unless tensor.is_a?(Tensor)
+
+        cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
+        return @context[cache_key] if @context.key?(cache_key)
+        return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]
+        @context[cache_key] = if tensor.value.is_a?(Tensor)
+                                _run(tensor.value, child_context)
+                              else
+                                wrap_opencl(tensor, name: tensor.name)
+                              end
+        @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
+        @context[cache_key]
+      end
+
+      private
+
+      def assign_var(tensor, b, child_context)
+        assign = tensor.items[0] || tensor
+        buffer = complete_eval(b, child_context)
+        if assign.buffer
+          assign.buffer.op = _opencl_queue.enqueue_write_buffer(assign.buffer.cl_buffer, buffer.buffer)
+        else
+          assign.buffer = convert_to_opencl(read_final_result(buffer), buffer.shape, data_type: tensor.data_type, name: tensor.name)
+        end
+        assign.buffer.dirty = true
+        assign.buffer
+      end
+
+      def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
+        a = _run(input_a, child_context)
+        b = _run(input_b, child_context)
+        a, b = type_cast(a, b)
+        dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+        result_shape = TensorShape.infer_shape(a.shape, b.shape)
+
+        output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
+        a, b, prog, switch_operands = select_program(a, b, op_name)
+        m, n = result_shape
+        work_group = [m || 1, n || 1]
+        cl_m = OpenCL::Int1.new(m || 1)
+        cl_n = OpenCL::Int1.new(n || 1)
+        cl_switch = OpenCL::Int1.new(switch_operands) # tells the kernel whether operands were swapped (matters for non-commutative ops)
+
+        event_wait_list = [a.op, b.op].compact # add dependency wait list
+
+        event = if prog == "#{op_name}_b"
+                  cl_m_b, cl_n_b = if b.shape.size == 2
+                                     [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
+                                   elsif b.shape.size == 1
+                                     [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
+                                   else
+                                     raise "rank > 2 not supported!"
+                                   end
+                  _cl_program(prog_name || op_name).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+                else
+                  _cl_program(prog_name || op_name).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+                end
+
+        output_buffer.op = event
+        output_buffer
+      end
+
+      def execute_cond_func(op_name, tensor, pred, input_a, input_b, child_context)
+        p = _run(pred, child_context)
+        a = _run(input_a, child_context)
+        b = _run(input_b, child_context)
+
+        a, b = type_cast(a, b)
+        dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+
+        output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)
+
+        m, n = p.shape
+        work_group = [m || 1, n || 1]
+        cl_m = OpenCL::Int1.new(m || 1)
+        cl_n = OpenCL::Int1.new(n || 1)
+
+        event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
+        output_buffer.op = _cl_program(op_name).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        output_buffer
+      end
+
+      def execute_func(op_name, tensor, a, child_context)
+        a = _run(a, child_context)
+        event_wait_list = [a.op].compact
+        dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+        m, n = a.shape
+        work_group = [m || 1, n || 1]
+        cl_m = OpenCL::Int1.new(m || 1)
+        cl_n = OpenCL::Int1.new(n || 1)
+
+        event = _cl_program(op_name).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        output_buffer.op = event
+        output_buffer
+      end
+
+      def type_cast(a, b)
+        return [a, b] if a.data_type == b.data_type
+        m, n = b.shape
+        work_group = [m || 1, n || 1]
+        buffer = buffer_for(b.shape, a.data_type)
+        if TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type.to_sym)
+          if TensorStream::Ops::INTEGER_TYPES.include?(b.data_type.to_sym)
+            cl_m = OpenCL::Int1.new(m || 1)
+            cl_n = OpenCL::Int1.new(n || 1)
+
+            _cl_program('cast').cast_int_fp(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
+            return [a, buffer]
+          end
+        elsif TensorStream::Ops::INTEGER_TYPES.include?(a.data_type.to_sym)
+          if TensorStream::Ops::FLOATING_POINT_TYPES.include?(b.data_type.to_sym)
+            cl_m = OpenCL::Int1.new(m || 1)
+            cl_n = OpenCL::Int1.new(n || 1)
+            _cl_program('cast').cast_fp_int(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
+            return [a, buffer]
+          end
+        end
+
+        [a, b]
+      end
+
+      def buffer_for(shape, data_type)
+        size = shape.empty? ? 1 : shape.reduce(:*)
+
+        buffer = allocate_narray_for_type(data_type, size)
+
+        cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
+        OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+      end
+
+      def wrap_opencl(tensor, data_type: nil, name: nil)
+        value, shape = if tensor.is_a?(Tensor)
+                         [tensor.value, tensor.shape.shape]
+                       else
+                         [tensor, shape_eval(tensor)]
+                       end
+
+        convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
+      end
+
+      def convert_to_opencl(value, shape, data_type: nil, name: nil)
+        value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)
+
+        cache_key = "_cl_object_#{name}_#{shape.join('_')}"
+        cl_object = if name && @context[:_cache][cache_key]
+                      @context[:_cache][cache_key]
+                    else
+                      narray_size = shape.reduce(:*) || 1
+
+                      buffer = if value.is_a?(NArray)
+                                 value
+                               else
+                                 allocate_narray_for_type(data_type, narray_size)
+                               end
+
+                      cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
+
+                      cl_buffer = if !value.flatten.empty?
+                                    cl_buffer_size = 1 if cl_buffer_size.zero?
+                                    _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
+                                  end
+
+                      @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+                    end
+
+        if value.is_a?(Array)
+          value.flatten.each_with_index do |element, index|
+            cl_object.buffer[index] = if element.is_a?(Tensor)
+                                        read_final_result(complete_eval(element, {}))
+                                      else
+                                        Tensor.cast_dtype(element, data_type)
+                                      end
+          end
+        elsif value.is_a?(NArray)
+          cl_object.buffer = value
+        else
+          cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
+        end
+
+        write_op = if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
+                     _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
+                   end
+        cl_object.op = write_op
+        cl_object
+      end
+
+      def allocate_narray_for_type(data_type, narray_size)
+        if TensorStream::Ops::FLOATING_POINT_TYPES.include?(data_type.to_sym)
+          NArray.sfloat(narray_size)
+        elsif TensorStream::Ops::INTEGER_TYPES.include?(data_type.to_sym) || data_type.to_sym == :boolean
+          NArray.int(narray_size)
+        else
+          raise "unsupported type #{data_type}"
+        end
+      end
+
+      def _create_result_buffer(data_type, shape, name)
+        @context[:_cache]["_result_#{name}_#{shape.join('_')}"] ||= begin
+          size = shape.empty? ? 1 : shape.reduce(:*)
+          buffer = allocate_narray_for_type(data_type, size)
+          cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
+          OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+        end
+      end
+
+      def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
+        if target_axis == current_axis
+          if a[0].is_a?(Array)
+            (0...a[0].size).collect do |column_index|
+              max = nil
+              max_index = 0
+              a.each_with_index do |row, row_index|
+                if max.nil? || op.call(row[column_index], max)
+                  max = row[column_index]
+                  max_index = row_index
+                end
+              end
+
+              Tensor.cast_dtype(max_index, output_type)
+            end
+          else
+            max = nil
+            max_index = 0
+            a.each_with_index do |x, index|
+              if max.nil? || op.call(x, max)
+                max = x
+                max_index = index
+              end
+            end
+            Tensor.cast_dtype(max_index, output_type)
+          end
+        else
+          a.collect do |row|
+            get_op_with_axis(row, target_axis, current_axis + 1, output_type, op)
+          end
+        end
+      end
+
+      def reduction(child_context, tensor, a, b, func)
+        input = complete_eval(a, child_context)
+        axis = read_final_result(complete_eval(b, child_context))
+        if axis.nil?
+          convert_to_opencl(input.buffer.send(func), [], data_type: tensor.data_type, name: tensor.name)
+        else
+          return input if input.shape.empty?
+          value = input.buffer.reshape(*input.shape.reverse)
+          rank = input.shape.size - 1
+
+          if axis.is_a?(Array)
+            axis.map { |x| rank - x.abs }.sort.reverse.each do |x|
+              value = value.send(func, x)
+            end
+          else
+            value = value.send(func, rank - axis.abs)
+          end
+
+          new_shape = if value.is_a?(NArray)
+                        value.shape.reverse
+                      else
+                        value = [value]
+                        []
+                      end
+
+          new_shape = reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
+
+          convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
+        end
+      end
+
+      def arr_pad(arr, paddings, data_type = :float32, rank = 0)
+        raise "padding #{paddings[rank]} needs to have two elements [before, after]" if paddings[rank].size != 2
+
+        before = paddings[rank][0]
+        after = paddings[rank][1]
+        pad_value = fp_type?(data_type) ? 0.0 : 0
+        if arr[0].is_a?(Array)
+          next_dim_elem = arr.collect { |a| arr_pad(a, paddings, data_type, rank + 1) }
+          padding = deep_dup_array(next_dim_elem[0], pad_value)
+          Array.new(before) { padding } + next_dim_elem + Array.new(after) { padding }
+        else
+          Array.new(before) { pad_value } + arr + Array.new(after) { pad_value }
+        end
+      end
+
+      def deep_dup_array(arr, value = nil)
+        if arr.is_a?(Array)
+          arr.dup.collect do |a|
+            deep_dup_array(a, value)
+          end
+        else
+          value.nil? ? arr : value
+        end
+      end
+
+      def matmul_const_transform(mat, mat_b, tensor)
+        if !mat.is_a?(Array)
+          compat_shape = shape_eval(mat_b).reverse
+          func = -> { tensor.data_type == :int32 ? mat.to_i : mat.to_f }
+
+          generate_vector(compat_shape, generator: func)
+        else
+          mat
+        end
+      end
+
+      # determine possible reduction axis to be used
+      def _broadcast_gradient_op(vector_shape1, vector_shape2, level)
+        va_rank = _rank_from_shape(vector_shape1)
+        vb_rank = _rank_from_shape(vector_shape2)
+        return [] if vector_shape1 == vector_shape2 # same shape so no reductions
+
+        shape2_r = vector_shape2.reverse
+
+        vector_shape1.reverse.each_with_index.collect do |s, index|
+          next va_rank - index - 1 if index >= shape2_r.size
+          next nil if shape2_r[index] == s
+          next nil if shape2_r[index] > s
+          va_rank - index - 1
+        end.compact
+      end
+
+      # selects variants of cl programs depending on input
+      def select_program(input_a, input_b, op)
+        return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape
+
+        return [input_b, input_a, "#{op}_c", 1] if input_a.shape.empty? || input_a.shape.reduce(:*) == 1 # A is scalar?
+        return [input_a, input_b, "#{op}_c", 0] if input_b.shape.empty? || input_b.shape.reduce(:*) == 1 # B is scalar?
+
+        return [input_b, input_a, "#{op}_b", 1] if input_a.shape.size < input_b.shape.size
+
+        if input_a.shape.size == input_b.shape.size
+          input_a.shape.zip(input_b.shape).each do |s1, s2|
+            return [input_b, input_a, "#{op}_b", 1] if s1 < s2
+          end
+        end
+
+        [input_a, input_b, "#{op}_b", 0]
+      end
+
+      def _rank_from_shape(shape)
+        shape.is_a?(Array) ? shape.size : 0
+      end
+
+      def get_broadcast_gradient_args(input_a, input_b)
+        return [] if get_rank(input_b).zero? && get_rank(input_a).zero?
+        return nil if get_rank(input_b).zero?
+        # ruby scalar
+        if get_rank(input_a).zero?
+          _broadcast_gradient_op(input_b, input_a, 0)
+        elsif get_rank(input_a) > 0
+          _broadcast_gradient_op(input_a, input_b, 0)
+        end
+      end
+
+      def concat_array(values, axis)
+        combined_array = values.shift
+        axis = get_rank(combined_array) - 1 if axis == -1
+
+        values.each do |v|
+          combined_array = concat(combined_array, v, axis)
+        end
+        combined_array
+      end
+
+      def concat(a, b, axis)
+        if axis.zero?
+          a + b
+        else
+          a.each_with_index.collect do |i, index|
+            concat(i, b[index], axis - 1)
+          end
+        end
+      end
+
+      def resolve_placeholder(placeholder, _execution_context = {})
+        return nil if placeholder.nil?
+        return placeholder if retain.include?(placeholder)
+
+        var = if placeholder.is_a?(Placeholder)
+                @context[placeholder.name.to_sym].tap do |c|
+                  raise "missing placeholder #{placeholder.name}" if c.nil?
+                end
+              else
+                placeholder
+              end
+
+        return convert_to_opencl(var, shape_eval(var), data_type: placeholder.data_type, name: placeholder.name) unless var.is_a?(Tensor)
+        Tensor.cast_dtype(var, placeholder.data_type)
+      end
+
+      def reduce_axis(current_axis, axis, val, keep_dims, f = ->(a, b) { a + b })
+        return val unless val.is_a?(Array)
+
+        r = val.collect do |v|
+          reduce_axis(current_axis + 1, axis, v, keep_dims, f)
+        end
+
+        should_reduce_axis = axis.nil? || (axis.is_a?(Array) && axis.include?(current_axis)) || (current_axis == axis)
+
+        if should_reduce_axis
+          reduced_val = r[0]
+          if r.size > 1
+            reduced_val = f.call(r[0..val.size])
+          elsif r.size == 0
+            reduced_val = f.call(nil)
+          end
+          keep_dims ? [reduced_val] : reduced_val
+        else
+          r
+        end
+      end
+
+      # apply a ternary op element-wise across three tensors
+      def call_3way_vector_op(v_a, v_b, v_c, child_context, op = ->(a, b, c) { a + b + c })
+        return op.call(v_a, v_b, v_c) unless v_a.is_a?(Array)
+
+        v_a.each_with_index.collect do |v1, index|
+          v2 = v_b[index]
+          v3 = v_c[index]
+          if v1.is_a?(Array)
+            call_3way_vector_op(v1, v2, v3, child_context, op)
+          else
+            op.call(v1, v2, v3)
+          end
+        end
+      end
+
+      def all_true?(arr)
+        if arr.is_a?(Array) || arr.is_a?(NArray)
+          arr.each do |a|
+            return false unless all_true?(a)
+          end
+          return true
+        end
+
+        arr != 0
+      end
+
+      def generate_vector(shape, dtype: :float32, generator:)
+        if shape.is_a?(Integer)
+          Array.new(shape) do
+            generator.call
+          end
+        elsif shape.size > 1
+          Array.new(shape[0]) do
+            generate_vector(shape[1..shape.size], generator: generator, dtype: dtype)
+          end
+        elsif shape.size == 1
+          Array.new(shape[0]) do
+            generator.call
+          end
+        elsif shape.size.zero?
+          generator.call
+        end
+      end
+
+      def _get_randomizer(tensor, seed)
+        if tensor.graph.random_seed && seed
+          Random.new(tensor.graph.random_seed ^ seed)
+        elsif tensor.graph.random_seed
+          @session.randomizer[tensor.graph.object_id] ||= Random.new(tensor.graph.random_seed)
+          @session.randomizer[tensor.graph.object_id]
+        elsif seed
+          @session.randomizer[tensor.operation] ||= Random.new(seed)
+          @session.randomizer[tensor.operation]
+        else
+          Random.new
+        end
+      end
+
+      def dump_intermediates
+        arr = []
+        arr << "============== start ==================="
+        @context[:compute_history].each do |history|
+          arr << "------------------------------------"
+          arr << history[:name]
+          arr << "#{history[:type]} #{history[:shape]}"
+          arr << history[:source]
+          arr << history[:description]
+          arr << ""
+          arr << history[:value].to_json
+          arr << "------------------------------------"
+        end
+        arr << "============== end ====================="
+        str = arr.join("\n")
+        File.write("/tmp/intermediates.txt", str)
+      end
+    end
+  end
+end
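
Taken together with session.rb and tensor_stream.rb (both changed in this release but not shown above), this evaluator lets a graph run on an OpenCL device instead of the pure-Ruby evaluator. A hedged end-to-end sketch; the :opencl_evaluator selection argument is an assumption about the session wiring, not something visible in this diff:

require 'tensor_stream'
require 'tensor_stream/evaluator/opencl_evaluator' # pulls in opencl_ruby_ffi / narray_ffi

ts = TensorStream

a = ts.constant([[1.0, 2.0], [3.0, 4.0]])
b = ts.constant([[0.5, 0.5], [0.5, 0.5]])
c = ts.matmul(a, b)

sess = ts.session(:opencl_evaluator) # assumed evaluator selection; see session.rb
p sess.run(c) # => [[1.5, 1.5], [3.5, 3.5]]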