tensor_stream 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +2 -1
  3. data/CHANGELOG.md +5 -0
  4. data/README.md +28 -1
  5. data/benchmark/benchmark.rb +129 -0
  6. data/lib/tensor_stream.rb +7 -4
  7. data/lib/tensor_stream/evaluator/buffer.rb +10 -0
  8. data/lib/tensor_stream/evaluator/evaluator.rb +1 -0
  9. data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl +45 -0
  10. data/lib/tensor_stream/evaluator/kernels/_operand.cl +45 -0
  11. data/lib/tensor_stream/evaluator/kernels/abs.cl +16 -0
  12. data/lib/tensor_stream/evaluator/kernels/add.cl +5 -0
  13. data/lib/tensor_stream/evaluator/kernels/argmax.cl +15 -0
  14. data/lib/tensor_stream/evaluator/kernels/argmin.cl +15 -0
  15. data/lib/tensor_stream/evaluator/kernels/cast.cl +15 -0
  16. data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +5 -0
  17. data/lib/tensor_stream/evaluator/kernels/cos.cl +7 -0
  18. data/lib/tensor_stream/evaluator/kernels/div.cl.erb +5 -0
  19. data/lib/tensor_stream/evaluator/kernels/exp.cl +7 -0
  20. data/lib/tensor_stream/evaluator/kernels/gemm.cl +63 -0
  21. data/lib/tensor_stream/evaluator/kernels/log.cl +7 -0
  22. data/lib/tensor_stream/evaluator/kernels/log1p.cl +7 -0
  23. data/lib/tensor_stream/evaluator/kernels/max.cl +91 -0
  24. data/lib/tensor_stream/evaluator/kernels/mul.cl +5 -0
  25. data/lib/tensor_stream/evaluator/kernels/negate.cl +15 -0
  26. data/lib/tensor_stream/evaluator/kernels/pow.cl +130 -0
  27. data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +15 -0
  28. data/lib/tensor_stream/evaluator/kernels/round.cl +7 -0
  29. data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +8 -0
  30. data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +54 -0
  31. data/lib/tensor_stream/evaluator/kernels/sign.cl +23 -0
  32. data/lib/tensor_stream/evaluator/kernels/sin.cl +8 -0
  33. data/lib/tensor_stream/evaluator/kernels/sqrt.cl +8 -0
  34. data/lib/tensor_stream/evaluator/kernels/square.cl +15 -0
  35. data/lib/tensor_stream/evaluator/kernels/sub.cl +5 -0
  36. data/lib/tensor_stream/evaluator/kernels/tan.cl +7 -0
  37. data/lib/tensor_stream/evaluator/kernels/tanh.cl +7 -0
  38. data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +6 -0
  39. data/lib/tensor_stream/evaluator/kernels/where.cl +15 -0
  40. data/lib/tensor_stream/evaluator/opencl_buffer.rb +30 -0
  41. data/lib/tensor_stream/evaluator/opencl_evaluator.rb +1095 -0
  42. data/lib/tensor_stream/evaluator/opencl_template_helper.rb +58 -0
  43. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +27 -0
  44. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -31
  45. data/lib/tensor_stream/graph.rb +4 -2
  46. data/lib/tensor_stream/math_gradients.rb +3 -0
  47. data/lib/tensor_stream/operation.rb +29 -2
  48. data/lib/tensor_stream/ops.rb +14 -2
  49. data/lib/tensor_stream/placeholder.rb +1 -1
  50. data/lib/tensor_stream/session.rb +10 -3
  51. data/lib/tensor_stream/tensor_shape.rb +1 -1
  52. data/lib/tensor_stream/train/saver.rb +1 -1
  53. data/lib/tensor_stream/variable.rb +7 -1
  54. data/lib/tensor_stream/version.rb +1 -1
  55. data/samples/logistic_regression.rb +2 -1
  56. data/samples/nearest_neighbor.rb +54 -0
  57. data/tensor_stream.gemspec +3 -1
  58. metadata +107 -28
@@ -0,0 +1,23 @@
+ __kernel void sign_fp(const int M, const int N, __global const float *A, __global float *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+     float value = A[globalRow * N + globalCol];
+     if (isnan(value) || value == 0.0f) {
+       C[globalRow * N + globalCol] = 0.0;
+     } else {
+       C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
+     }
+ }
+
+ __kernel void sign_int(const int M, const int N, __global const int *A, __global int *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+     float value = A[globalRow * N + globalCol];
+     if (isnan(value) || value == 0) {
+       C[globalRow * N + globalCol] = 0;
+     } else {
+       C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
+     }
+ }
@@ -0,0 +1,8 @@
+
+ __kernel void sin_fp(const int M, const int N, __global const float *A, __global float *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = sin(A[globalRow * N + globalCol]);
+ }
@@ -0,0 +1,8 @@
+
+ __kernel void sqrt_fp(const int M, const int N, __global const float *A, __global float *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = sqrt(A[globalRow * N + globalCol]);
+ }
@@ -0,0 +1,15 @@
+ __kernel void square_fp(const int M, const int N, __global const float *A, __global float *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
+ }
+
+ __kernel void square_int(const int M, const int N, __global const int *A, __global int *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
+ }
@@ -0,0 +1,5 @@
+ % %w[fp int].product(%w[sub]).each do |dtype, fname|
+ %   c_dtype = dtype_to_c_type(dtype)
+ %   op = operator_to_c(fname)
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: fname, dtype: dtype, result_t: c_dtype %>
+ % end
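Note that sub.cl.erb above is a template rather than literal OpenCL: the lines starting with % are Ruby, and <%= render 'operand.cl', ... %> stamps out the shared element-wise operand body once per dtype/operator pair. A minimal sketch of the same expansion idea using only Ruby's standard ERB in trim mode; the dtype_to_c_type mapping below is an assumption for illustration and is not the gem's OpenclTemplateHelper:

    require 'erb'

    # Assumed mapping, for illustration only.
    def dtype_to_c_type(dtype)
      dtype == 'fp' ? 'float' : 'int'
    end

    template = <<~CL
      % %w[fp int].each do |dtype|
      __kernel void sub_<%= dtype %>(const int M, const int N,
          __global const <%= dtype_to_c_type(dtype) %> *A, /* ... */) { /* ... */ }
      % end
    CL

    # trim_mode '%' makes lines that start with % execute as Ruby, mirroring the
    # %-prefixed lines in the .cl.erb kernels above.
    puts ERB.new(template, trim_mode: '%').result(binding)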
@@ -0,0 +1,7 @@
+ __kernel void tan_fp(const int M, const int N, __global const float *A, __global float *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = tan(A[globalRow * N + globalCol]);
+ }
@@ -0,0 +1,7 @@
+ __kernel void tanh_fp(const int M, const int N, __global const float *A, __global float *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = tanh(A[globalRow * N + globalCol]);
+ }
@@ -0,0 +1,6 @@
+ __kernel void tanh_grad_fp(const int M, const int N, __global const float *A, __global float *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+     C[globalRow * N + globalCol] = 1 - tanh(A[globalRow * N + globalCol]) * tanh(A[globalRow * N + globalCol]);
+ }
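tanh_grad_fp above computes the derivative of tanh, 1 - tanh(x)^2, element-wise. A quick numeric sanity check of that identity in plain Ruby (illustrative only, using a central finite difference):

    x = 0.3
    h = 1e-6
    analytic = 1 - Math.tanh(x)**2                              # what the kernel computes per element
    numeric  = (Math.tanh(x + h) - Math.tanh(x - h)) / (2 * h)  # finite-difference derivative
    p (analytic - numeric).abs < 1e-6                           # => true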
@@ -0,0 +1,15 @@
+ __kernel void where_fp(const int M, const int N, __global const int *PRED, __global const float *A, __global const float *B, __global float *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+ }
+
+ __kernel void where_int(const int M, const int N, __global const int *PRED, __global const int *A, __global const int *B, __global int *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+ }
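All of the element-wise kernels above share the same addressing scheme: a 2-D NDRange of M rows by N columns, addressed row-major as globalRow * N + globalCol. A small host-side Ruby sketch of the where semantics over flat arrays, using the same index arithmetic (illustrative only, no OpenCL involved):

    # C[i] = PRED[i] != 0 ? A[i] : B[i], with i = row * n + col as in the kernels above.
    def where_rowmajor(m, n, pred, a, b)
      c = Array.new(m * n)
      m.times do |row|
        n.times do |col|
          i = row * n + col
          c[i] = pred[i] != 0 ? a[i] : b[i]
        end
      end
      c
    end

    pred = [1, 0, 0, 1]
    a    = [1.0, 2.0, 3.0, 4.0]
    b    = [9.0, 8.0, 7.0, 6.0]
    p where_rowmajor(2, 2, pred, a, b) # => [1.0, 8.0, 7.0, 4.0]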
@@ -0,0 +1,30 @@
+ module TensorStream
+   class OpenCLBuffer < Buffer
+     include ArrayOpsHelper
+
+     attr_accessor :data_type, :shape, :buffer, :cl_buffer, :op
+
+     def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
+       @data_type = data_type
+       @shape = shape
+       @buffer = buffer
+       @cl_buffer = cl_buffer
+       @name = name
+       @op = op
+     end
+
+     def to_ruby
+       return [] if buffer.empty?
+       if shape.empty?
+         return buffer[0] != 0 if data_type == :boolean
+         return buffer[0]
+       end
+
+       result = buffer.reshape(*shape.reverse).to_a
+       if data_type == :boolean
+         result = process_function_op(result, ->(a, _b) { a != 0 })
+       end
+       result
+     end
+   end
+ end
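OpenCLBuffer#to_ruby above converts the flat NArray that was read back from the device into nested Ruby arrays by reshaping with the reversed shape (NArray's dimension order is the reverse of the row-major Ruby nesting used elsewhere in the gem). A standalone sketch of that conversion using the narray gem directly, with no OpenCL involved; shape [2, 3] is an arbitrary example:

    require 'narray'

    shape  = [2, 3]
    buffer = NArray.sfloat(shape.reduce(:*)).indgen! # flat buffer: 0.0, 1.0, ... 5.0
    # Same call as buffer.reshape(*shape.reverse).to_a in to_ruby above.
    p buffer.reshape(*shape.reverse).to_a
    # => [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]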
@@ -0,0 +1,1095 @@
+ require 'tensor_stream/evaluator/operation_helpers/random_gaussian'
+ require 'tensor_stream/evaluator/operation_helpers/array_ops_helper'
+ require 'tensor_stream/evaluator/operation_helpers/math_helper'
+ require 'tensor_stream/evaluator/opencl_buffer'
+ require 'tensor_stream/evaluator/opencl_template_helper'
+ require 'distribution'
+ require 'opencl_ruby_ffi'
+ require 'narray_ffi'
+
+ module TensorStream
+   module Evaluator
+     class FullEvalNotPossible < RuntimeError
+     end
+
+     # Errors during graph evaluation
+     class EvaluatorExcecutionException < RuntimeError
+       attr_reader :tensor
+
+       def initialize(exception, tensor)
+         @exception = exception
+         @tensor = tensor
+       end
+
+       def wrapped_exception
+         @exception
+       end
+     end
+
+     ## OpenCL evaluator for hardware accelerated evaluation of graphs
+     class OpenclEvaluator
+       attr_accessor :retain
+
+       include TensorStream::OpHelper
+       include TensorStream::ArrayOpsHelper
+       include TensorStream::MathHelper
+
+       def initialize(session, context, thread_pool: nil, log_intermediates: false, preferred_device: nil)
+         @session = session
+         @context = context
+         @log_intermediates = log_intermediates
+         @preferred_device = preferred_device
+         @retain = context[:retain] || []
+         @thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
+
+         @context[:compute_history] = [] if log_intermediates
+       end
+
+       # opencl evaluator main entrypoint
+       def run(tensor, execution_context)
+         _create_opencl_context
+         # _prepare_kernels
+
+         read_final_result(complete_eval(tensor, execution_context))
+       end
+
+       def complete_eval(tensor, context)
+         create_command_queue
+         buffer = _run(tensor, context)
+         if buffer.is_a?(Array)
+           buffer = buffer.collect do |b|
+             next b if b.buffer.size.zero?
+             _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: [b.op].compact)
+             b
+           end
+         else
+           return buffer if buffer.nil? || buffer.buffer.size.zero?
+           _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
+         end
+
+         _opencl_queue.finish
+         buffer
+       end
+
+       def opencl_device
+         @context[:_cache][:_opencl_device]
+       end
+
+       protected
+
+       # read result from opencl and convert to ruby
+       def read_final_result(buffer)
+         return buffer.map { |b| read_final_result(b) } if buffer.is_a?(Array)
+         return nil if buffer.nil?
+
+         buffer.to_ruby
+       end
+
+       def _create_opencl_context
+         @context[:_cache][:_opencl_device] ||= begin
+           if @preferred_device
+             @preferred_device
+           else
+             device, _score, _platform, _index = choose_best_device
+             device
+           end
+         end
+         @context[:_cache][:_opencl_context] ||= OpenCL.create_context(opencl_device)
+       end
+
+       def choose_best_device
+         @best_device ||= begin
+           devices = OpenCL.platforms.flat_map do |p|
+             p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
+               score = 0
+               if d.type.to_s == 'CPU'
+                 score += 1
+               elsif d.type.to_s == 'GPU'
+                 score += 4
+               end
+
+               score += d.max_compute_units
+
+               [d, score, p.name, index]
+             end
+           end
+         end
+         devices.max { |a| a[1] }
+       end
+
+       def create_command_queue
+         supported_properties = opencl_device.queue_properties.names
+         properties = []
+         properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_properties.include?('PROFILING_ENABLE')
+         properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_properties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
+         @context[:_cache][:_opencl_queue] ||= _opencl_context.create_command_queue(opencl_device, properties: properties)
+       end
+
+       def _opencl_context
+         @context[:_cache][:_opencl_context]
+       end
+
+       def _opencl_queue
+         @context[:_cache][:_opencl_queue]
+       end
+
+       def cl_template_path(kernel, extension)
+         File.join(File.dirname(__FILE__), 'kernels', "#{kernel}.#{extension}")
+       end
+
+       def _cl_program(kernel)
+         @context[:_cache]["_opencl_kernel_#{kernel}"] ||= begin
+           filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
+           source = File.read(filename)
+           source = OpenclTemplateHelper.new(source).generate
+           program = _opencl_context.create_program_with_source(source)
+           program.build
+         rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
+           puts "OpenCL Compile error: #{program.build_log}"
+           raise e
+         end
+       end
+
+       def _run(tensor, execution_context)
+         return tensor if tensor.is_a?(OpenCLBuffer)
+         return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array)
+
+         return tensor if retain.include?(tensor) # if var is in retain don't eval to value
+
+         tensor = tensor.call if tensor.is_a?(Proc)
+
+         child_context = execution_context.dup
+         res = if tensor.is_a?(Operation)
+                 eval_operation(tensor, child_context)
+               elsif tensor.is_a?(Variable)
+                 eval_variable(tensor, child_context)
+               elsif tensor.is_a?(Placeholder)
+                 resolve_placeholder(tensor, child_context)
+               else
+                 eval_tensor(tensor, child_context)
+               end
+         execution_context.deep_merge!(returns: child_context[:returns])
+         res
+       end
+
+       def eval_variable(tensor, child_context)
+         raise "variable #{tensor.name} not initialized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
+         tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
+         tensor.buffer
+       end
+
+       def eval_operation(tensor, child_context)
+         return @context[tensor.name] if @context.key?(tensor.name)
+
+         a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
+         b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]
+
+         case tensor.operation
+         when :concat
+           input_a = read_final_result(complete_eval(a, child_context))
+           arr = concat_array(input_a, tensor.options[:axis])
+           convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
+         when :cond
+           pred = complete_eval(tensor.options[:pred], child_context)
+           a = _run(a, child_context)
+           b = _run(b, child_context)
+
+           if all_true?(pred.buffer)
+             a
+           else
+             b
+           end
+         when :identity
+           _run(a, child_context)
+         when :eye
+           rows = complete_eval(a, child_context)
+           columns = complete_eval(b, child_context)
+           shape = [rows.buffer[0], columns.buffer[0]]
+           eye_arr = Array.new(rows.buffer[0]) do |i|
+             Array.new(columns.buffer[0]) do |col|
+               if fp_type?(tensor.data_type)
+                 i == col ? 1.0 : 0.0
+               else
+                 i == col ? 1 : 0
+               end
+             end
+           end
+
+           convert_to_opencl(eye_arr.flatten, shape, data_type: tensor.data_type, name: tensor.name)
+         when :pad
+           a = read_final_result(complete_eval(a, child_context))
+           p = read_final_result(complete_eval(tensor.options[:paddings], child_context))
+
+           padding = arr_pad(a, p, tensor.data_type)
+           convert_to_opencl(padding.flatten, shape_eval(padding), data_type: tensor.data_type, name: tensor.name)
+         when :tile
+           input = read_final_result(complete_eval(a, child_context))
+           multiples = read_final_result(complete_eval(b, child_context))
+
+           rank = get_rank(input)
+           raise '1D or higher tensor required' if rank.zero?
+           raise "invalid multiple size passed #{rank} != #{multiples.size}" if rank != multiples.size
+
+           tile = tile_arr(input, 0, multiples)
+           arr = tile.nil? ? [] : tile
+           convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
+         when :assign
+           assign_var(tensor, b, child_context)
+         when :assign_add
+           a = _run(a, child_context)
+           b = _run(b, child_context)
+
+           value = execute_2_operand_func('add', tensor, a, b, child_context)
+           assign_var(tensor, value, child_context)
+         when :assign_sub
+           a = _run(a, child_context)
+           b = _run(b, child_context)
+
+           value = execute_2_operand_func('sub', tensor, a, b, child_context)
+           assign_var(tensor, value, child_context)
+         when :less
+           execute_2_operand_func('less', tensor, a, b, child_context, 'cond')
+         when :less_equal
+           execute_2_operand_func('less_equal', tensor, a, b, child_context, 'cond')
+         when :greater
+           execute_2_operand_func('greater', tensor, a, b, child_context, 'cond')
+         when :greater_equal
+           execute_2_operand_func('greater_equal', tensor, a, b, child_context, 'cond')
+         when :equal
+           execute_2_operand_func('equal', tensor, a, b, child_context, 'cond')
+         when :not_equal
+           execute_2_operand_func('not_equal', tensor, a, b, child_context, 'cond')
+         when :logical_and
+           execute_2_operand_func('logical_and', tensor, a, b, child_context, 'cond')
+         when :where
+           pred = tensor.options[:pred]
+           execute_cond_func('where', tensor, pred, a, b, child_context)
+         when :max
+           execute_2_operand_func('max', tensor, a, b, child_context)
+         when :add
+           execute_2_operand_func('add', tensor, a, b, child_context)
+         when :div
+           execute_2_operand_func('div', tensor, a, b, child_context)
+         when :sub
+           execute_2_operand_func('sub', tensor, a, b, child_context)
+         when :matmul
+           a = _run(a, child_context)
+           b = _run(b, child_context)
+
+           m = a.shape[0]
+           n = b.shape[1]
+           v = b.shape[0]
+           k = a.shape[1]
+
+           m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
+           n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]
+
+           result_shape = [m, n]
+
+           raise "#{tensor.items[0].name} rank must be greater than 1" if a.shape.size < 2
+           raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
+           raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
+
+           dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+           a, b = type_cast(a, b)
+           output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
+
+           cl_m = OpenCL::Int1.new(m)
+           cl_n = OpenCL::Int1.new(n)
+           cl_k = OpenCL::Int1.new(k)
+
+           transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
+           transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
+
+           output_buffer.op = _cl_program('gemm').send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
+           output_buffer
+         when :mul
+           execute_2_operand_func('mul', tensor, a, b, child_context)
+         when :pow
+           execute_2_operand_func('pow', tensor, a, b, child_context)
+         when :cast
+           a = _run(a, child_context)
+           if a.data_type != tensor.data_type
+             buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+             s_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type) ? 'fp' : 'int'
+             t_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+             m, n = a.shape
+             cl_m = OpenCL::Int1.new(m || 1)
+             cl_n = OpenCL::Int1.new(n || 1)
+             work_group = [m || 1, n || 1]
+
+             buffer.op = _cl_program("cast").send(:"cast_#{s_dtype}_#{t_dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
+             buffer
+           else
+             a
+           end
+         when :sign
+           execute_func('sign', tensor, a, child_context)
+         when :exp
+           execute_func('exp', tensor, a, child_context)
+         when :log
+           execute_func('log', tensor, a, child_context)
+         when :sin
+           execute_func('sin', tensor, a, child_context)
+         when :tan
+           execute_func('tan', tensor, a, child_context)
+         when :cos
+           execute_func('cos', tensor, a, child_context)
+         when :abs
+           execute_func('abs', tensor, a, child_context)
+         when :sqrt
+           execute_func('sqrt', tensor, a, child_context)
+         when :negate
+           execute_func('negate', tensor, a, child_context)
+         when :square
+           execute_func('square', tensor, a, child_context)
+         when :reciprocal
+           execute_func('reciprocal', tensor, a, child_context)
+         when :tanh
+           execute_func('tanh', tensor, a, child_context)
+         when :tanh_grad
+           execute_func('tanh_grad', tensor, a, child_context)
+         when :sigmoid
+           execute_func('sigmoid', tensor, a, child_context)
+         when :log1p
+           execute_func('log1p', tensor, a, child_context)
+         when :round
+           execute_func('round', tensor, a, child_context)
+         when :sigmoid_grad
+           execute_2_operand_func('sigmoid_grad', tensor, a, b, child_context)
+         when :truncate
+           a = _run(a, child_context)
+           b = _run(b, child_context)
+
+           if a.shape.size.zero?
+             a
+           else
+             input_b = read_final_result(b)
+             if a.shape == input_b
+               a
+             else
+               input_a = read_final_result(a)
+               if input_b == []
+                 if a.buffer.size == 1
+                   a.shape = input_b
+                   a
+                 else
+                   wrap_opencl(a.buffer[0], data_type: a.data_type, name: tensor.name)
+                 end
+               else
+                 wrap_opencl(truncate(input_a, input_b), data_type: a.data_type, name: tensor.name)
+               end
+             end
+           end
+         when :zeros, :ones, :zeros_like, :ones_like
+           shape = if %i[zeros_like ones_like].include?(tensor.operation)
+                     _run(a, child_context).shape
+                   else
+                     read_final_result(complete_eval(a, child_context)) || tensor.shape.shape
+                   end
+
+           func = if %i[zeros zeros_like].include?(tensor.operation)
+                    -> { tensor.data_type == :int32 ? 0 : 0.0 }
+                  else
+                    -> { tensor.data_type == :int32 ? 1 : 1.0 }
+                  end
+
+           size = shape.empty? ? 1 : shape.reduce(:*)
+
+           buffer = if TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type)
+                      NArray.sfloat(size)
+                    elsif TensorStream::Ops::INTEGER_TYPES.include?(tensor.data_type)
+                      NArray.int(size)
+                    else
+                      raise "unsupported type #{tensor.data_type}"
+                    end
+
+           data = if !shape.empty?
+                    Array.new(size) do |index|
+                      func.call
+                    end
+                  else
+                    func.call
+                  end
+
+           convert_to_opencl(data, shape, data_type: tensor.data_type, name: tensor.name)
+         when :broadcast_transform
+           a = _run(a, child_context)
+           b = _run(b, child_context)
+
+           if a.shape == b.shape
+             [a, b]
+           else
+             input_a = read_final_result(complete_eval(a, child_context))
+             input_b = read_final_result(complete_eval(b, child_context))
+             b_a, b_b = broadcast(input_a, input_b)
+             [wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
+              wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
+           end
+         when :print
+           a = _run(a, child_context)
+           b = _run(b, child_context)
+           input_b = complete_eval(b, child_context)
+           input_b = read_final_result(input_b)
+           puts "#{tensor.options.fetch(:message, '')} #{input_b}"
+           a
+         when :rank
+           a = _run(a, child_context)
+           wrap_opencl(a.shape.size, data_type: tensor.data_type, name: tensor.name)
+         when :stop_gradient
+           _run(a, child_context)
+         when :slice
+           input_a = complete_eval(a, child_context)
+           input_b = read_final_result(complete_eval(b, child_context))
+           size = tensor.options[:size]
+
+           slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse
+
+           new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
+           sliced = new_buf.slice[*slice_param]
+           convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: a.data_type, name: tensor.name)
+         when :transpose
+           input_a = complete_eval(a, child_context)
+           t_param = Array.new(input_a.shape.size) { |index| index }.reverse
+           transposed = input_a.buffer.reshape(*input_a.shape.reverse).transpose(*t_param)
+           convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: a.data_type, name: tensor.name)
+         when :index
+           a = complete_eval(a, child_context)
+           input_a = read_final_result(a)
+           index = read_final_result(complete_eval(b, child_context))
+
+           if a.is_a?(Array)
+             a[index]
+           else
+             new_shape = a.shape.dup
+             new_shape.shift
+             convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
+           end
+         when :broadcast_gradient_args
+           a = complete_eval(a, child_context)
+           b = complete_eval(b, child_context)
+
+           wrap_opencl(get_broadcast_gradient_args(a.buffer.to_a, b.buffer.to_a), data_type: a.data_type, name: tensor.name)
+         when :shape
+           a = _run(a, child_context)
+
+           wrap_opencl(a.shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
+         when :reshape
+           arr = complete_eval(a, child_context)
+           new_shape = read_final_result(complete_eval(b, child_context))
+
+           if new_shape.size.zero? && arr.buffer.size == 1
+             arr.shape = new_shape
+             arr
+           else
+             new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
+             arr.shape = new_shape
+             arr
+           end
+         when :random_uniform
+           maxval = tensor.options.fetch(:maxval, 1)
+           minval = tensor.options.fetch(:minval, 0)
+           seed = tensor.options[:seed]
+
+           random = _get_randomizer(tensor, seed)
+           generator = -> { random.rand * (maxval - minval) + minval }
+           shape = tensor.options[:shape] || tensor.shape.shape
+
+           convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
+         when :random_normal
+           random = _get_randomizer(tensor, seed)
+           r = RandomGaussian.new(tensor.options.fetch(:mean), tensor.options.fetch(:stddev), -> { random.rand })
+           random = _get_randomizer(tensor, seed)
+           generator = -> { r.rand }
+           shape = tensor.options[:shape] || tensor.shape.shape
+
+           convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
+         when :glorot_uniform
+           random = _get_randomizer(tensor, seed)
+
+           shape = tensor.options[:shape] || tensor.shape.shape
+           fan_in, fan_out = if shape.size.zero?
+                               [1, 1]
+                             elsif shape.size == 1
+                               [1, shape[0]]
+                             else
+                               [shape[0], shape.last]
+                             end
+
+           limit = Math.sqrt(6.0 / (fan_in + fan_out))
+
+           minval = -limit
+           maxval = limit
+
+           generator = -> { random.rand * (maxval - minval) + minval }
+           convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
+         when :flow_group
+           tensor.items.collect { |item| _run(item, child_context) }
+         when :sum
+           reduction(child_context, tensor, a, b, :sum)
+         when :mean
+           reduction(child_context, tensor, a, b, :mean)
+         when :prod
+           input_a = complete_eval(a, child_context)
+           if input_a.buffer.empty?
+             convert_to_opencl([1.0], [], data_type: a.data_type, name: tensor.name)
+           else
+             reduction(child_context, tensor, a, b, :prod)
+           end
+         when :argmin
+           a = complete_eval(a, child_context)
+           axis = tensor.options[:axis] || 0
+           arr = a.buffer.reshape(*a.shape.reverse).to_a
+           op = get_op_with_axis(arr, axis, 0, a.data_type, ->(a, b) { a < b })
+           convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+         when :argmax
+           a = complete_eval(a, child_context)
+           axis = tensor.options[:axis] || 0
+           arr = a.buffer.reshape(*a.shape.reverse).to_a
+           op = get_op_with_axis(arr, axis, 0, a.data_type, ->(a, b) { a > b })
+           convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
+         else
+           raise "unknown op #{tensor.operation}"
+         end.tap do |result|
+           if tensor.breakpoint
+             a = read_final_result(complete_eval(a, child_context))
+             b = read_final_result(complete_eval(b, child_context))
+             result = read_final_result(complete_eval(result, child_context))
+
+             tensor.breakpoint.call(tensor, a, b, result)
+           end
+           if @log_intermediates
+             @context[:compute_history] << {
+               name: tensor.name,
+               type: tensor.data_type,
+               shape: shape_eval(result),
+               source: tensor.source,
+               description: tensor.to_math(true, 1),
+               value: result
+             }
+           end
+           @context[tensor.name] = result
+         end
+       rescue EvaluatorExcecutionException => e
+         raise e
+       rescue StandardError => e
+         puts e.message
+         puts e.backtrace.join("\n")
+
+         # shape_a = a.shape.shape if a
+         # shape_b = b.shape.shape if b
+         # dtype_a = a.data_type if a
+         # dtype_b = b.data_type if b
+         # a = complete_eval(a, child_context)
+         # b = complete_eval(b, child_context)
+         # puts "name: #{tensor.given_name}"
+         # # puts "op: #{tensor.to_math(true, 1)}"
+         # puts "A #{shape_a} #{dtype_a}: #{a}" if a
+         # puts "B #{shape_b} #{dtype_b}: #{b}" if b
+         # dump_intermediates if @log_intermediates
+         # File.write('/home/jedld/workspace/tensor_stream/samples/error.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
+
+         # File.write('/Users/josephemmanueldayo/workspace/gradients.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
+         raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true,1)} defined at #{tensor.source}"
+       end
+
+       def eval_tensor(tensor, child_context)
+         return tensor unless tensor.is_a?(Tensor)
+
+         cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
+         return @context[cache_key] if @context.key?(cache_key)
+         return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]
+         @context[cache_key] = if tensor.value.is_a?(Tensor)
+                                 _run(tensor.value, child_context)
+                               else
+                                 wrap_opencl(tensor, name: tensor.name)
+                               end
+         @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
+       end
+
+       private
+
+       def assign_var(tensor, b, child_context)
+         assign = tensor.items[0] || tensor
+         buffer = complete_eval(b, child_context)
+         if assign.buffer
+           assign.buffer.op = _opencl_queue.enqueue_write_buffer(assign.buffer.cl_buffer, buffer.buffer)
+         else
+           assign.buffer = convert_to_opencl(read_final_result(buffer), buffer.shape, data_type: tensor.data_type, name: tensor.name)
+         end
+         assign.buffer.dirty = true
+         assign.buffer
+       end
+
+       def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
+         a = _run(input_a, child_context)
+         b = _run(input_b, child_context)
+         a, b = type_cast(a, b)
+         dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+         result_shape = TensorShape.infer_shape(a.shape, b.shape)
+
+         output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
+         a, b, prog, switch_operands = select_program(a, b, op_name)
+         m, n = result_shape
+         work_group = [m || 1, n || 1]
+         cl_m = OpenCL::Int1.new(m || 1)
+         cl_n = OpenCL::Int1.new(n || 1)
+         cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition
+
+         event_wait_list = [a.op, b.op].compact # add dependency wait list
+
+         event = if prog == "#{op_name}_b"
+                   cl_m_b, cl_n_b = if b.shape.size == 2
+                                      [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
+                                    elsif b.shape.size == 1
+                                      [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
+                                    else
+                                      raise "rank > 2 not supported!"
+                                    end
+                   _cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+                 else
+                   _cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+                 end
+
+         output_buffer.op = event
+         output_buffer
+       end
+
+       def execute_cond_func(op_name, tensor, pred, input_a, input_b, child_context)
+         p = _run(pred, child_context)
+         a = _run(input_a, child_context)
+         b = _run(input_b, child_context)
+
+         a, b = type_cast(a, b)
+         dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+
+         output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)
+
+         m, n = p.shape
+         work_group = [m || 1, n || 1]
+         cl_m = OpenCL::Int1.new(m || 1)
+         cl_n = OpenCL::Int1.new(n || 1)
+
+         event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
+         output_buffer.op = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+         output_buffer
+       end
+
+       def execute_func(op_name, tensor, a, child_context)
+         a = _run(a, child_context)
+         event_wait_list = [a.op].compact
+         dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
+         output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+         m, n = a.shape
+         work_group = [m || 1, n || 1]
+         cl_m = OpenCL::Int1.new(m || 1)
+         cl_n = OpenCL::Int1.new(n || 1)
+
+         event = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+         output_buffer.op = event
+         output_buffer
+       end
+
+       def type_cast(a, b)
+         return [a, b] if a.data_type == b.data_type
+         m, n = b.shape
+         work_group = [m || 1, n || 1]
+         buffer = buffer_for(b.shape, b.data_type)
+         if (TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type.to_sym))
+           if TensorStream::Ops::INTEGER_TYPES.include?(b.data_type.to_sym)
+             cl_m = OpenCL::Int1.new(m || 1)
+             cl_n = OpenCL::Int1.new(n || 1)
+
+             _cl_program("cast").cast_int_fp(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
+             return [a, buffer]
+           end
+         elsif TensorStream::Ops::INTEGER_TYPES.include?(a.data_type.to_sym)
+           if TensorStream::Ops::FLOATING_POINT_TYPES.include?(b.data_type.to_sym)
+             cl_m = OpenCL::Int1.new(m || 1)
+             cl_n = OpenCL::Int1.new(n || 1)
+             _cl_program("cast").cast_fp_int(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
+             return [a, buffer]
+           end
+         end
+
+         [a, b]
+       end
+
+       def buffer_for(shape, data_type)
+         size = shape.empty? ? 1 : shape.reduce(:*)
+
+         buffer = allocate_narray_for_type(data_type, size)
+
+         cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
+         OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+       end
+
+       def wrap_opencl(tensor, data_type: nil, name: nil)
+         value, shape = if tensor.is_a?(Tensor)
+                          [tensor.value, tensor.shape.shape]
+                        else
+                          [tensor, shape_eval(tensor)]
+                        end
+
+         convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
+       end
+
+       def convert_to_opencl(value, shape, data_type: nil, name: nil)
+         if !value.is_a?(Array) && !value.is_a?(NArray)
+           value = [value]
+         end
+
+         cache_key = "_cl_object_#{name}_#{shape.join('_')}"
+         cl_object = if name && @context[:_cache][cache_key]
+                       @context[:_cache][cache_key]
+                     else
+                       narray_size = shape.reduce(:*) || 1
+
+                       buffer = if value.is_a?(NArray)
+                                  value
+                                else
+                                  allocate_narray_for_type(data_type, narray_size)
+                                end
+
+                       cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
+
+                       cl_buffer = if !value.flatten.empty?
+                                     cl_buffer_size = 1 if cl_buffer_size.zero?
+                                     _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
+                                   else
+                                     nil
+                                   end
+
+                       @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+                     end
+
+         if value.is_a?(Array)
+           value.flatten.each_with_index do |element, index|
+             if element.is_a?(Tensor)
+               cl_object.buffer[index] = read_final_result(complete_eval(element, {}))
+             else
+               cl_object.buffer[index] = Tensor.cast_dtype(element, data_type)
+             end
+           end
+         elsif value.is_a?(NArray)
+           cl_object.buffer = value
+         else
+           cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
+         end
+
+         write_op = if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
+                      _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
+                    end
+         cl_object.op = write_op
+         cl_object
+       end
+
+       def allocate_narray_for_type(data_type, narray_size)
+         if TensorStream::Ops::FLOATING_POINT_TYPES.include?(data_type.to_sym)
+           NArray.sfloat(narray_size)
+         elsif TensorStream::Ops::INTEGER_TYPES.include?(data_type.to_sym)
+           NArray.int(narray_size)
+         elsif data_type.to_sym == :boolean
+           NArray.int(narray_size)
+         else
+           raise "unsupported type #{data_type}"
+         end
+       end
+
+       def _create_result_buffer(data_type, shape, name)
+         @context[:_cache]["_result_#{name}_#{shape.join('_')}"] ||= begin
+           size = shape.empty? ? 1 : shape.reduce(:*)
+           buffer = allocate_narray_for_type(data_type, size)
+           cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
+           OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+         end
+       end
+
+       def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
+         if target_axis == current_axis
+           if a[0].is_a?(Array)
+             (0...a[0].size).each.collect do |column_index|
+               max = nil
+               max_index = 0
+               a.each_with_index do |row, row_index|
+                 if max.nil? || op.call(row[column_index], max)
+                   max = row[column_index]
+                   max_index = row_index
+                 end
+               end
+
+               Tensor.cast_dtype(max_index, output_type)
+             end
+           else
+             max = nil
+             max_index = 0
+             a.each_with_index do |x, index|
+               if max.nil? || op.call(x, max)
+                 max = x
+                 max_index = index
+               end
+             end
+             Tensor.cast_dtype(max_index, output_type)
+           end
+         else
+           a.collect do |row|
+             get_op_with_axis(row, target_axis, current_axis + 1, output_type, op)
+           end
+         end
+       end
+
+       def reduction(child_context, tensor, a, b, func)
+         input = complete_eval(a, child_context)
+         axis = read_final_result(complete_eval(b, child_context))
+         if axis.nil?
+           convert_to_opencl(input.buffer.send(func), [], data_type: tensor.data_type, name: tensor.name)
+         else
+           return input if input.shape.empty?
+           value = input.buffer.reshape(*input.shape.reverse)
+           rank = input.shape.size - 1
+
+           if axis.is_a?(Array)
+             axis.map { |x| rank - x.abs }.sort.reverse.each do |x|
+               value = value.send(func, x)
+             end
+           else
+             value = value.send(func, rank - axis.abs)
+           end
+
+           new_shape = if value.is_a?(NArray)
+                         value.shape.reverse
+                       else
+                         value = [value]
+                         []
+                       end
+
+           if tensor.options[:keepdims]
+             new_shape = reduced_shape(input.shape.dup, axis)
+           end
+
+           convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
+         end
+       end
+
+       def arr_pad(arr, paddings, data_type = :float32, rank = 0)
+         raise "padding #{paddings[rank]} needs to have two elements [before, after]" if paddings[rank].size != 2
+
+         before = paddings[rank][0]
+         after = paddings[rank][1]
+         pad_value = fp_type?(data_type) ? 0.0 : 0
+         if arr[0].is_a?(Array)
+           next_dim_elem = arr.collect { |a| arr_pad(a, paddings, data_type, rank + 1) }
+           padding = deep_dup_array(next_dim_elem[0], pad_value)
+           Array.new(before) { padding } + next_dim_elem + Array.new(after) { padding }
+         else
+           Array.new(before) { pad_value } + arr + Array.new(after) { pad_value }
+         end
+       end
+
+       def deep_dup_array(arr, value = nil)
+         if arr.is_a?(Array)
+           arr.dup.collect do |a|
+             deep_dup_array(a, value)
+           end
+         else
+           value.nil? ? arr : value
+         end
+       end
+
+       def matmul_const_transform(mat, mat_b, tensor)
+         if !mat.is_a?(Array)
+           compat_shape = shape_eval(mat_b).reverse
+           func = -> { tensor.data_type == :int32 ? mat.to_i : mat.to_f }
+
+           generate_vector(compat_shape, generator: func)
+         else
+           mat
+         end
+       end
+
+       # determine possible reduction axis to be used
+       def _broadcast_gradient_op(vector_shape1, vector_shape2, level)
+         va_rank = _rank_from_shape(vector_shape1)
+         vb_rank = _rank_from_shape(vector_shape2)
+         return [] if vector_shape1 == vector_shape2 # same shape so no reductions
+
+         shape2_r = vector_shape2.reverse
+
+         vector_shape1.reverse.each_with_index.collect do |s, index|
+           next va_rank - index - 1 if index >= shape2_r.size
+           next nil if shape2_r[index] == s
+           next nil if shape2_r[index] > s
+           va_rank - index - 1
+         end.compact
+       end
+
+       # selects variants of cl programs depending on input
+       def select_program(input_a, input_b, op)
+         return [input_a, input_b, "#{op}", 0] if input_a.shape == input_b.shape
+
+         return [input_b, input_a, "#{op}_c", 1] if input_a.shape.empty? || input_a.shape.reduce(:*) == 1 # A is scalar?
+         return [input_a, input_b, "#{op}_c", 0] if input_b.shape.empty? || input_a.shape.reduce(:*) == 1 # B is scalar?
+
+         return [input_b, input_a, "#{op}_b", 1] if input_a.shape.size < input_b.shape.size
+
+         if input_a.shape.size == input_b.shape.size
+           input_a.shape.zip(input_b.shape).each do |s1, s2|
+             return [input_b, input_a, "#{op}_b", 1] if s1 < s2
+           end
+         end
+
+         [input_a, input_b, "#{op}_b", 0]
+       end
+
+       def _rank_from_shape(shape)
+         shape.is_a?(Array) ? shape.size : 0
+       end
+
+       def get_broadcast_gradient_args(input_a, input_b)
+         return [] if get_rank(input_b).zero? && get_rank(input_a).zero?
+         return nil if get_rank(input_b).zero?
+         # ruby scalar
+         if get_rank(input_a).zero?
+           _broadcast_gradient_op(input_b, input_a, 0, true)
+         elsif get_rank(input_a) > 0
+           _broadcast_gradient_op(input_a, input_b, 0)
+         end
+       end
+
+       def concat_array(values, axis)
+         combined_array = values.shift
+         axis = get_rank(combined_array) - 1 if axis == -1
+
+         values.each do |v|
+           combined_array = concat(combined_array, v, axis)
+         end
+         combined_array
+       end
+
+       def concat(a, b, axis)
+         if axis.zero?
+           a + b
+         else
+           a.each_with_index.collect do |i, index|
+             concat(i, b[index], axis - 1)
+           end
+         end
+       end
+
+       def resolve_placeholder(placeholder, _execution_context = {})
+         return nil if placeholder.nil?
+         return placeholder if retain.include?(placeholder)
+
+         var = if placeholder.is_a?(Placeholder)
+                 @context[placeholder.name.to_sym].tap do |c|
+                   raise "missing placeholder #{placeholder.name}" if c.nil?
+                 end
+               else
+                 placeholder
+               end
+
+         return convert_to_opencl(var, shape_eval(var), data_type: placeholder.data_type, name: placeholder.name) unless var.is_a?(Tensor)
+         Tensor.cast_dtype(var, placeholder.data_type)
+       end
+
+       def reduce_axis(current_axis, axis, val, keep_dims, f = ->(a, b) { a + b })
+         return val unless val.is_a?(Array)
+
+         r = val.collect do |v|
+           reduce_axis(current_axis + 1, axis, v, keep_dims, f)
+         end
+
+         should_reduce_axis = axis.nil? || (axis.is_a?(Array) && axis.include?(current_axis)) || (current_axis == axis)
+
+         if should_reduce_axis
+           reduced_val = r[0]
+           if r.size > 1
+             reduced_val = f.call(r[0..val.size])
+           elsif r.size == 0
+             reduced_val = f.call(nil)
+           end
+           keep_dims ? [reduced_val] : reduced_val
+         else
+           r
+         end
+       end
+
+       # handle 3 tensor math operations
+       def call_3way_vector_op(v_a, v_b, v_c, child_context, op = ->(a, b, c) { a + b + c })
+         return op.call(v_a, v_b, v_c) unless v_a.is_a?(Array)
+
+         v_a.each_with_index.collect do |v1, index|
+           v2 = v_b[index]
+           v3 = v_c[index]
+           if v1.is_a?(Array)
+             call_3way_vector_op(v1, v2, v3, child_context, op)
+           else
+             op.call(v1, v2, v3)
+           end
+         end
+       end
+
+       def all_true?(arr)
+         if arr.is_a?(Array) || arr.is_a?(NArray)
+           arr.each do |a|
+             return false unless all_true?(a)
+           end
+           return true
+         end
+
+         arr != 0
+       end
+
+       def generate_vector(shape, dtype: :float32, generator:)
+         if shape.is_a?(Integer)
+           Array.new(shape) do
+             generator.call
+           end
+         elsif shape.size > 1
+           Array.new(shape[0]) do
+             generate_vector(shape[1..shape.size], generator: generator, dtype: dtype)
+           end
+         elsif shape.size == 1
+           Array.new(shape[0]) do
+             generator.call
+           end
+         elsif shape.size.zero?
+           generator.call
+         end
+       end
+
+       def _get_randomizer(tensor, seed)
+         if tensor.graph.random_seed && seed
+           Random.new(tensor.graph.random_seed ^ seed)
+         elsif tensor.graph.random_seed
+           @session.randomizer[tensor.graph.object_id] ||= Random.new(tensor.graph.random_seed)
+           @session.randomizer[tensor.graph.object_id]
+         elsif seed
+           @session.randomizer[tensor.operation] ||= Random.new(seed)
+           @session.randomizer[tensor.operation]
+         else
+           Random.new
+         end
+       end
+
+       def dump_intermediates
+         arr = []
+         arr << "============== start ==================="
+         @context[:compute_history].each_with_index do |history, index|
+           arr << "------------------------------------"
+           arr << history[:name]
+           arr << "#{history[:type]} #{history[:shape]}"
+           arr << history[:source]
+           arr << history[:description]
+           arr << ""
+           arr << history[:value].to_json
+           arr << "------------------------------------"
+         end
+         arr << "============== end ====================="
+         str = arr.join("\n")
+         File.write("/tmp/intermediates.txt", str)
+       end
+     end
+   end
+ end
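The choose_best_device scoring in the evaluator above prefers GPUs over CPUs (+4 vs +1) and adds the device's max_compute_units, keeping the highest-scoring device. A standalone restatement of that heuristic over plain hashes (illustrative only; the real code walks OpenCL.platforms and their devices):

    # Same scoring idea as choose_best_device, over plain hashes instead of OpenCL devices.
    def score_device(device)
      base = { 'CPU' => 1, 'GPU' => 4 }.fetch(device[:type], 0)
      base + device[:max_compute_units]
    end

    devices = [
      { name: 'integrated CPU', type: 'CPU', max_compute_units: 8 },
      { name: 'discrete GPU',   type: 'GPU', max_compute_units: 36 }
    ]
    p devices.max_by { |d| score_device(d) }[:name] # => "discrete GPU"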