tensor_stream 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +10 -0
  3. data/CHANGELOG.md +8 -0
  4. data/README.md +40 -1
  5. data/benchmark/benchmark.rb +4 -1
  6. data/lib/tensor_stream.rb +5 -0
  7. data/lib/tensor_stream/debugging/debugging.rb +4 -2
  8. data/lib/tensor_stream/device.rb +2 -1
  9. data/lib/tensor_stream/evaluator/base_evaluator.rb +43 -32
  10. data/lib/tensor_stream/evaluator/evaluator.rb +0 -1
  11. data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +8 -0
  12. data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +9 -0
  13. data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +9 -0
  14. data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +3 -0
  15. data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +26 -0
  16. data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +5 -5
  17. data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +46 -0
  18. data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +3 -0
  19. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +27 -0
  20. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +28 -0
  21. data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +5 -6
  22. data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +200 -265
  23. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +4 -8
  24. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +193 -122
  25. data/lib/tensor_stream/exceptions.rb +6 -0
  26. data/lib/tensor_stream/graph.rb +21 -6
  27. data/lib/tensor_stream/graph_builder.rb +67 -0
  28. data/lib/tensor_stream/graph_deserializers/protobuf.rb +271 -0
  29. data/lib/tensor_stream/graph_keys.rb +1 -0
  30. data/lib/tensor_stream/graph_serializers/pbtext.rb +11 -10
  31. data/lib/tensor_stream/helpers/op_helper.rb +7 -33
  32. data/lib/tensor_stream/helpers/string_helper.rb +16 -0
  33. data/lib/tensor_stream/math_gradients.rb +67 -44
  34. data/lib/tensor_stream/nn/nn_ops.rb +7 -1
  35. data/lib/tensor_stream/operation.rb +14 -27
  36. data/lib/tensor_stream/ops.rb +82 -29
  37. data/lib/tensor_stream/session.rb +4 -0
  38. data/lib/tensor_stream/tensor.rb +30 -12
  39. data/lib/tensor_stream/tensor_shape.rb +1 -1
  40. data/lib/tensor_stream/train/gradient_descent_optimizer.rb +37 -4
  41. data/lib/tensor_stream/train/saver.rb +46 -0
  42. data/lib/tensor_stream/train/utils.rb +37 -0
  43. data/lib/tensor_stream/trainer.rb +2 -0
  44. data/lib/tensor_stream/utils.rb +24 -14
  45. data/lib/tensor_stream/variable.rb +5 -11
  46. data/lib/tensor_stream/variable_scope.rb +15 -0
  47. data/lib/tensor_stream/version.rb +1 -1
  48. data/samples/iris.rb +8 -4
  49. data/samples/linear_regression.rb +1 -1
  50. data/samples/multigpu.rb +73 -0
  51. data/samples/nearest_neighbor.rb +3 -3
  52. data/tensor_stream.gemspec +1 -1
  53. data/test_samples/raw_neural_net_sample.rb +4 -1
  54. metadata +21 -6
@@ -0,0 +1,46 @@
1
+ // same dimension element-wise min op
2
+ % c_dtype = dtype_to_c_type(dtype)
3
+ __kernel void min_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
+ // Get the index of the current element to be processed
5
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
+
8
+ C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
9
+ }
10
+
11
+ // 1D + scalar min op (B is a single value, read as B[0])
12
+ __kernel void min_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
+ // Get the index of the current element to be processed
14
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
16
+
17
+ if (switch_op == 0) {
18
+ C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[0] ? A[globalRow * N + globalCol] : B[0];
19
+ } else {
20
+ C[globalRow * N + globalCol] = B[0] <= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
21
+ }
22
+ }
23
+
24
+ // min op with broadcast (B indices wrap modulo M2 x N2)
25
+ __kernel void min_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
26
+ // Get the index of the current element to be processed
27
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
28
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
29
+
30
+ int b_m_index = globalRow;
31
+ int b_n_index = globalCol;
32
+
33
+ if ( b_m_index >= M2) {
34
+ b_m_index = b_m_index % M2;
35
+ };
36
+
37
+ if (b_n_index >= N2) {
38
+ b_n_index = b_n_index % N2;
39
+ }
40
+
41
+ if (switch_op == 0) {
42
+ C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
43
+ } else {
44
+ C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
45
+ }
46
+ }
@@ -0,0 +1,3 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ % op = operator_to_c('div')
3
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'real_div', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -0,0 +1,27 @@
1
+ // First naive implementation
2
+ % c_dtype = dtype_to_c_type(dtype)
3
+ __kernel void softmax_cross_<%= dtype %>(const int N,
4
+ const __global <%= c_dtype %>* A,
5
+ const __global <%= c_dtype %>* L,
6
+ __global <%= c_dtype %>* C) {
7
+
8
+ // Get the index of the current element to be processed
9
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
10
+
11
+ // Compute a single element (loop over K)
12
+ <%= c_dtype %> acc = 0.0f;
13
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
14
+
15
+ for (int k=0; k<N; k++) {
16
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
17
+ }
18
+
19
+ for (int k=0; k<N; k++) {
20
+ acc += exp(A[globalRow*N + k] - max);
21
+ }
22
+
23
+ // Store the result
24
+ for (int k=0; k < N; k++) {
25
+ C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
26
+ }
27
+ }
@@ -0,0 +1,28 @@
1
+ // First naive implementation
2
+ % c_dtype = dtype_to_c_type(dtype)
3
+ __kernel void softmax_cross_grad_<%= dtype %>(const int N,
4
+ const __global <%= c_dtype %>* A,
5
+ const __global <%= c_dtype %>* L,
6
+ const __global <%= c_dtype %>* G,
7
+ __global <%= c_dtype %>* C) {
8
+
9
+ // Get the index of the current element to be processed
10
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
11
+
12
+ // Compute a single element (loop over K)
13
+ <%= c_dtype %> acc = 0.0f;
14
+ <%= c_dtype %> max = <%= min_value_for(dtype) %>;
15
+
16
+ for (int k=0; k<N; k++) {
17
+ max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
18
+ }
19
+
20
+ for (int k=0; k<N; k++) {
21
+ acc += exp(A[globalRow*N + k] - max);
22
+ }
23
+
24
+ // Store the result
25
+ for (int k=0; k < N; k++) {
26
+ C[globalRow*N + k] = ((exp(A[globalRow*N + k] - max)/acc) * G[globalRow*N + k] - L[globalRow*N + k]);
27
+ }
28
+ }
@@ -1,10 +1,11 @@
1
1
  module TensorStream
2
+ # Buffer used by the OpenCL evaluator
2
3
  class OpenCLBuffer < Buffer
3
4
  include ArrayOpsHelper
4
5
 
5
6
  attr_accessor :shape, :buffer, :cl_buffer, :op
6
7
 
7
- def initialize(data_type: , shape:, buffer:, cl_buffer:, op: nil, name: nil)
8
+ def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
8
9
  @data_type = data_type
9
10
  @shape = shape
10
11
  @buffer = buffer
@@ -25,12 +26,10 @@ module TensorStream
25
26
  op.command_queue.finish
26
27
  self.dirty = false
27
28
  end
28
- result = buffer.reshape(*shape.map { |s| s.to_i}.reverse).to_a
29
29
 
30
- if data_type == :boolean
31
- result = process_function_op(result, ->(a, _b) { a != 0 })
32
- end
30
+ result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
31
+ result = process_function_op(result, ->(a, _b) { a != 0 }) if data_type == :boolean
33
32
  result
34
33
  end
35
34
  end
36
- end
35
+ end
@@ -102,6 +102,7 @@ module TensorStream
102
102
 
103
103
  def complete_eval(tensor, context)
104
104
  buffer = _run(tensor, context)
105
+
105
106
  if buffer.is_a?(Array)
106
107
  buffer = buffer.collect do |b|
107
108
  next b if b.buffer.size.zero?
@@ -109,7 +110,8 @@ module TensorStream
109
110
  b
110
111
  end
111
112
  else
112
- return buffer if buffer.nil?
113
+ return buffer.outputs[0] if buffer.is_a?(OutputGroup)
114
+ return buffer if buffer.nil?
113
115
  return [] if buffer.buffer.nil?
114
116
  return buffer if buffer.buffer.size.zero?
115
117
  _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
@@ -150,13 +152,6 @@ module TensorStream
150
152
  @opencl_context = OpenCL.create_context(opencl_device)
151
153
  end
152
154
 
153
- def choose_best_device
154
- @best_device ||= begin
155
- devices = OpenclEvaluator.query_devices_with_score
156
- devices.sort { |a| a[1] }.reverse.first
157
- end
158
- end
159
-
160
155
  def self.query_devices_with_score
161
156
  OpenCL.platforms.flat_map do |p|
162
157
 
@@ -282,18 +277,82 @@ module TensorStream
282
277
  assign_var(tensor, value, context)
283
278
  end
284
279
 
280
+ # Fast in place multiply subtract assign
281
+ register_op :apply_gradient_descent do |_context, tensor, inputs|
282
+ _target_var, learning_rate, delta = inputs
283
+
284
+ assign = tensor.inputs[0] || tensor
285
+
286
+ unless assign.buffer
287
+ value = read_final_result(buffer)
288
+ assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
289
+ assign.value = value
290
+ end
291
+
292
+ assign.buffer.dirty = true # force buffer copy when variable is read externally
293
+ output_buffer = assign.buffer
294
+
295
+ m, n = output_buffer.shape
296
+ work_group = [m || 1, n || 1]
297
+ cl_m = OpenCL::Int1.new(m || 1)
298
+ cl_n = OpenCL::Int1.new(n || 1)
299
+
300
+ event_wait_list = [assign.buffer.op, learning_rate.op, delta.op].compact # add dependency wait list
301
+ method_call = :"apply_gradient_#{output_buffer.data_type}"
302
+ event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
303
+ output_buffer.op = event
304
+ output_buffer
305
+ end
306
+
285
307
  %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
286
308
  register_op op, noop: true do |context, tensor, inputs|
287
309
  execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
288
310
  end
289
311
  end
290
312
 
291
- %i[max add div sub mod mul pow sigmoid_grad squared_difference].each do |op|
313
+ %i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
292
314
  register_op op, noop: true do |context, tensor, inputs|
293
315
  execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
294
316
  end
295
317
  end
296
318
 
319
+ register_op :add_n do |_context, tensor, inputs|
320
+ if inputs.size == 1
321
+ inputs[0]
322
+ else
323
+ m, n = inputs[0].shape
324
+ work_group = [m || 1, n || 1]
325
+ cl_m = OpenCL::Int1.new(m || 1)
326
+ cl_n = OpenCL::Int1.new(n || 1)
327
+ cl_switch = OpenCL::Int1.new(0)
328
+ dtype = tensor.data_type
329
+
330
+ output_buffer = _create_result_buffer(tensor.data_type, inputs[0].shape, "out_#{tensor.name}")
331
+ inputs_queue = inputs.dup
332
+ a = inputs_queue.pop
333
+ until inputs_queue.empty?
334
+ b = inputs_queue.pop
335
+ event_wait_list = [a.op, b.op].compact
336
+ method_call = :"add_#{a.data_type}_#{b.data_type}"
337
+ event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
338
+ a = output_buffer
339
+ a.op = event
340
+ end
341
+
342
+ output_buffer.op = a.op
343
+ output_buffer
344
+ end
345
+ end
346
+
347
+ register_op :expand_dims, buffer: true do |_context, tensor, inputs|
348
+ axis = inputs[1].buffer[0]
349
+ shape = inputs[0].shape.dup
350
+ axis = -axis if axis == shape.size
351
+ new_shape = shape.insert(axis, 1).compact
352
+ new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
353
+ convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
354
+ end
355
+
297
356
  register_op :floor_div, noop: true do |context, tensor, inputs|
298
357
  if fp_type?(tensor.data_type)
299
358
  execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
@@ -307,7 +366,7 @@ module TensorStream
307
366
  execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
308
367
  end
309
368
 
310
- register_op :matmul do |_context, tensor, inputs|
369
+ register_op :mat_mul do |_context, tensor, inputs|
311
370
  a, b = inputs
312
371
 
313
372
  m = a.shape[0]
@@ -355,7 +414,7 @@ module TensorStream
355
414
  end
356
415
  end
357
416
 
358
- %i[sign exp tan sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil].each do |op|
417
+ %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil].each do |op|
359
418
  register_op op, noop: true do |context, tensor, inputs|
360
419
  execute_func(op.to_s, tensor, inputs[0], context)
361
420
  end
@@ -377,6 +436,57 @@ module TensorStream
377
436
  output_buffer
378
437
  end
379
438
 
439
+ register_op :log_softmax do |_context, tensor, inputs|
440
+ a = inputs[0] # logits
441
+ event_wait_list = [a.op].compact
442
+ dtype = tensor.data_type
443
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
444
+
445
+ m, n = a.shape
446
+ work_group = [m]
447
+ n = m if n.nil?
448
+ cl_n = OpenCL::Int1.new(n || 1)
449
+
450
+ event = _cl_program("log_softmax", dtype: dtype).send(:"log_softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
451
+ output_buffer.op = event
452
+ output_buffer
453
+ end
454
+
455
+ register_op :softmax_cross_entropy_with_logits_v2 do |_context, tensor, inputs|
456
+ a = inputs[0] # logits
457
+ b = inputs[1] # labels
458
+ event_wait_list = [a.op, b.op].compact
459
+ dtype = tensor.data_type
460
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
461
+
462
+ m, n = a.shape
463
+ work_group = [m]
464
+ n = m if n.nil?
465
+ cl_n = OpenCL::Int1.new(n || 1)
466
+
467
+ event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
468
+ output_buffer.op = event
469
+ output_buffer
470
+ end
471
+
472
+ register_op :softmax_cross_entropy_with_logits_v2_grad do |_context, tensor, inputs|
473
+ a = inputs[0] # logits
474
+ b = inputs[1] # labels
475
+ c = inputs[2] # grads
476
+ event_wait_list = [a.op, b.op, c.op].compact
477
+ dtype = tensor.data_type
478
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
479
+
480
+ m, n = a.shape
481
+ work_group = [m]
482
+ n = m if n.nil?
483
+ cl_n = OpenCL::Int1.new(n || 1)
484
+
485
+ event = _cl_program("softmax_cross_grad", dtype: dtype).send(:"softmax_cross_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer, c.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
486
+ output_buffer.op = event
487
+ output_buffer
488
+ end
489
+
380
490
  register_op :softmax_grad do |_context, tensor, inputs|
381
491
  a, grad = inputs
382
492
 
@@ -417,30 +527,6 @@ module TensorStream
417
527
  end
418
528
  end
419
529
 
420
- register_op :truncate do |_context, tensor, inputs|
421
- a, b = inputs
422
- if a.shape.size.zero?
423
- a
424
- else
425
- input_b = read_final_result(b)
426
- if a.shape == input_b
427
- a
428
- else
429
- input_a = read_final_result(a)
430
- if input_b == []
431
- if a.buffer.size == 1
432
- a.shape = input_b
433
- a
434
- else
435
- wrap_opencl(a.buffer[0], data_type: a.data_type, name: tensor.name)
436
- end
437
- else
438
- wrap_opencl(truncate(input_a, input_b), data_type: a.data_type, name: tensor.name)
439
- end
440
- end
441
- end
442
- end
443
-
444
530
  register_op :print do |context, tensor, inputs|
445
531
  a, b = inputs
446
532
  input_b = complete_eval(b, context)
@@ -475,23 +561,27 @@ module TensorStream
475
561
  convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
476
562
  end
477
563
 
478
- register_op :index, buffer: true do |_context, tensor, inputs|
479
- a = inputs[0]
480
- input_a = read_final_result(a)
481
- index = read_final_result(inputs[1])
564
+ register_op :index, noop: true do |context, tensor, inputs|
565
+ a = _run(inputs[0], context)
566
+ index = read_final_result(_run(inputs[1], context))
482
567
 
483
- if a.is_a?(Array)
484
- a[index]
568
+ if a.is_a?(OutputGroup)
569
+ a.outputs[index]
485
570
  else
486
- new_shape = a.shape.dup
487
- new_shape.shift
488
- convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
571
+ if a.is_a?(Array)
572
+ a[index]
573
+ else
574
+ new_shape = a.shape.dup
575
+ new_shape.shift
576
+ input_a = read_final_result(a)
577
+ convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
578
+ end
489
579
  end
490
580
  end
491
581
 
492
582
  register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
493
583
  rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
494
- [ wrap_opencl(rx, data_type: :int32, name: "#{tensor.name}"), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")]
584
+ OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: "#{tensor.name}"), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")])
495
585
  end
496
586
 
497
587
  register_op :shape do |_context, tensor, inputs|
@@ -537,6 +627,9 @@ module TensorStream
537
627
 
538
628
  register_op :argmin, buffer: true do |_context, tensor, inputs|
539
629
  axis = tensor.options[:axis] || 0
630
+ rank = inputs[0].shape.size
631
+ raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
632
+
540
633
  arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
541
634
  op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
542
635
  convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
@@ -544,6 +637,9 @@ module TensorStream
544
637
 
545
638
  register_op :argmax, buffer: true do |_context, tensor, inputs|
546
639
  axis = tensor.options[:axis] || 0
640
+ rank = inputs[0].shape.size
641
+ raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
642
+
547
643
  arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
548
644
  op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
549
645
  convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
@@ -559,7 +655,7 @@ module TensorStream
559
655
  # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
560
656
  if tensor.breakpoint
561
657
  a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
562
- b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
658
+ b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
563
659
  a = read_final_result(complete_eval(a, child_context))
564
660
  b = read_final_result(complete_eval(b, child_context))
565
661
  result = read_final_result(complete_eval(result, child_context))
@@ -581,6 +677,8 @@ module TensorStream
581
677
  end
582
678
  rescue EvaluatorExcecutionException => e
583
679
  raise e
680
+ rescue TensorStreamError => e
681
+ raise e
584
682
  rescue StandardError => e
585
683
  _opencl_queue.finish # dump queue
586
684
  puts e.message
@@ -614,7 +712,8 @@ module TensorStream
614
712
  else
615
713
  wrap_opencl(tensor, name: tensor.name)
616
714
  end
617
- @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
715
+ @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
716
+ @context[cache_key]
618
717
  end
619
718
 
620
719
  private
@@ -625,11 +724,11 @@ module TensorStream
625
724
 
626
725
  if assign.buffer
627
726
  # buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
628
- if assign.buffer.cl_buffer != buffer.cl_buffer
629
- assign.buffer.op = _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: [buffer.op, assign.buffer.op])
630
- else
631
- assign.buffer.op = buffer.op
632
- end
727
+ assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
728
+ _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: [buffer.op, assign.buffer.op])
729
+ else
730
+ buffer.op
731
+ end
633
732
  else
634
733
  value = read_final_result(buffer)
635
734
  assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
@@ -660,12 +759,12 @@ module TensorStream
660
759
  method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
661
760
  event = if prog == "#{op_name}_b"
662
761
  cl_m_b, cl_n_b = if b.shape.size == 2
663
- [ OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1]) ]
664
- elsif b.shape.size == 1
665
- [ OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0]) ]
666
- else
667
- raise "rank > 2 not supported!"
668
- end
762
+ [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
763
+ elsif b.shape.size == 1
764
+ [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
765
+ else
766
+ raise "rank > 2 not supported!"
767
+ end
669
768
  _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
670
769
  else
671
770
  _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
@@ -691,7 +790,7 @@ module TensorStream
691
790
  cl_n = OpenCL::Int1.new(n || 1)
692
791
 
693
792
  event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
694
- output_buffer.op = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
793
+ output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
695
794
  output_buffer
696
795
  end
697
796
 
@@ -706,7 +805,7 @@ module TensorStream
706
805
  cl_m = OpenCL::Int1.new(m || 1)
707
806
  cl_n = OpenCL::Int1.new(n || 1)
708
807
 
709
- event = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
808
+ event = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
710
809
  output_buffer.op = event
711
810
  output_buffer
712
811
  end
@@ -741,60 +840,58 @@ module TensorStream
741
840
 
742
841
  def wrap_opencl(tensor, data_type: nil, name: nil)
743
842
  value, shape = if tensor.is_a?(Tensor)
744
- [tensor.value, tensor.shape.shape]
745
- else
746
- [tensor , shape_eval(tensor)]
747
- end
843
+ [tensor.value, tensor.shape.shape]
844
+ else
845
+ [tensor, shape_eval(tensor)]
846
+ end
748
847
 
749
848
  convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
750
849
  end
751
850
 
752
851
  def convert_to_opencl(value, shape, data_type: nil, name: nil)
753
- if !value.is_a?(Array) && !value.is_a?(NArray)
754
- value = [value]
755
- end
852
+ value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)
756
853
 
757
854
  cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
758
- cl_object = if name && @context[:_cache][cache_key]
855
+ cl_object = if name && @context[:_cache][cache_key]
759
856
  @context[:_cache][cache_key]
760
- else
761
- narray_size = shape.reduce(:*) || 1
857
+ else
858
+ narray_size = shape.reduce(:*) || 1
762
859
 
763
- buffer = if value.is_a?(NArray)
764
- value
765
- else
766
- allocate_narray_for_type(data_type, narray_size)
767
- end
860
+ buffer = if value.is_a?(NArray)
861
+ value
862
+ else
863
+ allocate_narray_for_type(data_type, narray_size)
864
+ end
768
865
 
769
- cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
866
+ cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
770
867
 
771
- cl_buffer = if !value.flatten.empty?
772
- cl_buffer_size = 1 if cl_buffer_size.zero?
773
- _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
774
- else
775
- nil
776
- end
868
+ cl_buffer = unless value.flatten.empty?
869
+ cl_buffer_size = 1 if cl_buffer_size.zero?
870
+ _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
871
+ end
777
872
 
778
- @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
779
- end
873
+ @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
874
+ end
780
875
 
781
876
  if value.is_a?(Array)
782
877
  value.flatten.each_with_index do |element, index|
783
- if element.is_a?(Tensor)
784
- cl_object.buffer[index] = read_final_result(complete_eval(element, {}))
785
- else
786
- cl_object.buffer[index] = ( data_type == :boolean ? ( element ? 1 : 0 ) : Tensor.cast_dtype(element, data_type))
787
- end
878
+ cl_object.buffer[index] = if element.is_a?(Tensor)
879
+ read_final_result(complete_eval(element, {}))
880
+ elsif data_type == :boolean
881
+ element ? 1 : 0
882
+ else
883
+ Tensor.cast_dtype(element, data_type)
884
+ end
788
885
  end
789
886
  elsif value.is_a?(NArray)
790
887
  cl_object.buffer = value
888
+ elsif data_type == :boolean
889
+ cl_object.buffer[0] = element ? 1 : 0
791
890
  else
792
- cl_object.buffer[0] = ( data_type == :boolean ? ( element ? 1 : 0 ) : Tensor.cast_dtype(value, data_type))
891
+ cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
793
892
  end
794
893
 
795
- write_op = if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
796
- _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
797
- end
894
+ write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer) if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
798
895
  cl_object.op = write_op
799
896
  cl_object
800
897
  end
@@ -861,7 +958,7 @@ module TensorStream
861
958
 
862
959
  def _reduced_shape(input_shape, axes)
863
960
  return [] if axes.nil? # reduce to scalar
864
- axes = [ axes ] unless axes.is_a?(Array)
961
+ axes = [axes] unless axes.is_a?(Array)
865
962
  return input_shape if axes.empty?
866
963
 
867
964
  axes.each do |dimen|
@@ -882,8 +979,7 @@ module TensorStream
882
979
  rank = input.shape.size - 1
883
980
 
884
981
  if axis.is_a?(Array)
885
- axis.map{ |x| rank - x.abs }.sort.reverse.each do |x|
886
-
982
+ axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
887
983
  value = value.send(func, x.to_i)
888
984
  end
889
985
  else
@@ -891,75 +987,21 @@ module TensorStream
891
987
  end
892
988
 
893
989
  new_shape = if value.is_a?(NArray)
894
- value.shape.reverse
895
- else
896
- value = [value]
897
- []
898
- end
990
+ value.shape.reverse
991
+ else
992
+ value = [value]
993
+ []
994
+ end
899
995
 
900
- if tensor.options[:keepdims]
901
- new_shape = _reduced_shape(input.shape.dup, axis)
902
- end
996
+ new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
903
997
 
904
998
  convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
905
999
  end
906
1000
  end
907
1001
 
908
- def arr_pad(arr, paddings, data_type = :float32, rank = 0)
909
- raise "padding #{paddings[rank]} needs to have to elements [before, after]" if paddings[rank].size != 2
910
-
911
- before = paddings[rank][0]
912
- after = paddings[rank][1]
913
- pad_value = fp_type?(data_type) ? 0.0 : 0
914
- if arr[0].is_a?(Array)
915
- next_dim_elem = arr.collect { |a| arr_pad(a, paddings, data_type, rank + 1) }
916
- padding = deep_dup_array(next_dim_elem[0], pad_value)
917
- Array.new(before) { padding } + next_dim_elem + Array.new(after) { padding }
918
- else
919
- Array.new(before) { pad_value } + arr + Array.new(after) { pad_value }
920
- end
921
- end
922
-
923
- def deep_dup_array(arr, value = nil)
924
- if arr.is_a?(Array)
925
- arr.dup.collect do |a|
926
- deep_dup_array(a, value)
927
- end
928
- else
929
- value.nil? ? arr : value
930
- end
931
- end
932
-
933
- def matmul_const_transform(mat, mat_b, tensor)
934
- if !mat.is_a?(Array)
935
- compat_shape = shape_eval(mat_b).reverse
936
- func = -> { tensor.data_type == :int32 ? mat.to_i : mat.to_f }
937
-
938
- generate_vector(compat_shape, generator: func)
939
- else
940
- mat
941
- end
942
- end
943
-
944
- # determine possible reduction axis to be used
945
- def _broadcast_gradient_op(vector_shape1, vector_shape2, level)
946
- va_rank = _rank_from_shape(vector_shape1)
947
- vb_rank = _rank_from_shape(vector_shape2)
948
- return [] if vector_shape1 == vector_shape2 # same shape so no reductions
949
-
950
- shape2_r = vector_shape2.reverse
951
-
952
- vector_shape1.reverse.each_with_index.collect do |s, index|
953
- next va_rank - index - 1 if index >= shape2_r.size
954
- next nil if shape2_r[index] == s
955
- next nil if shape2_r[index] > s
956
- va_rank - index - 1
957
- end.compact
958
- end
959
-
960
1002
  # selects variants of cl programs depending on input
961
1003
  def select_program(input_a, input_b, op)
962
- return [input_a, input_b, "#{op}", 0] if input_a.shape == input_b.shape
1004
+ return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape
963
1005
 
964
1006
  return [input_b, input_a, "#{op}_c", 1] if input_a.shape.empty? || input_a.shape.reduce(:*) == 1 # A is scalar?
965
1007
  return [input_a, input_b, "#{op}_c", 0] if input_b.shape.empty? || input_a.shape.reduce(:*) == 1 # B is scalar?
@@ -979,26 +1021,6 @@ module TensorStream
979
1021
  shape.is_a?(Array) ? shape.size : 0
980
1022
  end
981
1023
 
982
- def concat_array(values, axis)
983
- combined_array = values.shift
984
- axis = get_rank(combined_array) - 1 if axis == -1
985
-
986
- values.each do |v|
987
- combined_array = concat(combined_array, v, axis)
988
- end
989
- combined_array
990
- end
991
-
992
- def concat(a, b, axis)
993
- if axis.zero?
994
- a + b
995
- else
996
- a.each_with_index.collect do |i, index|
997
- concat(i, b[index], axis - 1)
998
- end
999
- end
1000
- end
1001
-
1002
1024
  def resolve_placeholder(placeholder, _execution_context = {})
1003
1025
  return nil if placeholder.nil?
1004
1026
 
@@ -1014,43 +1036,6 @@ module TensorStream
1014
1036
  Tensor.cast_dtype(var, placeholder.data_type)
1015
1037
  end
1016
1038
 
1017
- def reduce_axis(current_axis, axis, val, keep_dims, f = ->(a, b) { a + b })
1018
- return val unless val.is_a?(Array)
1019
-
1020
- r = val.collect do |v|
1021
- reduce_axis(current_axis + 1, axis, v, keep_dims, f)
1022
- end
1023
-
1024
- should_reduce_axis = axis.nil? || (axis.is_a?(Array) && axis.include?(current_axis)) || (current_axis == axis)
1025
-
1026
- if should_reduce_axis
1027
- reduced_val = r[0]
1028
- if r.size > 1
1029
- reduced_val = f.call(r[0..val.size])
1030
- elsif r.size.zero?
1031
- reduced_val = f.call(nil)
1032
- end
1033
- keep_dims ? [ reduced_val ] : reduced_val
1034
- else
1035
- r
1036
- end
1037
- end
1038
-
1039
- # handle 3 tensor math operations
1040
- def call_3way_vector_op(v_a, v_b, v_c, child_context, op = ->(a, b, c) { a + b + c })
1041
- return op.call(v_a, v_b, v_c) unless v_a.is_a?(Array)
1042
-
1043
- v_a.each_with_index.collect do |v1, index|
1044
- v2 = v_b[index]
1045
- v3 = v_c[index]
1046
- if v1.is_a?(Array)
1047
- call_3way_vector_op(v1, v2, v3, child_context, op)
1048
- else
1049
- op.call(v1, v2, v3)
1050
- end
1051
- end
1052
- end
1053
-
1054
1039
  def all_true?(arr)
1055
1040
  if arr.is_a?(Array) || arr.is_a?(NArray)
1056
1041
  arr.each do |a|
@@ -1061,58 +1046,8 @@ module TensorStream
1061
1046
 
1062
1047
  arr != 0
1063
1048
  end
1064
-
1065
- def generate_vector(shape, dtype: :float32, generator:)
1066
- if shape.is_a?(Integer)
1067
- Array.new(shape) do
1068
- generator.call
1069
- end
1070
- elsif shape.size > 1
1071
- Array.new(shape[0]) do
1072
- generate_vector(shape[1..shape.size], generator: generator, dtype: dtype)
1073
- end
1074
- elsif shape.size == 1
1075
- Array.new(shape[0]) do
1076
- generator.call
1077
- end
1078
- elsif shape.size.zero?
1079
- generator.call
1080
- end
1081
- end
1082
-
1083
- def _get_randomizer(tensor, seed)
1084
- if tensor.graph.random_seed && seed
1085
- Random.new(tensor.graph.random_seed ^ seed)
1086
- elsif tensor.graph.random_seed
1087
- @session.randomizer[tensor.graph.object_id] ||= Random.new(tensor.graph.random_seed)
1088
- @session.randomizer[tensor.graph.object_id]
1089
- elsif seed
1090
- @session.randomizer[tensor.operation] ||= Random.new(seed)
1091
- @session.randomizer[tensor.operation]
1092
- else
1093
- Random.new
1094
- end
1095
- end
1096
-
1097
- def dump_intermediates
1098
- arr = []
1099
- arr << "============== start ==================="
1100
- @context[:compute_history].each_with_index do |history, index|
1101
- arr << "------------------------------------"
1102
- arr << history[:name]
1103
- arr << "#{history[:type]} #{history[:shape]}"
1104
- arr << history[:source]
1105
- arr << history[:description]
1106
- arr << ""
1107
- arr << history[:value].to_json
1108
- arr << "------------------------------------"
1109
- end
1110
- arr << "============== end ====================="
1111
- str = arr.join("\n")
1112
- File.write("/tmp/intermediates.txt", str)
1113
- end
1114
1049
  end
1115
1050
  end
1116
1051
  end
1117
1052
 
1118
- TensorStream::Evaluator.register_evaluator(TensorStream::Evaluator::OpenclEvaluator, 'opencl', 1)
1053
+ TensorStream::Evaluator.register_evaluator(TensorStream::Evaluator::OpenclEvaluator, 'opencl', 1)