tensor_stream 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/README.md +38 -17
  4. data/benchmark/benchmark.rb +16 -20
  5. data/lib/tensor_stream/control_flow.rb +3 -3
  6. data/lib/tensor_stream/debugging/debugging.rb +4 -4
  7. data/lib/tensor_stream/device.rb +5 -2
  8. data/lib/tensor_stream/evaluator/base_evaluator.rb +138 -0
  9. data/lib/tensor_stream/evaluator/buffer.rb +7 -2
  10. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/_bool_operand.cl +3 -3
  11. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/_operand.cl +0 -0
  12. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/abs.cl +0 -0
  13. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/add.cl +1 -1
  14. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/argmax.cl +0 -0
  15. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/argmin.cl +0 -0
  16. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/cast.cl +0 -0
  17. data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +6 -0
  18. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/cos.cl +0 -0
  19. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/div.cl.erb +1 -1
  20. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/exp.cl +0 -0
  21. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/gemm.cl +0 -0
  22. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/log.cl +0 -0
  23. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/log1p.cl +0 -0
  24. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/max.cl +3 -3
  25. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/mul.cl +1 -1
  26. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/negate.cl +0 -0
  27. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/pow.cl +3 -3
  28. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/reciprocal.cl +0 -0
  29. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/round.cl +0 -0
  30. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sigmoid.cl +0 -0
  31. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sigmoid_grad.cl +3 -3
  32. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sign.cl +1 -1
  33. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sin.cl +0 -0
  34. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/softmax.cl +0 -0
  35. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/softmax_grad.cl +0 -0
  36. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sqrt.cl +0 -0
  37. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/square.cl +0 -0
  38. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sub.cl +1 -1
  39. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tan.cl +0 -0
  40. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tanh.cl +0 -0
  41. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tanh_grad.cl +0 -0
  42. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/where.cl +1 -1
  43. data/lib/tensor_stream/evaluator/{opencl_buffer.rb → opencl/opencl_buffer.rb} +1 -1
  44. data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +5 -0
  45. data/lib/tensor_stream/evaluator/{opencl_evaluator.rb → opencl/opencl_evaluator.rb} +404 -452
  46. data/lib/tensor_stream/evaluator/{opencl_template_helper.rb → opencl/opencl_template_helper.rb} +6 -6
  47. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +21 -21
  48. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +492 -398
  49. data/lib/tensor_stream/graph.rb +21 -1
  50. data/lib/tensor_stream/graph_serializers/graphml.rb +59 -59
  51. data/lib/tensor_stream/graph_serializers/pbtext.rb +1 -1
  52. data/lib/tensor_stream/helpers/op_helper.rb +6 -2
  53. data/lib/tensor_stream/math_gradients.rb +7 -7
  54. data/lib/tensor_stream/operation.rb +100 -100
  55. data/lib/tensor_stream/session.rb +81 -8
  56. data/lib/tensor_stream/tensor.rb +7 -5
  57. data/lib/tensor_stream/utils.rb +32 -19
  58. data/lib/tensor_stream/version.rb +1 -1
  59. data/tensor_stream.gemspec +0 -1
  60. data/test_samples/raw_neural_net_sample.rb +7 -7
  61. metadata +41 -53
  62. data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +0 -5
@@ -1,3 +1,3 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  % op = operator_to_c('add')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'add', dtype: dtype, result_t: c_dtype %>
3
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'add', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -0,0 +1,6 @@
1
+ % ["#{dtype}"].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
2
+ % a_dtype = dtype_to_c_type(a)
3
+ % b_dtype = dtype_to_c_type(b)
4
+ % op = operator_to_c(fname)
5
+ <%= render 'bool_operand.cl', a_dtype: a_dtype, b_dtype: b_dtype, op: op, fname: fname, dtype: "#{a}_#{b}", result_t: 'short' %>
6
+ % end
@@ -1,3 +1,3 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  % op = operator_to_c('div')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'div', dtype: dtype, result_t: c_dtype %>
3
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'div', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,6 +1,6 @@
1
1
  // same dimension add floating point op
2
2
  % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void max_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
3
+ __kernel void max_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
5
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
6
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -9,7 +9,7 @@
9
9
  }
10
10
 
11
11
  // 1D + Scalar floating point add op
12
- __kernel void max_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
12
+ __kernel void max_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
13
  // Get the index of the current element to be processed
14
14
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
15
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -22,7 +22,7 @@
22
22
  }
23
23
 
24
24
  // 1D + Scalar floating point add op broadcast
25
- __kernel void max_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
25
+ __kernel void max_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
26
26
  // Get the index of the current element to be processed
27
27
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
28
28
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,3 +1,3 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  % op = operator_to_c('mul')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mul', dtype: dtype, result_t: c_dtype %>
3
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mul', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,6 +1,6 @@
1
1
  // same dimension add floating point op
2
2
  % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void pow_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
3
+ __kernel void pow_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
5
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
6
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -9,7 +9,7 @@
9
9
  }
10
10
 
11
11
  // 1D + Scalar floating point add op
12
- __kernel void pow_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
12
+ __kernel void pow_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
13
  // Get the index of the current element to be processed
14
14
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
15
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -22,7 +22,7 @@
22
22
  }
23
23
 
24
24
  // 1D + Scalar floating point add op broadcast
25
- __kernel void pow_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
25
+ __kernel void pow_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
26
26
  // Get the index of the current element to be processed
27
27
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
28
28
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -9,7 +9,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
9
9
  }
10
10
 
11
11
  // same dimension add floating point op
12
- __kernel void sigmoid_grad_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
12
+ __kernel void sigmoid_grad_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
13
  // Get the index of the current element to be processed
14
14
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
15
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -18,7 +18,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
18
18
  }
19
19
 
20
20
  // 1D + Scalar floating point add op
21
- __kernel void sigmoid_grad_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
21
+ __kernel void sigmoid_grad_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
22
22
  // Get the index of the current element to be processed
23
23
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
24
24
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -31,7 +31,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
31
31
  }
32
32
 
33
33
  // 1D + Scalar floating point add op broadcast
34
- __kernel void sigmoid_grad_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
34
+ __kernel void sigmoid_grad_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
35
35
  // Get the index of the current element to be processed
36
36
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
37
37
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -5,7 +5,7 @@ __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_d
5
5
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
6
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
7
  <%= c_dtype %> value = A[globalRow * N + globalCol];
8
- % if is_floating_point?(dtype)
8
+ % if floating_point?(dtype)
9
9
  if (isnan(value) || value == 0.0f) {
10
10
  C[globalRow * N + globalCol] = 0.0;
11
11
  } else {
@@ -1,3 +1,3 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  % op = operator_to_c('sub')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: dtype, result_t: c_dtype %>
3
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,5 +1,5 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void where_<%= dtype %>(const int M, const int N, __global const int *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
2
+ __kernel void where_<%= dtype %>(const int M, const int N, __global const short *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
4
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
5
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -2,7 +2,7 @@ module TensorStream
2
2
  class OpenCLBuffer < Buffer
3
3
  include ArrayOpsHelper
4
4
 
5
- attr_accessor :data_type, :shape, :buffer, :cl_buffer, :op
5
+ attr_accessor :shape, :buffer, :cl_buffer, :op
6
6
 
7
7
  def initialize(data_type: , shape:, buffer:, cl_buffer:, op: nil, name: nil)
8
8
  @data_type = data_type
@@ -0,0 +1,5 @@
1
+ module TensorStream
2
+ class OpenclDevice < TensorStream::Device
3
+ attr_accessor :native_device
4
+ end
5
+ end
@@ -1,11 +1,12 @@
1
1
  require 'tensor_stream/evaluator/operation_helpers/random_gaussian'
2
2
  require 'tensor_stream/evaluator/operation_helpers/array_ops_helper'
3
3
  require 'tensor_stream/evaluator/operation_helpers/math_helper'
4
- require 'tensor_stream/evaluator/opencl_buffer'
5
- require 'tensor_stream/evaluator/opencl_template_helper'
6
- require 'distribution'
4
+ require 'tensor_stream/evaluator/opencl/opencl_buffer'
5
+ require 'tensor_stream/evaluator/opencl/opencl_template_helper'
6
+ require 'tensor_stream/evaluator/opencl/opencl_device'
7
7
  require 'opencl_ruby_ffi'
8
8
  require 'narray_ffi'
9
+ require 'tensor_stream/evaluator/base_evaluator'
9
10
 
10
11
  module TensorStream
11
12
  module Evaluator
@@ -27,31 +28,78 @@ module TensorStream
27
28
  end
28
29
 
29
30
  ## PURE ruby evaluator used for testing and development
30
- class OpenclEvaluator
31
+ class OpenclEvaluator < BaseEvaluator
31
32
  attr_accessor :retain
32
33
 
33
34
  include TensorStream::OpHelper
34
35
  include TensorStream::ArrayOpsHelper
35
36
  include TensorStream::MathHelper
36
37
 
37
- def initialize(session, context, thread_pool: nil, log_intermediates: false, preferred_device: nil)
38
- @session = session
39
- @context = context
40
- @log_intermediates = log_intermediates
41
- @preferred_device = preferred_device
42
- @retain = context[:retain] || []
43
- @thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
44
- @context[:_cache][:_cl_buffers] ||= {} if @context[:_cache]
45
- @context[:compute_history] = [] if log_intermediates
38
+ def initialize(session, device, thread_pool: nil, log_intermediates: false)
39
+ super
40
+ _create_opencl_context(device.native_device)
41
+ @opencl_device = device.native_device
42
+ create_command_queue
43
+ end
44
+
45
+ def self.query_supported_devices
46
+ devices = query_devices_with_score
47
+ devices.sort { |a| a[1] }.reverse.map do |d|
48
+ opencl_to_device(d)
49
+ end
50
+ end
51
+
52
+ def self.fetch_device(query = [])
53
+ devices = query_devices_with_score
54
+ platform_devices = devices.select { |d| d[0].platform.to_s.downcase =~ /#{query[0].downcase}/ }
55
+ opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
56
+ end
57
+
58
+ def self.opencl_to_device(d)
59
+ device = d[0]
60
+ index = d[3]
61
+ platform_name = device.platform.name.gsub(' ', '_').downcase
62
+ uri = [platform_name, index].join(':')
63
+
64
+ device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
65
+
66
+ OpenclDevice.new(uri, device_type, self).tap do |d|
67
+ d.native_device = device
68
+ end
69
+ end
70
+
71
+ ##
72
+ # Select the best device available in the system for this evaluator
73
+ def self.default_device
74
+ devices = OpenclEvaluator.query_devices_with_score
75
+ device = devices.sort { |a| a[1] }.reverse.first
76
+ opencl_to_device(device)
46
77
  end
47
78
 
48
79
  # opencl evaluator main entrypoint
49
80
  def run(tensor, execution_context)
50
- _create_opencl_context
51
- create_command_queue
52
81
  read_final_result(complete_eval(tensor, execution_context))
53
82
  end
54
83
 
84
+ def run_with_buffer(tensor, context, execution_context)
85
+ @context = context
86
+ @context[:_cache][:_cl_buffers] ||= {} if context[:_cache]
87
+
88
+ if tensor.is_a?(Array)
89
+ tensor.collect do |t|
90
+ value = run(t, execution_context)
91
+ Buffer.new(data_type: t.data_type, buffer: value)
92
+ end
93
+ else
94
+ value = run(tensor, execution_context)
95
+ Buffer.new(data_type: tensor.data_type, buffer: value)
96
+ end
97
+ end
98
+
99
+ def convert_from_buffer(tensor, result)
100
+ convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
101
+ end
102
+
55
103
  def complete_eval(tensor, context)
56
104
  buffer = _run(tensor, context)
57
105
  if buffer.is_a?(Array)
@@ -69,11 +117,25 @@ module TensorStream
69
117
  end
70
118
 
71
119
  def opencl_device
72
- @context[:_cache][:_opencl_device]
120
+ @opencl_device
73
121
  end
74
122
 
75
123
  protected
76
124
 
125
+ def prepare_input(tensor, context, options = {})
126
+ return nil unless tensor
127
+ tensor = resolve_placeholder(tensor)
128
+ if options[:noop]
129
+ tensor
130
+ elsif options[:buffer]
131
+ complete_eval(tensor, context)
132
+ elsif options[:complete]
133
+ read_final_result(complete_eval(tensor, context))
134
+ else
135
+ _run(tensor, context)
136
+ end
137
+ end
138
+
77
139
  # read result from opencl and convert to ruby
78
140
  def read_final_result(buffer)
79
141
  return buffer.map { |b| read_final_result(b) } if buffer.is_a?(Array)
@@ -82,43 +144,37 @@ module TensorStream
82
144
  buffer.to_ruby
83
145
  end
84
146
 
85
- def _create_opencl_context
86
- @context[:_cache][:_opencl_device] ||= begin
87
- if @preferred_device
88
- @preferred_device
89
- else
90
- device, _score, _platform, _index = choose_best_device
91
- # puts "using #{device.name}"
92
- device
93
- end
94
- end
95
- @context[:cl_device] = opencl_device
96
- @context[:_cache][:_opencl_context] ||= OpenCL.create_context(opencl_device)
147
+ def _create_opencl_context(opencl_device)
148
+ @opencl_context = OpenCL.create_context(opencl_device)
97
149
  end
98
150
 
99
151
  def choose_best_device
100
152
  @best_device ||= begin
101
- devices = OpenCL.platforms.flat_map do |p|
102
-
103
- p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
104
- score = 0
105
- if d.type.to_s == 'CPU'
106
- score += 1
107
- elsif d.type.to_s == 'GPU'
108
- score += 4
109
- end
153
+ devices = OpenclEvaluator.query_devices_with_score
154
+ devices.sort { |a| a[1] }.reverse.first
155
+ end
156
+ end
110
157
 
111
- if d.platform.name == 'NVIDIA CUDA'
112
- score += 1000
113
- end
158
+ def self.query_devices_with_score
159
+ OpenCL.platforms.flat_map do |p|
114
160
 
115
- score += d.max_compute_units
116
- score += d.max_clock_frequency
161
+ p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
162
+ score = 0
163
+ if d.type.to_s == 'CPU'
164
+ score += 1
165
+ elsif d.type.to_s == 'GPU'
166
+ score += 4
167
+ end
117
168
 
118
- [d, score, p.name, index]
169
+ if d.platform.name == 'NVIDIA CUDA'
170
+ score += 1000
119
171
  end
172
+
173
+ score += d.max_compute_units
174
+ score += d.max_clock_frequency
175
+
176
+ [d, score, p.name, index]
120
177
  end
121
- devices.sort { |a| a[1] }.reverse.first
122
178
  end
123
179
  end
124
180
 
@@ -127,15 +183,15 @@ module TensorStream
127
183
  properties = []
128
184
  properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
129
185
  properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
130
- @context[:_cache][:_opencl_queue] ||= _opencl_context.create_command_queue(opencl_device, properties: properties)
186
+ @command_queue = _opencl_context.create_command_queue(opencl_device, properties: properties)
131
187
  end
132
188
 
133
189
  def _opencl_context
134
- @context[:_cache][:_opencl_context]
190
+ @opencl_context
135
191
  end
136
192
 
137
193
  def _opencl_queue
138
- @context[:_cache][:_opencl_queue]
194
+ @command_queue
139
195
  end
140
196
 
141
197
  def cl_template_path(kernel, extension)
@@ -144,7 +200,7 @@ module TensorStream
144
200
 
145
201
  def _cl_program(kernel, args = {})
146
202
  suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
147
- @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}"] ||= begin
203
+ @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
148
204
  filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
149
205
  source = File.read(filename)
150
206
  source = OpenclTemplateHelper.new(source).generate(args)
@@ -163,13 +219,16 @@ module TensorStream
163
219
  return tensor.map { |t| _run(t, execution_context) }
164
220
  end
165
221
 
166
- return tensor if retain.include?(tensor) # if var is in retain don't eval to value
167
-
168
222
  tensor = tensor.call if tensor.is_a?(Proc)
169
223
 
170
224
  child_context = execution_context.dup
171
225
  res = if tensor.is_a?(Operation)
172
- eval_operation(tensor, child_context)
226
+ if !self.class.ops.include?(tensor.operation.to_sym)
227
+ result = @session.delegate_to_evaluator(tensor, @context, execution_context)
228
+ convert_from_buffer(tensor, result)
229
+ else
230
+ eval_operation(tensor, child_context)
231
+ end
173
232
  elsif tensor.is_a?(Variable)
174
233
  eval_variable(tensor, child_context)
175
234
  elsif tensor.is_a?(Placeholder)
@@ -187,415 +246,306 @@ module TensorStream
187
246
  tensor.buffer
188
247
  end
189
248
 
190
- def eval_operation(tensor, child_context)
191
- return @context[tensor.name] if @context.key?(tensor.name)
192
- cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
193
- return @context[cache_key] if @context.key?(cache_key)
194
- a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
195
- b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]
196
- # puts tensor.name
197
- case tensor.operation
198
- when :concat
199
- input_a = read_final_result(complete_eval(a, child_context))
200
- arr = concat_array(input_a, tensor.options[:axis])
201
- convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
202
- when :cond
203
- pred = complete_eval(tensor.options[:pred], child_context)
204
- a = _run(a, child_context)
205
- b = _run(b, child_context)
206
-
207
- if all_true?(pred.buffer)
208
- a
209
- else
210
- b
211
- end
212
- when :identity
213
- _run(a, child_context)
214
- when :eye
215
- rows = complete_eval(a, child_context)
216
- columns = complete_eval(b, child_context)
217
- shape = [rows.buffer[0], columns.buffer[0]]
218
- eye_arr = Array.new(rows.buffer[0]) do |i|
219
- Array.new(columns.buffer[0]) do |col|
220
- if fp_type?(tensor.data_type)
221
- i == col ? 1.0 : 0.0
222
- else
223
- i == col ? 1 : 0
224
- end
225
- end
226
- end
249
+ register_op :log do |context, tensor, inputs|
250
+ execute_func('log', tensor, inputs[0], context)
251
+ end
227
252
 
228
- convert_to_opencl(eye_arr.flatten, shape, data_type: tensor.data_type, name: tensor.name)
229
- when :pad
230
- a = read_final_result(complete_eval(a, child_context))
231
- p = read_final_result(complete_eval(tensor.options[:paddings], child_context))
232
-
233
- padding = arr_pad(a, p, tensor.data_type)
234
- convert_to_opencl(padding.flatten, shape_eval(padding), data_type: tensor.data_type, name: tensor.name)
235
- when :tile
236
- input = read_final_result(complete_eval(a, child_context))
237
- multiples = read_final_result(complete_eval(b, child_context))
238
-
239
- rank = get_rank(input)
240
- raise '1D or higher tensor required' if rank.zero?
241
- raise "invalid multiple size passed #{rank} != #{multiples.size}" if rank != multiples.size
242
-
243
- tile = tile_arr(input, 0, multiples)
244
- arr = tile.nil? ? [] : tile
245
- convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
246
- when :assign
247
- assign_var(tensor, b, child_context)
248
- when :assign_add
249
- a = _run(a, child_context)
250
- b = _run(b, child_context)
251
- value = execute_2_operand_func('add', tensor, a, b, child_context)
252
- assign_var(tensor, value, child_context)
253
- when :assign_sub
254
- a = _run(a, child_context)
255
- b = _run(b, child_context)
256
-
257
- value = execute_2_operand_func('sub', tensor, a, b, child_context)
258
- assign_var(tensor, value, child_context)
259
- when :less
260
- execute_2_operand_func('less', tensor, a, b, child_context, 'cond')
261
- when :less_equal
262
- execute_2_operand_func('less_equal', tensor, a, b, child_context, 'cond')
263
- when :greater
264
- execute_2_operand_func('greater', tensor, a, b, child_context, 'cond')
265
- when :greater_equal
266
- execute_2_operand_func('greater_equal', tensor, a, b, child_context, 'cond')
267
- when :equal
268
- execute_2_operand_func('equal', tensor, a, b, child_context, 'cond')
269
- when :not_equal
270
- execute_2_operand_func('not_equal', tensor, a, b, child_context, 'cond')
271
- when :logical_and
272
- execute_2_operand_func('logical_and', tensor, a, b, child_context, 'cond')
273
- when :where
274
- pred = tensor.options[:pred]
275
- execute_cond_func('where', tensor, pred, a, b, child_context)
276
- when :max
277
- execute_2_operand_func('max', tensor, a, b, child_context)
278
- when :add
279
- execute_2_operand_func('add', tensor, a, b, child_context)
280
- when :div
281
- execute_2_operand_func('div', tensor, a, b, child_context)
282
- when :sub
283
- execute_2_operand_func('sub', tensor, a, b, child_context)
284
- when :matmul
285
- a = _run(a, child_context)
286
- b = _run(b, child_context)
287
-
288
- m = a.shape[0]
289
- n = b.shape[1]
290
- v = b.shape[0]
291
- k = a.shape[1]
292
-
293
- m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
294
- n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]
295
-
296
- result_shape = [m, n]
297
-
298
- raise "#{tensor.items[0].name} rank must be greater than 1" if a.shape.size < 2
299
- raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
300
- raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
301
-
302
- dtype = tensor.data_type
303
- a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
304
- output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
305
-
306
- cl_m = OpenCL::Int1.new(m)
307
- cl_n = OpenCL::Int1.new(n)
308
- cl_k = OpenCL::Int1.new(k)
309
-
310
- transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
311
- transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
312
-
313
- output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
314
- output_buffer
315
- when :mul
316
- execute_2_operand_func('mul', tensor, a, b, child_context)
317
- when :pow
318
- execute_2_operand_func('pow', tensor, a, b, child_context)
319
- when :cast
320
- a = _run(a, child_context)
321
- if a.data_type != tensor.data_type
322
- buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
323
- m, n = a.shape
324
- cl_m = OpenCL::Int1.new(m || 1)
325
- cl_n = OpenCL::Int1.new(n || 1)
326
- work_group = [m || 1, n || 1]
327
-
328
- buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
329
- buffer
330
- else
331
- a
332
- end
333
- when :sign
334
- execute_func('sign', tensor, a, child_context)
335
- when :exp
336
- execute_func('exp', tensor, a, child_context)
337
- when :log
338
- execute_func('log', tensor, a, child_context)
339
- when :sin
340
- execute_func('sin', tensor, a, child_context)
341
- when :tan
342
- execute_func('tan', tensor, a, child_context)
343
- when :cos
344
- execute_func('cos', tensor, a, child_context)
345
- when :abs
346
- execute_func('abs', tensor, a, child_context)
347
- when :sqrt
348
- execute_func('sqrt', tensor, a, child_context)
349
- when :negate
350
- execute_func('negate', tensor, a, child_context)
351
- when :square
352
- execute_func('square', tensor, a, child_context)
353
- when :reciprocal
354
- execute_func('reciprocal', tensor, a, child_context)
355
- when :tanh
356
- execute_func('tanh', tensor, a, child_context)
357
- when :tanh_grad
358
- execute_func('tanh_grad', tensor, a, child_context)
359
- when :sigmoid
360
- execute_func('sigmoid', tensor, a, child_context)
361
- when :log1p
362
- execute_func('log1p', tensor, a, child_context)
363
- when :round
364
- execute_func('round', tensor, a, child_context)
365
- when :softmax
366
- a = _run(a, child_context)
367
- event_wait_list = [a.op].compact
368
- dtype = tensor.data_type
369
- output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
253
+ register_op :sin do |context, tensor, inputs|
254
+ execute_func('sin', tensor, inputs[0], context)
255
+ end
370
256
 
371
- m, n = a.shape
372
- work_group = [m]
373
- n = m if n.nil?
374
- cl_n = OpenCL::Int1.new(n || 1)
257
+ register_op :cond do |context, tensor, inputs|
258
+ pred = complete_eval(tensor.options[:pred], context)
375
259
 
376
- event = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
377
- output_buffer.op = event
378
- output_buffer
379
- when :softmax_grad
380
- a = _run(a, child_context)
381
- grad = _run(b, child_context)
382
- event_wait_list = [a.op].compact
383
- dtype = tensor.data_type
384
- output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
260
+ if all_true?(pred.buffer)
261
+ inputs[0]
262
+ else
263
+ inputs[1]
264
+ end
265
+ end
266
+
267
+ register_op :identity do |_context, _tensor, inputs|
268
+ inputs[0]
269
+ end
270
+
271
+ register_op :assign, noop: true do |context, tensor, inputs|
272
+ assign_var(tensor, inputs[1], context)
273
+ end
274
+
275
+ register_op :assign_add do |context, tensor, inputs|
276
+ value = execute_2_operand_func('add', tensor, inputs[0], inputs[1], context)
277
+ assign_var(tensor, value, context)
278
+ end
279
+
280
+ register_op :assign_sub do |context, tensor, inputs|
281
+ value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1], context)
282
+ assign_var(tensor, value, context)
283
+ end
284
+
285
+ %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
286
+ register_op op, noop: true do |context, tensor, inputs|
287
+ execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
288
+ end
289
+ end
385
290
 
291
+ %i[max add div sub mul pow sigmoid_grad].each do |op|
292
+ register_op op, noop: true do |context, tensor, inputs|
293
+ execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
294
+ end
295
+ end
296
+
297
+ register_op :where, noop: true do |context, tensor, inputs|
298
+ pred = tensor.options[:pred]
299
+ execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
300
+ end
301
+
302
+ register_op :matmul do |_context, tensor, inputs|
303
+ a, b = inputs
304
+
305
+ m = a.shape[0]
306
+ n = b.shape[1]
307
+ v = b.shape[0]
308
+ k = a.shape[1]
309
+
310
+ m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
311
+ n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]
312
+
313
+ result_shape = [m, n]
314
+
315
+ raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
316
+ raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
317
+ raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
318
+
319
+ dtype = tensor.data_type
320
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
321
+ output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
322
+
323
+ cl_m = OpenCL::Int1.new(m)
324
+ cl_n = OpenCL::Int1.new(n)
325
+ cl_k = OpenCL::Int1.new(k)
326
+
327
+ transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
328
+ transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
329
+
330
+ output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
331
+ output_buffer
332
+ end
333
+
334
+ register_op :cast do |_context, tensor, inputs|
335
+ a = inputs[0]
336
+ if a.data_type != tensor.data_type
337
+ buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
386
338
  m, n = a.shape
387
- work_group = [m]
388
- n = m if n.nil?
339
+ cl_m = OpenCL::Int1.new(m || 1)
389
340
  cl_n = OpenCL::Int1.new(n || 1)
390
- event = _cl_program("softmax_grad", dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
391
- output_buffer.op = event
392
- output_buffer
393
- when :sigmoid_grad
394
- execute_2_operand_func('sigmoid_grad', tensor, a, b, child_context)
395
- when :truncate
396
- a = _run(a, child_context)
397
- b = _run(b, child_context)
398
-
399
- if a.shape.size.zero?
341
+ work_group = [m || 1, n || 1]
342
+
343
+ buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
344
+ buffer
345
+ else
346
+ a
347
+ end
348
+ end
349
+
350
+ %i[sign exp tan cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round].each do |op|
351
+ register_op op, noop: true do |context, tensor, inputs|
352
+ execute_func(op.to_s, tensor, inputs[0], context)
353
+ end
354
+ end
355
+
356
+ register_op :softmax do |_context, tensor, inputs|
357
+ a = inputs[0]
358
+ event_wait_list = [a.op].compact
359
+ dtype = tensor.data_type
360
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
361
+
362
+ m, n = a.shape
363
+ work_group = [m]
364
+ n = m if n.nil?
365
+ cl_n = OpenCL::Int1.new(n || 1)
366
+
367
+ event = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
368
+ output_buffer.op = event
369
+ output_buffer
370
+ end
371
+
372
+ register_op :softmax_grad do |_context, tensor, inputs|
373
+ a, grad = inputs
374
+
375
+ event_wait_list = [a.op].compact
376
+ dtype = tensor.data_type
377
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
378
+
379
+ m, n = a.shape
380
+ work_group = [m]
381
+ n = m if n.nil?
382
+ cl_n = OpenCL::Int1.new(n || 1)
383
+ event = _cl_program('softmax_grad', dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
384
+ output_buffer.op = event
385
+ output_buffer
386
+ end
387
+
388
+ register_op :truncate do |context, tensor, inputs|
389
+ a, b = inputs
390
+ if a.shape.size.zero?
391
+ a
392
+ else
393
+ input_b = read_final_result(b)
394
+ if a.shape == input_b
400
395
  a
401
396
  else
402
- input_b = read_final_result(b)
403
- if a.shape == input_b
404
- a
405
- else
406
- input_a = read_final_result(a)
407
- if input_b == []
408
- if a.buffer.size == 1
409
- a.shape = input_b
410
- a
411
- else
412
- wrap_opencl(a.buffer[0], data_type: a.data_type, name: tensor.name)
413
- end
397
+ input_a = read_final_result(a)
398
+ if input_b == []
399
+ if a.buffer.size == 1
400
+ a.shape = input_b
401
+ a
414
402
  else
415
- wrap_opencl(truncate(input_a, input_b), data_type: a.data_type, name: tensor.name)
403
+ wrap_opencl(a.buffer[0], data_type: a.data_type, name: tensor.name)
416
404
  end
405
+ else
406
+ wrap_opencl(truncate(input_a, input_b), data_type: a.data_type, name: tensor.name)
417
407
  end
418
408
  end
419
- when :check_numerics
420
- a = complete_eval(a, child_context)
421
- name = tensor.options[:name]
409
+ end
410
+ end
422
411
 
423
- a.buffer.each do |item|
424
- raise "#{name} Invalid Argument" if item.nan? || item.infinite?
425
- end
426
- a
427
- when :zeros, :ones, :zeros_like, :ones_like
428
- shape = if %i[zeros_like ones_like].include?(tensor.operation)
429
- _run(a, child_context).shape
430
- else
431
- read_final_result(complete_eval(a, child_context)) || tensor.shape.shape
432
- end
412
+ register_op :check_numerics, noop: true do |context, tensor, inputs|
413
+ a = complete_eval(inputs[0], context)
414
+ name = tensor.options[:name]
433
415
 
434
- func = if %i[zeros zeros_like].include?(tensor.operation)
435
- -> { tensor.data_type == :int32 ? 0 : 0.0 }
436
- else
437
- -> { tensor.data_type == :int32 ? 1 : 1.0 }
438
- end
416
+ a.buffer.each do |input|
417
+ raise "#{name} Invalid Argument" if input.nan? || input.infinite?
418
+ end
419
+ a
420
+ end
439
421
 
440
- size = shape.empty? ? 1 : shape.reduce(:*)
422
+ register_op :broadcast_transform do |context, tensor, inputs|
423
+ a, b = inputs
441
424
 
442
- buffer = if TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type)
443
- NArray.sfloat(size)
444
- elsif TensorStream::Ops::INTEGER_TYPES.include?(tensor.data_type)
445
- NArray.int(size)
446
- else
447
- raise "unsupported type #{tensor.data_type}"
448
- end
449
-
450
- data = if !shape.empty?
451
- Array.new(size) do |index|
452
- func.call
453
- end
454
- else
455
- func.call
456
- end
425
+ if a.shape == b.shape
426
+ [a, b]
427
+ else
428
+ input_a = read_final_result(complete_eval(a, context))
429
+ input_b = read_final_result(complete_eval(b, context))
430
+ b_a, b_b = broadcast(input_a, input_b)
431
+ [ wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
432
+ wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
433
+ end
434
+ end
457
435
 
458
- convert_to_opencl(data, shape, data_type: tensor.data_type, name: tensor.name)
459
- when :broadcast_transform
460
- a = _run(a, child_context)
461
- b = _run(b, child_context)
462
-
463
- if a.shape == b.shape
464
- [a, b]
465
- else
466
- input_a = read_final_result(complete_eval(a, child_context))
467
- input_b = read_final_result(complete_eval(b, child_context))
468
- b_a, b_b = broadcast(input_a, input_b)
469
- [ wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
470
- wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
471
- end
472
- when :print
473
- a = _run(a, child_context)
474
- b = _run(b, child_context)
475
- input_b = complete_eval(b, child_context)
476
- input_b = read_final_result(input_b)
477
- puts "#{tensor.options.fetch(:message, '')} #{input_b}"
478
- a
479
- when :rank
480
- a = _run(a, child_context)
481
- wrap_opencl(a.shape.size, data_type: tensor.data_type, name: tensor.name)
482
- when :stop_gradient
483
- _run(a, child_context)
484
- when :slice
485
- input_a = complete_eval(a, child_context)
486
- input_b = read_final_result(complete_eval(b, child_context))
487
- size = tensor.options[:size]
488
-
489
- slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse
490
-
491
- new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
492
- sliced = new_buf.slice[*slice_param]
493
- convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: a.data_type, name: tensor.name)
494
- when :transpose
495
- input_a = complete_eval(a, child_context)
496
- t_param = Array.new(input_a.shape.size) { |index| index }.reverse
497
- transposed = input_a.buffer.reshape(*input_a.shape.reverse).transpose(*t_param)
498
- convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: a.data_type, name: tensor.name)
499
- when :index
500
- a = complete_eval(a, child_context)
501
- input_a = read_final_result(a)
502
- index = read_final_result(complete_eval(b, child_context))
503
-
504
- if a.is_a?(Array)
505
- a[index]
506
- else
507
- new_shape = a.shape.dup
508
- new_shape.shift
509
- convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
510
- end
511
- when :broadcast_gradient_args
512
- a = complete_eval(a, child_context)
513
- b = complete_eval(b, child_context)
514
-
515
- wrap_opencl(get_broadcast_gradient_args(a.buffer.to_a, b.buffer.to_a), data_type: a.data_type, name: tensor.name)
516
- when :shape
517
- a = _run(a, child_context)
518
-
519
- wrap_opencl(a.shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
520
- when :reshape
521
- arr = complete_eval(a, child_context)
522
- new_shape = read_final_result(complete_eval(b, child_context))
523
-
524
- if new_shape.size.zero? && arr.buffer.size == 1
525
- arr.shape = new_shape
526
- arr
527
- else
528
- new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
529
- arr.shape = new_shape
530
- arr
531
- end
532
- when :random_uniform
533
- maxval = tensor.options.fetch(:maxval, 1)
534
- minval = tensor.options.fetch(:minval, 0)
535
- seed = tensor.options[:seed]
536
-
537
- random = _get_randomizer(tensor, seed)
538
- generator = -> { random.rand * (maxval - minval) + minval }
539
- shape = tensor.options[:shape] || tensor.shape.shape
540
-
541
- convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
542
- when :random_normal
543
- random = _get_randomizer(tensor, seed)
544
- r = RandomGaussian.new(tensor.options.fetch(:mean), tensor.options.fetch(:stddev), -> { random.rand })
545
- random = _get_randomizer(tensor, seed)
546
- generator = -> { r.rand }
547
- shape = tensor.options[:shape] || tensor.shape.shape
548
-
549
- convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
550
- when :glorot_uniform
551
- random = _get_randomizer(tensor, seed)
552
-
553
- shape = tensor.options[:shape] || tensor.shape.shape
554
- fan_in, fan_out = if shape.size.zero?
555
- [1, 1]
556
- elsif shape.size == 1
557
- [1, shape[0]]
558
- else
559
- [shape[0], shape.last]
560
- end
561
-
562
- limit = Math.sqrt(6.0 / (fan_in + fan_out))
563
-
564
- minval = -limit
565
- maxval = limit
566
-
567
- generator = -> { random.rand * (maxval - minval) + minval }
568
- convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
569
- when :flow_group
570
- tensor.items.collect { |item| _run(item, child_context) }
571
- when :sum
572
- reduction(child_context, tensor, a, b, :sum)
573
- when :mean
574
- reduction(child_context, tensor, a, b, :mean)
575
- when :prod
576
- input_a = complete_eval(a, child_context)
577
- if input_a.buffer.empty?
578
- convert_to_opencl([1.0], [], data_type: a.data_type, name: tensor.name)
579
- else
580
- reduction(child_context, tensor, a, b, :prod)
581
- end
582
- when :argmin
583
- a = complete_eval(a, child_context)
584
- axis = tensor.options[:axis] || 0
585
- arr = a.buffer.reshape(*a.shape.reverse).to_a
586
- op = get_op_with_axis(arr, axis, 0, a.data_type, ->(a, b) { a < b })
587
- convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
588
- when :argmax
589
- a = complete_eval(a, child_context)
590
- axis = tensor.options[:axis] || 0
591
- arr = a.buffer.reshape(*a.shape.reverse).to_a
592
- op = get_op_with_axis(arr, axis, 0, a.data_type, ->(a, b) { a > b })
593
- convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
436
+ register_op :print do |context, tensor, inputs|
437
+ a, b = inputs
438
+ input_b = complete_eval(b, context)
439
+ input_b = read_final_result(input_b)
440
+ puts "#{tensor.options.fetch(:message, '')} #{input_b}"
441
+ a
442
+ end
443
+
444
+ register_op :rank do |_context, tensor, inputs|
445
+ wrap_opencl(inputs[0].shape.size, data_type: tensor.data_type, name: tensor.name)
446
+ end
447
+
448
+ register_op :stop_gradient do |_context, _tensor, inputs|
449
+ inputs[0]
450
+ end
451
+
452
+ register_op :slice, noop: true do |context, tensor, inputs|
453
+ input_a = complete_eval(inputs[0], context)
454
+ input_b = read_final_result(complete_eval(inputs[1], context))
455
+ size = tensor.options[:size]
456
+
457
+ slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse
458
+
459
+ new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
460
+ sliced = new_buf.slice[*slice_param]
461
+ convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
462
+ end
463
+
464
+ register_op :transpose, buffer: true do |_context, tensor, inputs|
465
+ t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
466
+ transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
467
+ convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
468
+ end
469
+
470
+ register_op :index, buffer: true do |_context, tensor, inputs|
471
+ a = inputs[0]
472
+ input_a = read_final_result(a)
473
+ index = read_final_result(inputs[1])
474
+
475
+ if a.is_a?(Array)
476
+ a[index]
594
477
  else
595
- raise "unknown op #{tensor.operation}"
596
- end.tap do |result|
478
+ new_shape = a.shape.dup
479
+ new_shape.shift
480
+ convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
481
+ end
482
+ end
483
+
484
+ register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
485
+ wrap_opencl(get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a), data_type: inputs[0].data_type, name: tensor.name)
486
+ end
487
+
488
+ register_op :shape do |_context, tensor, inputs|
489
+ wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
490
+ end
491
+
492
+ register_op :reshape, buffer: true do |_context, _tensor, inputs|
493
+ arr = inputs[0]
494
+ new_shape = read_final_result(inputs[1])
495
+
496
+ if new_shape.size.zero? && arr.buffer.size == 1
497
+ arr.shape = new_shape
498
+ arr
499
+ else
500
+ new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
501
+ arr.shape = new_shape
502
+ arr
503
+ end
504
+ end
505
+
506
+ register_op :flow_group do |_context, _tensor, inputs|
507
+ inputs
508
+ end
509
+
510
+ %i[sum mean].each do |op|
511
+ register_op op, noop: true do |context, tensor, inputs|
512
+ reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
513
+ end
514
+ end
515
+
516
+ register_op :prod, noop: true do |context, tensor, inputs|
517
+ input_a = complete_eval(inputs[0], context)
518
+ if input_a.buffer.empty?
519
+ convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
520
+ else
521
+ reduction(context, tensor, inputs[0], inputs[1], :prod)
522
+ end
523
+ end
524
+
525
+ register_op :argmin, buffer: true do |_context, tensor, inputs|
526
+ axis = tensor.options[:axis] || 0
527
+ arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
528
+ op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
529
+ convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
530
+ end
531
+
532
+ register_op :argmax, buffer: true do |_context, tensor, inputs|
533
+ axis = tensor.options[:axis] || 0
534
+ arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
535
+ op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
536
+ convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
537
+ end
538
+
539
+ def eval_operation(tensor, child_context)
540
+ return @context[tensor.name] if @context.key?(tensor.name)
541
+ cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
542
+ return @context[cache_key] if @context.key?(cache_key)
543
+ # puts tensor.name
544
+ invoke(tensor, child_context).tap do |result|
597
545
  # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
598
546
  if tensor.breakpoint
547
+ a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
548
+ b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
599
549
  a = read_final_result(complete_eval(a, child_context))
600
550
  b = read_final_result(complete_eval(b, child_context))
601
551
  result = read_final_result(complete_eval(result, child_context))
@@ -642,7 +592,7 @@ module TensorStream
642
592
  def eval_tensor(tensor, child_context)
643
593
  return tensor unless tensor.is_a?(Tensor)
644
594
 
645
- cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
595
+ cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
646
596
  return @context[cache_key] if @context.key?(cache_key)
647
597
  return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]
648
598
  @context[cache_key] = if tensor.value.is_a?(Tensor)
@@ -656,7 +606,7 @@ module TensorStream
656
606
  private
657
607
 
658
608
  def assign_var(tensor, b, child_context)
659
- assign = tensor.items[0] || tensor
609
+ assign = tensor.inputs[0] || tensor
660
610
  buffer = complete_eval(b, child_context)
661
611
 
662
612
  if assign.buffer
@@ -678,7 +628,7 @@ module TensorStream
678
628
  dtype = tensor.data_type
679
629
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
680
630
 
681
- output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
631
+ output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
682
632
  a, b, prog, switch_operands = select_program(a, b, op_name)
683
633
  m, n = result_shape
684
634
  work_group = [m || 1, n || 1]
@@ -688,6 +638,7 @@ module TensorStream
688
638
 
689
639
  event_wait_list = [a.op, b.op].compact # add dependency wait list
690
640
 
641
+ method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
691
642
  event = if prog == "#{op_name}_b"
692
643
  cl_m_b, cl_n_b = if b.shape.size == 2
693
644
  [ OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1]) ]
@@ -696,9 +647,9 @@ module TensorStream
696
647
  else
697
648
  raise "rank > 2 not supported!"
698
649
  end
699
- _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
650
+ _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
700
651
  else
701
- _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
652
+ _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
702
653
  end
703
654
 
704
655
  output_buffer.op = event
@@ -784,7 +735,7 @@ module TensorStream
784
735
  value = [value]
785
736
  end
786
737
 
787
- cache_key = "_cl_object_#{name}_#{shape.join('_')}"
738
+ cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
788
739
  cl_object = if name && @context[:_cache][cache_key]
789
740
  @context[:_cache][cache_key]
790
741
  else
@@ -813,13 +764,13 @@ module TensorStream
813
764
  if element.is_a?(Tensor)
814
765
  cl_object.buffer[index] = read_final_result(complete_eval(element, {}))
815
766
  else
816
- cl_object.buffer[index] = Tensor.cast_dtype(element, data_type)
767
+ cl_object.buffer[index] = ( data_type == :boolean ? ( element ? 1 : 0 ) : Tensor.cast_dtype(element, data_type))
817
768
  end
818
769
  end
819
770
  elsif value.is_a?(NArray)
820
771
  cl_object.buffer = value
821
772
  else
822
- cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
773
+ cl_object.buffer[0] = ( data_type == :boolean ? ( element ? 1 : 0 ) : Tensor.cast_dtype(value, data_type))
823
774
  end
824
775
 
825
776
  write_op = if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
@@ -840,14 +791,14 @@ module TensorStream
840
791
  when :int16
841
792
  NArray.sint(narray_size)
842
793
  when :boolean
843
- NArray.int(narray_size)
794
+ NArray.sint(narray_size)
844
795
  else
845
796
  raise "unsupported type #{data_type}"
846
797
  end
847
798
  end
848
799
 
849
800
  def _create_result_buffer(data_type, shape, name)
850
- @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}"] ||= begin
801
+ @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}:#{object_id}"] ||= begin
851
802
  size = shape.empty? ? 1 : shape.reduce(:*)
852
803
  buffer = allocate_narray_for_type(data_type, size)
853
804
  cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
@@ -1029,7 +980,6 @@ module TensorStream
1029
980
 
1030
981
  def resolve_placeholder(placeholder, _execution_context = {})
1031
982
  return nil if placeholder.nil?
1032
- return placeholder if retain.include?(placeholder)
1033
983
 
1034
984
  var = if placeholder.is_a?(Placeholder)
1035
985
  @context[placeholder.name.to_sym].tap do |c|
@@ -1056,7 +1006,7 @@ module TensorStream
1056
1006
  reduced_val = r[0]
1057
1007
  if r.size > 1
1058
1008
  reduced_val = f.call(r[0..val.size])
1059
- elsif r.size == 0
1009
+ elsif r.size.zero?
1060
1010
  reduced_val = f.call(nil)
1061
1011
  end
1062
1012
  keep_dims ? [ reduced_val ] : reduced_val
@@ -1143,3 +1093,5 @@ module TensorStream
1143
1093
  end
1144
1094
  end
1145
1095
  end
1096
+
1097
+ TensorStream::Evaluator.register_evaluator(TensorStream::Evaluator::OpenclEvaluator, 'opencl', 1)