tensor_stream 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/README.md +38 -17
  4. data/benchmark/benchmark.rb +16 -20
  5. data/lib/tensor_stream/control_flow.rb +3 -3
  6. data/lib/tensor_stream/debugging/debugging.rb +4 -4
  7. data/lib/tensor_stream/device.rb +5 -2
  8. data/lib/tensor_stream/evaluator/base_evaluator.rb +138 -0
  9. data/lib/tensor_stream/evaluator/buffer.rb +7 -2
  10. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/_bool_operand.cl +3 -3
  11. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/_operand.cl +0 -0
  12. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/abs.cl +0 -0
  13. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/add.cl +1 -1
  14. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/argmax.cl +0 -0
  15. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/argmin.cl +0 -0
  16. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/cast.cl +0 -0
  17. data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +6 -0
  18. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/cos.cl +0 -0
  19. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/div.cl.erb +1 -1
  20. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/exp.cl +0 -0
  21. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/gemm.cl +0 -0
  22. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/log.cl +0 -0
  23. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/log1p.cl +0 -0
  24. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/max.cl +3 -3
  25. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/mul.cl +1 -1
  26. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/negate.cl +0 -0
  27. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/pow.cl +3 -3
  28. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/reciprocal.cl +0 -0
  29. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/round.cl +0 -0
  30. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sigmoid.cl +0 -0
  31. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sigmoid_grad.cl +3 -3
  32. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sign.cl +1 -1
  33. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sin.cl +0 -0
  34. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/softmax.cl +0 -0
  35. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/softmax_grad.cl +0 -0
  36. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sqrt.cl +0 -0
  37. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/square.cl +0 -0
  38. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sub.cl +1 -1
  39. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tan.cl +0 -0
  40. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tanh.cl +0 -0
  41. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tanh_grad.cl +0 -0
  42. data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/where.cl +1 -1
  43. data/lib/tensor_stream/evaluator/{opencl_buffer.rb → opencl/opencl_buffer.rb} +1 -1
  44. data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +5 -0
  45. data/lib/tensor_stream/evaluator/{opencl_evaluator.rb → opencl/opencl_evaluator.rb} +404 -452
  46. data/lib/tensor_stream/evaluator/{opencl_template_helper.rb → opencl/opencl_template_helper.rb} +6 -6
  47. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +21 -21
  48. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +492 -398
  49. data/lib/tensor_stream/graph.rb +21 -1
  50. data/lib/tensor_stream/graph_serializers/graphml.rb +59 -59
  51. data/lib/tensor_stream/graph_serializers/pbtext.rb +1 -1
  52. data/lib/tensor_stream/helpers/op_helper.rb +6 -2
  53. data/lib/tensor_stream/math_gradients.rb +7 -7
  54. data/lib/tensor_stream/operation.rb +100 -100
  55. data/lib/tensor_stream/session.rb +81 -8
  56. data/lib/tensor_stream/tensor.rb +7 -5
  57. data/lib/tensor_stream/utils.rb +32 -19
  58. data/lib/tensor_stream/version.rb +1 -1
  59. data/tensor_stream.gemspec +0 -1
  60. data/test_samples/raw_neural_net_sample.rb +7 -7
  61. metadata +41 -53
  62. data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +0 -5
@@ -1,3 +1,3 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  % op = operator_to_c('add')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'add', dtype: dtype, result_t: c_dtype %>
3
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'add', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -0,0 +1,6 @@
1
+ % ["#{dtype}"].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
2
+ % a_dtype = dtype_to_c_type(a)
3
+ % b_dtype = dtype_to_c_type(b)
4
+ % op = operator_to_c(fname)
5
+ <%= render 'bool_operand.cl', a_dtype: a_dtype, b_dtype: b_dtype, op: op, fname: fname, dtype: "#{a}_#{b}", result_t: 'short' %>
6
+ % end
@@ -1,3 +1,3 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  % op = operator_to_c('div')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'div', dtype: dtype, result_t: c_dtype %>
3
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'div', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,6 +1,6 @@
1
1
  // same dimension add floating point op
2
2
  % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void max_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
3
+ __kernel void max_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
5
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
6
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -9,7 +9,7 @@
9
9
  }
10
10
 
11
11
  // 1D + Scalar floating point add op
12
- __kernel void max_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
12
+ __kernel void max_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
13
  // Get the index of the current element to be processed
14
14
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
15
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -22,7 +22,7 @@
22
22
  }
23
23
 
24
24
  // 1D + Scalar floating point add op broadcast
25
- __kernel void max_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
25
+ __kernel void max_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
26
26
  // Get the index of the current element to be processed
27
27
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
28
28
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -1,3 +1,3 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  % op = operator_to_c('mul')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mul', dtype: dtype, result_t: c_dtype %>
3
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mul', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,6 +1,6 @@
1
1
  // same dimension add floating point op
2
2
  % c_dtype = dtype_to_c_type(dtype)
3
- __kernel void pow_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
3
+ __kernel void pow_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
4
4
  // Get the index of the current element to be processed
5
5
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
6
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -9,7 +9,7 @@
9
9
  }
10
10
 
11
11
  // 1D + Scalar floating point add op
12
- __kernel void pow_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
12
+ __kernel void pow_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
13
  // Get the index of the current element to be processed
14
14
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
15
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -22,7 +22,7 @@
22
22
  }
23
23
 
24
24
  // 1D + Scalar floating point add op broadcast
25
- __kernel void pow_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
25
+ __kernel void pow_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
26
26
  // Get the index of the current element to be processed
27
27
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
28
28
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -9,7 +9,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
9
9
  }
10
10
 
11
11
  // same dimension add floating point op
12
- __kernel void sigmoid_grad_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
12
+ __kernel void sigmoid_grad_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
13
13
  // Get the index of the current element to be processed
14
14
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
15
15
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -18,7 +18,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
18
18
  }
19
19
 
20
20
  // 1D + Scalar floating point add op
21
- __kernel void sigmoid_grad_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
21
+ __kernel void sigmoid_grad_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
22
22
  // Get the index of the current element to be processed
23
23
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
24
24
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -31,7 +31,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
31
31
  }
32
32
 
33
33
  // 1D + Scalar floating point add op broadcast
34
- __kernel void sigmoid_grad_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
34
+ __kernel void sigmoid_grad_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
35
35
  // Get the index of the current element to be processed
36
36
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
37
37
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -5,7 +5,7 @@ __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_d
5
5
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
6
6
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
7
7
  <%= c_dtype %> value = A[globalRow * N + globalCol];
8
- % if is_floating_point?(dtype)
8
+ % if floating_point?(dtype)
9
9
  if (isnan(value) || value == 0.0f) {
10
10
  C[globalRow * N + globalCol] = 0.0;
11
11
  } else {
@@ -1,3 +1,3 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
2
  % op = operator_to_c('sub')
3
- <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: dtype, result_t: c_dtype %>
3
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: "#{a}_#{b}", result_t: c_dtype %>
@@ -1,5 +1,5 @@
1
1
  % c_dtype = dtype_to_c_type(dtype)
2
- __kernel void where_<%= dtype %>(const int M, const int N, __global const int *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
2
+ __kernel void where_<%= dtype %>(const int M, const int N, __global const short *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
3
3
  // Get the index of the current element to be processed
4
4
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
5
5
  const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -2,7 +2,7 @@ module TensorStream
2
2
  class OpenCLBuffer < Buffer
3
3
  include ArrayOpsHelper
4
4
 
5
- attr_accessor :data_type, :shape, :buffer, :cl_buffer, :op
5
+ attr_accessor :shape, :buffer, :cl_buffer, :op
6
6
 
7
7
  def initialize(data_type: , shape:, buffer:, cl_buffer:, op: nil, name: nil)
8
8
  @data_type = data_type
@@ -0,0 +1,5 @@
1
+ module TensorStream
2
+ class OpenclDevice < TensorStream::Device
3
+ attr_accessor :native_device
4
+ end
5
+ end
@@ -1,11 +1,12 @@
1
1
  require 'tensor_stream/evaluator/operation_helpers/random_gaussian'
2
2
  require 'tensor_stream/evaluator/operation_helpers/array_ops_helper'
3
3
  require 'tensor_stream/evaluator/operation_helpers/math_helper'
4
- require 'tensor_stream/evaluator/opencl_buffer'
5
- require 'tensor_stream/evaluator/opencl_template_helper'
6
- require 'distribution'
4
+ require 'tensor_stream/evaluator/opencl/opencl_buffer'
5
+ require 'tensor_stream/evaluator/opencl/opencl_template_helper'
6
+ require 'tensor_stream/evaluator/opencl/opencl_device'
7
7
  require 'opencl_ruby_ffi'
8
8
  require 'narray_ffi'
9
+ require 'tensor_stream/evaluator/base_evaluator'
9
10
 
10
11
  module TensorStream
11
12
  module Evaluator
@@ -27,31 +28,78 @@ module TensorStream
27
28
  end
28
29
 
29
30
  ## PURE ruby evaluator used for testing and development
30
- class OpenclEvaluator
31
+ class OpenclEvaluator < BaseEvaluator
31
32
  attr_accessor :retain
32
33
 
33
34
  include TensorStream::OpHelper
34
35
  include TensorStream::ArrayOpsHelper
35
36
  include TensorStream::MathHelper
36
37
 
37
- def initialize(session, context, thread_pool: nil, log_intermediates: false, preferred_device: nil)
38
- @session = session
39
- @context = context
40
- @log_intermediates = log_intermediates
41
- @preferred_device = preferred_device
42
- @retain = context[:retain] || []
43
- @thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
44
- @context[:_cache][:_cl_buffers] ||= {} if @context[:_cache]
45
- @context[:compute_history] = [] if log_intermediates
38
+ def initialize(session, device, thread_pool: nil, log_intermediates: false)
39
+ super
40
+ _create_opencl_context(device.native_device)
41
+ @opencl_device = device.native_device
42
+ create_command_queue
43
+ end
44
+
45
+ def self.query_supported_devices
46
+ devices = query_devices_with_score
47
+ devices.sort { |a| a[1] }.reverse.map do |d|
48
+ opencl_to_device(d)
49
+ end
50
+ end
51
+
52
+ def self.fetch_device(query = [])
53
+ devices = query_devices_with_score
54
+ platform_devices = devices.select { |d| d[0].platform.to_s.downcase =~ /#{query[0].downcase}/ }
55
+ opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
56
+ end
57
+
58
+ def self.opencl_to_device(d)
59
+ device = d[0]
60
+ index = d[3]
61
+ platform_name = device.platform.name.gsub(' ', '_').downcase
62
+ uri = [platform_name, index].join(':')
63
+
64
+ device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
65
+
66
+ OpenclDevice.new(uri, device_type, self).tap do |d|
67
+ d.native_device = device
68
+ end
69
+ end
70
+
71
+ ##
72
+ # Select the best device available in the system for this evaluator
73
+ def self.default_device
74
+ devices = OpenclEvaluator.query_devices_with_score
75
+ device = devices.sort { |a| a[1] }.reverse.first
76
+ opencl_to_device(device)
46
77
  end
47
78
 
48
79
  # opencl evaluator main entrypoint
49
80
  def run(tensor, execution_context)
50
- _create_opencl_context
51
- create_command_queue
52
81
  read_final_result(complete_eval(tensor, execution_context))
53
82
  end
54
83
 
84
+ def run_with_buffer(tensor, context, execution_context)
85
+ @context = context
86
+ @context[:_cache][:_cl_buffers] ||= {} if context[:_cache]
87
+
88
+ if tensor.is_a?(Array)
89
+ tensor.collect do |t|
90
+ value = run(t, execution_context)
91
+ Buffer.new(data_type: t.data_type, buffer: value)
92
+ end
93
+ else
94
+ value = run(tensor, execution_context)
95
+ Buffer.new(data_type: tensor.data_type, buffer: value)
96
+ end
97
+ end
98
+
99
+ def convert_from_buffer(tensor, result)
100
+ convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
101
+ end
102
+
55
103
  def complete_eval(tensor, context)
56
104
  buffer = _run(tensor, context)
57
105
  if buffer.is_a?(Array)
@@ -69,11 +117,25 @@ module TensorStream
69
117
  end
70
118
 
71
119
  def opencl_device
72
- @context[:_cache][:_opencl_device]
120
+ @opencl_device
73
121
  end
74
122
 
75
123
  protected
76
124
 
125
+ def prepare_input(tensor, context, options = {})
126
+ return nil unless tensor
127
+ tensor = resolve_placeholder(tensor)
128
+ if options[:noop]
129
+ tensor
130
+ elsif options[:buffer]
131
+ complete_eval(tensor, context)
132
+ elsif options[:complete]
133
+ read_final_result(complete_eval(tensor, context))
134
+ else
135
+ _run(tensor, context)
136
+ end
137
+ end
138
+
77
139
  # read result from opencl and convert to ruby
78
140
  def read_final_result(buffer)
79
141
  return buffer.map { |b| read_final_result(b) } if buffer.is_a?(Array)
@@ -82,43 +144,37 @@ module TensorStream
82
144
  buffer.to_ruby
83
145
  end
84
146
 
85
- def _create_opencl_context
86
- @context[:_cache][:_opencl_device] ||= begin
87
- if @preferred_device
88
- @preferred_device
89
- else
90
- device, _score, _platform, _index = choose_best_device
91
- # puts "using #{device.name}"
92
- device
93
- end
94
- end
95
- @context[:cl_device] = opencl_device
96
- @context[:_cache][:_opencl_context] ||= OpenCL.create_context(opencl_device)
147
+ def _create_opencl_context(opencl_device)
148
+ @opencl_context = OpenCL.create_context(opencl_device)
97
149
  end
98
150
 
99
151
  def choose_best_device
100
152
  @best_device ||= begin
101
- devices = OpenCL.platforms.flat_map do |p|
102
-
103
- p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
104
- score = 0
105
- if d.type.to_s == 'CPU'
106
- score += 1
107
- elsif d.type.to_s == 'GPU'
108
- score += 4
109
- end
153
+ devices = OpenclEvaluator.query_devices_with_score
154
+ devices.sort { |a| a[1] }.reverse.first
155
+ end
156
+ end
110
157
 
111
- if d.platform.name == 'NVIDIA CUDA'
112
- score += 1000
113
- end
158
+ def self.query_devices_with_score
159
+ OpenCL.platforms.flat_map do |p|
114
160
 
115
- score += d.max_compute_units
116
- score += d.max_clock_frequency
161
+ p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
162
+ score = 0
163
+ if d.type.to_s == 'CPU'
164
+ score += 1
165
+ elsif d.type.to_s == 'GPU'
166
+ score += 4
167
+ end
117
168
 
118
- [d, score, p.name, index]
169
+ if d.platform.name == 'NVIDIA CUDA'
170
+ score += 1000
119
171
  end
172
+
173
+ score += d.max_compute_units
174
+ score += d.max_clock_frequency
175
+
176
+ [d, score, p.name, index]
120
177
  end
121
- devices.sort { |a| a[1] }.reverse.first
122
178
  end
123
179
  end
124
180
 
@@ -127,15 +183,15 @@ module TensorStream
127
183
  properties = []
128
184
  properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
129
185
  properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
130
- @context[:_cache][:_opencl_queue] ||= _opencl_context.create_command_queue(opencl_device, properties: properties)
186
+ @command_queue = _opencl_context.create_command_queue(opencl_device, properties: properties)
131
187
  end
132
188
 
133
189
  def _opencl_context
134
- @context[:_cache][:_opencl_context]
190
+ @opencl_context
135
191
  end
136
192
 
137
193
  def _opencl_queue
138
- @context[:_cache][:_opencl_queue]
194
+ @command_queue
139
195
  end
140
196
 
141
197
  def cl_template_path(kernel, extension)
@@ -144,7 +200,7 @@ module TensorStream
144
200
 
145
201
  def _cl_program(kernel, args = {})
146
202
  suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
147
- @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}"] ||= begin
203
+ @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
148
204
  filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
149
205
  source = File.read(filename)
150
206
  source = OpenclTemplateHelper.new(source).generate(args)
@@ -163,13 +219,16 @@ module TensorStream
163
219
  return tensor.map { |t| _run(t, execution_context) }
164
220
  end
165
221
 
166
- return tensor if retain.include?(tensor) # if var is in retain don't eval to value
167
-
168
222
  tensor = tensor.call if tensor.is_a?(Proc)
169
223
 
170
224
  child_context = execution_context.dup
171
225
  res = if tensor.is_a?(Operation)
172
- eval_operation(tensor, child_context)
226
+ if !self.class.ops.include?(tensor.operation.to_sym)
227
+ result = @session.delegate_to_evaluator(tensor, @context, execution_context)
228
+ convert_from_buffer(tensor, result)
229
+ else
230
+ eval_operation(tensor, child_context)
231
+ end
173
232
  elsif tensor.is_a?(Variable)
174
233
  eval_variable(tensor, child_context)
175
234
  elsif tensor.is_a?(Placeholder)
@@ -187,415 +246,306 @@ module TensorStream
187
246
  tensor.buffer
188
247
  end
189
248
 
190
- def eval_operation(tensor, child_context)
191
- return @context[tensor.name] if @context.key?(tensor.name)
192
- cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
193
- return @context[cache_key] if @context.key?(cache_key)
194
- a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
195
- b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]
196
- # puts tensor.name
197
- case tensor.operation
198
- when :concat
199
- input_a = read_final_result(complete_eval(a, child_context))
200
- arr = concat_array(input_a, tensor.options[:axis])
201
- convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
202
- when :cond
203
- pred = complete_eval(tensor.options[:pred], child_context)
204
- a = _run(a, child_context)
205
- b = _run(b, child_context)
206
-
207
- if all_true?(pred.buffer)
208
- a
209
- else
210
- b
211
- end
212
- when :identity
213
- _run(a, child_context)
214
- when :eye
215
- rows = complete_eval(a, child_context)
216
- columns = complete_eval(b, child_context)
217
- shape = [rows.buffer[0], columns.buffer[0]]
218
- eye_arr = Array.new(rows.buffer[0]) do |i|
219
- Array.new(columns.buffer[0]) do |col|
220
- if fp_type?(tensor.data_type)
221
- i == col ? 1.0 : 0.0
222
- else
223
- i == col ? 1 : 0
224
- end
225
- end
226
- end
249
+ register_op :log do |context, tensor, inputs|
250
+ execute_func('log', tensor, inputs[0], context)
251
+ end
227
252
 
228
- convert_to_opencl(eye_arr.flatten, shape, data_type: tensor.data_type, name: tensor.name)
229
- when :pad
230
- a = read_final_result(complete_eval(a, child_context))
231
- p = read_final_result(complete_eval(tensor.options[:paddings], child_context))
232
-
233
- padding = arr_pad(a, p, tensor.data_type)
234
- convert_to_opencl(padding.flatten, shape_eval(padding), data_type: tensor.data_type, name: tensor.name)
235
- when :tile
236
- input = read_final_result(complete_eval(a, child_context))
237
- multiples = read_final_result(complete_eval(b, child_context))
238
-
239
- rank = get_rank(input)
240
- raise '1D or higher tensor required' if rank.zero?
241
- raise "invalid multiple size passed #{rank} != #{multiples.size}" if rank != multiples.size
242
-
243
- tile = tile_arr(input, 0, multiples)
244
- arr = tile.nil? ? [] : tile
245
- convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
246
- when :assign
247
- assign_var(tensor, b, child_context)
248
- when :assign_add
249
- a = _run(a, child_context)
250
- b = _run(b, child_context)
251
- value = execute_2_operand_func('add', tensor, a, b, child_context)
252
- assign_var(tensor, value, child_context)
253
- when :assign_sub
254
- a = _run(a, child_context)
255
- b = _run(b, child_context)
256
-
257
- value = execute_2_operand_func('sub', tensor, a, b, child_context)
258
- assign_var(tensor, value, child_context)
259
- when :less
260
- execute_2_operand_func('less', tensor, a, b, child_context, 'cond')
261
- when :less_equal
262
- execute_2_operand_func('less_equal', tensor, a, b, child_context, 'cond')
263
- when :greater
264
- execute_2_operand_func('greater', tensor, a, b, child_context, 'cond')
265
- when :greater_equal
266
- execute_2_operand_func('greater_equal', tensor, a, b, child_context, 'cond')
267
- when :equal
268
- execute_2_operand_func('equal', tensor, a, b, child_context, 'cond')
269
- when :not_equal
270
- execute_2_operand_func('not_equal', tensor, a, b, child_context, 'cond')
271
- when :logical_and
272
- execute_2_operand_func('logical_and', tensor, a, b, child_context, 'cond')
273
- when :where
274
- pred = tensor.options[:pred]
275
- execute_cond_func('where', tensor, pred, a, b, child_context)
276
- when :max
277
- execute_2_operand_func('max', tensor, a, b, child_context)
278
- when :add
279
- execute_2_operand_func('add', tensor, a, b, child_context)
280
- when :div
281
- execute_2_operand_func('div', tensor, a, b, child_context)
282
- when :sub
283
- execute_2_operand_func('sub', tensor, a, b, child_context)
284
- when :matmul
285
- a = _run(a, child_context)
286
- b = _run(b, child_context)
287
-
288
- m = a.shape[0]
289
- n = b.shape[1]
290
- v = b.shape[0]
291
- k = a.shape[1]
292
-
293
- m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
294
- n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]
295
-
296
- result_shape = [m, n]
297
-
298
- raise "#{tensor.items[0].name} rank must be greater than 1" if a.shape.size < 2
299
- raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
300
- raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
301
-
302
- dtype = tensor.data_type
303
- a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
304
- output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
305
-
306
- cl_m = OpenCL::Int1.new(m)
307
- cl_n = OpenCL::Int1.new(n)
308
- cl_k = OpenCL::Int1.new(k)
309
-
310
- transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
311
- transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
312
-
313
- output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
314
- output_buffer
315
- when :mul
316
- execute_2_operand_func('mul', tensor, a, b, child_context)
317
- when :pow
318
- execute_2_operand_func('pow', tensor, a, b, child_context)
319
- when :cast
320
- a = _run(a, child_context)
321
- if a.data_type != tensor.data_type
322
- buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
323
- m, n = a.shape
324
- cl_m = OpenCL::Int1.new(m || 1)
325
- cl_n = OpenCL::Int1.new(n || 1)
326
- work_group = [m || 1, n || 1]
327
-
328
- buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
329
- buffer
330
- else
331
- a
332
- end
333
- when :sign
334
- execute_func('sign', tensor, a, child_context)
335
- when :exp
336
- execute_func('exp', tensor, a, child_context)
337
- when :log
338
- execute_func('log', tensor, a, child_context)
339
- when :sin
340
- execute_func('sin', tensor, a, child_context)
341
- when :tan
342
- execute_func('tan', tensor, a, child_context)
343
- when :cos
344
- execute_func('cos', tensor, a, child_context)
345
- when :abs
346
- execute_func('abs', tensor, a, child_context)
347
- when :sqrt
348
- execute_func('sqrt', tensor, a, child_context)
349
- when :negate
350
- execute_func('negate', tensor, a, child_context)
351
- when :square
352
- execute_func('square', tensor, a, child_context)
353
- when :reciprocal
354
- execute_func('reciprocal', tensor, a, child_context)
355
- when :tanh
356
- execute_func('tanh', tensor, a, child_context)
357
- when :tanh_grad
358
- execute_func('tanh_grad', tensor, a, child_context)
359
- when :sigmoid
360
- execute_func('sigmoid', tensor, a, child_context)
361
- when :log1p
362
- execute_func('log1p', tensor, a, child_context)
363
- when :round
364
- execute_func('round', tensor, a, child_context)
365
- when :softmax
366
- a = _run(a, child_context)
367
- event_wait_list = [a.op].compact
368
- dtype = tensor.data_type
369
- output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
253
+ register_op :sin do |context, tensor, inputs|
254
+ execute_func('sin', tensor, inputs[0], context)
255
+ end
370
256
 
371
- m, n = a.shape
372
- work_group = [m]
373
- n = m if n.nil?
374
- cl_n = OpenCL::Int1.new(n || 1)
257
+ register_op :cond do |context, tensor, inputs|
258
+ pred = complete_eval(tensor.options[:pred], context)
375
259
 
376
- event = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
377
- output_buffer.op = event
378
- output_buffer
379
- when :softmax_grad
380
- a = _run(a, child_context)
381
- grad = _run(b, child_context)
382
- event_wait_list = [a.op].compact
383
- dtype = tensor.data_type
384
- output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
260
+ if all_true?(pred.buffer)
261
+ inputs[0]
262
+ else
263
+ inputs[1]
264
+ end
265
+ end
266
+
267
+ register_op :identity do |_context, _tensor, inputs|
268
+ inputs[0]
269
+ end
270
+
271
+ register_op :assign, noop: true do |context, tensor, inputs|
272
+ assign_var(tensor, inputs[1], context)
273
+ end
274
+
275
+ register_op :assign_add do |context, tensor, inputs|
276
+ value = execute_2_operand_func('add', tensor, inputs[0], inputs[1], context)
277
+ assign_var(tensor, value, context)
278
+ end
279
+
280
+ register_op :assign_sub do |context, tensor, inputs|
281
+ value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1], context)
282
+ assign_var(tensor, value, context)
283
+ end
284
+
285
+ %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
286
+ register_op op, noop: true do |context, tensor, inputs|
287
+ execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
288
+ end
289
+ end
385
290
 
291
+ %i[max add div sub mul pow sigmoid_grad].each do |op|
292
+ register_op op, noop: true do |context, tensor, inputs|
293
+ execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
294
+ end
295
+ end
296
+
297
+ register_op :where, noop: true do |context, tensor, inputs|
298
+ pred = tensor.options[:pred]
299
+ execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
300
+ end
301
+
302
+ register_op :matmul do |_context, tensor, inputs|
303
+ a, b = inputs
304
+
305
+ m = a.shape[0]
306
+ n = b.shape[1]
307
+ v = b.shape[0]
308
+ k = a.shape[1]
309
+
310
+ m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
311
+ n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]
312
+
313
+ result_shape = [m, n]
314
+
315
+ raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
316
+ raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2
317
+ raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
318
+
319
+ dtype = tensor.data_type
320
+ a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
321
+ output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
322
+
323
+ cl_m = OpenCL::Int1.new(m)
324
+ cl_n = OpenCL::Int1.new(n)
325
+ cl_k = OpenCL::Int1.new(k)
326
+
327
+ transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
328
+ transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
329
+
330
+ output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
331
+ output_buffer
332
+ end
333
+
334
+ register_op :cast do |_context, tensor, inputs|
335
+ a = inputs[0]
336
+ if a.data_type != tensor.data_type
337
+ buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
386
338
  m, n = a.shape
387
- work_group = [m]
388
- n = m if n.nil?
339
+ cl_m = OpenCL::Int1.new(m || 1)
389
340
  cl_n = OpenCL::Int1.new(n || 1)
390
- event = _cl_program("softmax_grad", dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
391
- output_buffer.op = event
392
- output_buffer
393
- when :sigmoid_grad
394
- execute_2_operand_func('sigmoid_grad', tensor, a, b, child_context)
395
- when :truncate
396
- a = _run(a, child_context)
397
- b = _run(b, child_context)
398
-
399
- if a.shape.size.zero?
341
+ work_group = [m || 1, n || 1]
342
+
343
+ buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
344
+ buffer
345
+ else
346
+ a
347
+ end
348
+ end
349
+
350
+ %i[sign exp tan cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round].each do |op|
351
+ register_op op, noop: true do |context, tensor, inputs|
352
+ execute_func(op.to_s, tensor, inputs[0], context)
353
+ end
354
+ end
355
+
356
+ register_op :softmax do |_context, tensor, inputs|
357
+ a = inputs[0]
358
+ event_wait_list = [a.op].compact
359
+ dtype = tensor.data_type
360
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
361
+
362
+ m, n = a.shape
363
+ work_group = [m]
364
+ n = m if n.nil?
365
+ cl_n = OpenCL::Int1.new(n || 1)
366
+
367
+ event = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
368
+ output_buffer.op = event
369
+ output_buffer
370
+ end
371
+
372
+ register_op :softmax_grad do |_context, tensor, inputs|
373
+ a, grad = inputs
374
+
375
+ event_wait_list = [a.op].compact
376
+ dtype = tensor.data_type
377
+ output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
378
+
379
+ m, n = a.shape
380
+ work_group = [m]
381
+ n = m if n.nil?
382
+ cl_n = OpenCL::Int1.new(n || 1)
383
+ event = _cl_program('softmax_grad', dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
384
+ output_buffer.op = event
385
+ output_buffer
386
+ end
387
+
388
+ register_op :truncate do |context, tensor, inputs|
389
+ a, b = inputs
390
+ if a.shape.size.zero?
391
+ a
392
+ else
393
+ input_b = read_final_result(b)
394
+ if a.shape == input_b
400
395
  a
401
396
  else
402
- input_b = read_final_result(b)
403
- if a.shape == input_b
404
- a
405
- else
406
- input_a = read_final_result(a)
407
- if input_b == []
408
- if a.buffer.size == 1
409
- a.shape = input_b
410
- a
411
- else
412
- wrap_opencl(a.buffer[0], data_type: a.data_type, name: tensor.name)
413
- end
397
+ input_a = read_final_result(a)
398
+ if input_b == []
399
+ if a.buffer.size == 1
400
+ a.shape = input_b
401
+ a
414
402
  else
415
- wrap_opencl(truncate(input_a, input_b), data_type: a.data_type, name: tensor.name)
403
+ wrap_opencl(a.buffer[0], data_type: a.data_type, name: tensor.name)
416
404
  end
405
+ else
406
+ wrap_opencl(truncate(input_a, input_b), data_type: a.data_type, name: tensor.name)
417
407
  end
418
408
  end
419
- when :check_numerics
420
- a = complete_eval(a, child_context)
421
- name = tensor.options[:name]
409
+ end
410
+ end
422
411
 
423
- a.buffer.each do |item|
424
- raise "#{name} Invalid Argument" if item.nan? || item.infinite?
425
- end
426
- a
427
- when :zeros, :ones, :zeros_like, :ones_like
428
- shape = if %i[zeros_like ones_like].include?(tensor.operation)
429
- _run(a, child_context).shape
430
- else
431
- read_final_result(complete_eval(a, child_context)) || tensor.shape.shape
432
- end
412
+ register_op :check_numerics, noop: true do |context, tensor, inputs|
413
+ a = complete_eval(inputs[0], context)
414
+ name = tensor.options[:name]
433
415
 
434
- func = if %i[zeros zeros_like].include?(tensor.operation)
435
- -> { tensor.data_type == :int32 ? 0 : 0.0 }
436
- else
437
- -> { tensor.data_type == :int32 ? 1 : 1.0 }
438
- end
416
+ a.buffer.each do |input|
417
+ raise "#{name} Invalid Argument" if input.nan? || input.infinite?
418
+ end
419
+ a
420
+ end
439
421
 
440
- size = shape.empty? ? 1 : shape.reduce(:*)
422
+ register_op :broadcast_transform do |context, tensor, inputs|
423
+ a, b = inputs
441
424
 
442
- buffer = if TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type)
443
- NArray.sfloat(size)
444
- elsif TensorStream::Ops::INTEGER_TYPES.include?(tensor.data_type)
445
- NArray.int(size)
446
- else
447
- raise "unsupported type #{tensor.data_type}"
448
- end
449
-
450
- data = if !shape.empty?
451
- Array.new(size) do |index|
452
- func.call
453
- end
454
- else
455
- func.call
456
- end
425
+ if a.shape == b.shape
426
+ [a, b]
427
+ else
428
+ input_a = read_final_result(complete_eval(a, context))
429
+ input_b = read_final_result(complete_eval(b, context))
430
+ b_a, b_b = broadcast(input_a, input_b)
431
+ [ wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
432
+ wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
433
+ end
434
+ end
457
435
 
458
- convert_to_opencl(data, shape, data_type: tensor.data_type, name: tensor.name)
459
- when :broadcast_transform
460
- a = _run(a, child_context)
461
- b = _run(b, child_context)
462
-
463
- if a.shape == b.shape
464
- [a, b]
465
- else
466
- input_a = read_final_result(complete_eval(a, child_context))
467
- input_b = read_final_result(complete_eval(b, child_context))
468
- b_a, b_b = broadcast(input_a, input_b)
469
- [ wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
470
- wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
471
- end
472
- when :print
473
- a = _run(a, child_context)
474
- b = _run(b, child_context)
475
- input_b = complete_eval(b, child_context)
476
- input_b = read_final_result(input_b)
477
- puts "#{tensor.options.fetch(:message, '')} #{input_b}"
478
- a
479
- when :rank
480
- a = _run(a, child_context)
481
- wrap_opencl(a.shape.size, data_type: tensor.data_type, name: tensor.name)
482
- when :stop_gradient
483
- _run(a, child_context)
484
- when :slice
485
- input_a = complete_eval(a, child_context)
486
- input_b = read_final_result(complete_eval(b, child_context))
487
- size = tensor.options[:size]
488
-
489
- slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse
490
-
491
- new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
492
- sliced = new_buf.slice[*slice_param]
493
- convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: a.data_type, name: tensor.name)
494
- when :transpose
495
- input_a = complete_eval(a, child_context)
496
- t_param = Array.new(input_a.shape.size) { |index| index }.reverse
497
- transposed = input_a.buffer.reshape(*input_a.shape.reverse).transpose(*t_param)
498
- convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: a.data_type, name: tensor.name)
499
- when :index
500
- a = complete_eval(a, child_context)
501
- input_a = read_final_result(a)
502
- index = read_final_result(complete_eval(b, child_context))
503
-
504
- if a.is_a?(Array)
505
- a[index]
506
- else
507
- new_shape = a.shape.dup
508
- new_shape.shift
509
- convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
510
- end
511
- when :broadcast_gradient_args
512
- a = complete_eval(a, child_context)
513
- b = complete_eval(b, child_context)
514
-
515
- wrap_opencl(get_broadcast_gradient_args(a.buffer.to_a, b.buffer.to_a), data_type: a.data_type, name: tensor.name)
516
- when :shape
517
- a = _run(a, child_context)
518
-
519
- wrap_opencl(a.shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
520
- when :reshape
521
- arr = complete_eval(a, child_context)
522
- new_shape = read_final_result(complete_eval(b, child_context))
523
-
524
- if new_shape.size.zero? && arr.buffer.size == 1
525
- arr.shape = new_shape
526
- arr
527
- else
528
- new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
529
- arr.shape = new_shape
530
- arr
531
- end
532
- when :random_uniform
533
- maxval = tensor.options.fetch(:maxval, 1)
534
- minval = tensor.options.fetch(:minval, 0)
535
- seed = tensor.options[:seed]
536
-
537
- random = _get_randomizer(tensor, seed)
538
- generator = -> { random.rand * (maxval - minval) + minval }
539
- shape = tensor.options[:shape] || tensor.shape.shape
540
-
541
- convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
542
- when :random_normal
543
- random = _get_randomizer(tensor, seed)
544
- r = RandomGaussian.new(tensor.options.fetch(:mean), tensor.options.fetch(:stddev), -> { random.rand })
545
- random = _get_randomizer(tensor, seed)
546
- generator = -> { r.rand }
547
- shape = tensor.options[:shape] || tensor.shape.shape
548
-
549
- convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
550
- when :glorot_uniform
551
- random = _get_randomizer(tensor, seed)
552
-
553
- shape = tensor.options[:shape] || tensor.shape.shape
554
- fan_in, fan_out = if shape.size.zero?
555
- [1, 1]
556
- elsif shape.size == 1
557
- [1, shape[0]]
558
- else
559
- [shape[0], shape.last]
560
- end
561
-
562
- limit = Math.sqrt(6.0 / (fan_in + fan_out))
563
-
564
- minval = -limit
565
- maxval = limit
566
-
567
- generator = -> { random.rand * (maxval - minval) + minval }
568
- convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
569
- when :flow_group
570
- tensor.items.collect { |item| _run(item, child_context) }
571
- when :sum
572
- reduction(child_context, tensor, a, b, :sum)
573
- when :mean
574
- reduction(child_context, tensor, a, b, :mean)
575
- when :prod
576
- input_a = complete_eval(a, child_context)
577
- if input_a.buffer.empty?
578
- convert_to_opencl([1.0], [], data_type: a.data_type, name: tensor.name)
579
- else
580
- reduction(child_context, tensor, a, b, :prod)
581
- end
582
- when :argmin
583
- a = complete_eval(a, child_context)
584
- axis = tensor.options[:axis] || 0
585
- arr = a.buffer.reshape(*a.shape.reverse).to_a
586
- op = get_op_with_axis(arr, axis, 0, a.data_type, ->(a, b) { a < b })
587
- convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
588
- when :argmax
589
- a = complete_eval(a, child_context)
590
- axis = tensor.options[:axis] || 0
591
- arr = a.buffer.reshape(*a.shape.reverse).to_a
592
- op = get_op_with_axis(arr, axis, 0, a.data_type, ->(a, b) { a > b })
593
- convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
436
+ register_op :print do |context, tensor, inputs|
437
+ a, b = inputs
438
+ input_b = complete_eval(b, context)
439
+ input_b = read_final_result(input_b)
440
+ puts "#{tensor.options.fetch(:message, '')} #{input_b}"
441
+ a
442
+ end
443
+
444
+ register_op :rank do |_context, tensor, inputs|
445
+ wrap_opencl(inputs[0].shape.size, data_type: tensor.data_type, name: tensor.name)
446
+ end
447
+
448
+ register_op :stop_gradient do |_context, _tensor, inputs|
449
+ inputs[0]
450
+ end
451
+
452
+ register_op :slice, noop: true do |context, tensor, inputs|
453
+ input_a = complete_eval(inputs[0], context)
454
+ input_b = read_final_result(complete_eval(inputs[1], context))
455
+ size = tensor.options[:size]
456
+
457
+ slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse
458
+
459
+ new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
460
+ sliced = new_buf.slice[*slice_param]
461
+ convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
462
+ end
463
+
464
+ register_op :transpose, buffer: true do |_context, tensor, inputs|
465
+ t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
466
+ transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
467
+ convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
468
+ end
469
+
470
+ register_op :index, buffer: true do |_context, tensor, inputs|
471
+ a = inputs[0]
472
+ input_a = read_final_result(a)
473
+ index = read_final_result(inputs[1])
474
+
475
+ if a.is_a?(Array)
476
+ a[index]
594
477
  else
595
- raise "unknown op #{tensor.operation}"
596
- end.tap do |result|
478
+ new_shape = a.shape.dup
479
+ new_shape.shift
480
+ convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
481
+ end
482
+ end
483
+
484
+ register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
485
+ wrap_opencl(get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a), data_type: inputs[0].data_type, name: tensor.name)
486
+ end
487
+
488
+ register_op :shape do |_context, tensor, inputs|
489
+ wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
490
+ end
491
+
492
+ register_op :reshape, buffer: true do |_context, _tensor, inputs|
493
+ arr = inputs[0]
494
+ new_shape = read_final_result(inputs[1])
495
+
496
+ if new_shape.size.zero? && arr.buffer.size == 1
497
+ arr.shape = new_shape
498
+ arr
499
+ else
500
+ new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
501
+ arr.shape = new_shape
502
+ arr
503
+ end
504
+ end
505
+
506
+ register_op :flow_group do |_context, _tensor, inputs|
507
+ inputs
508
+ end
509
+
510
+ %i[sum mean].each do |op|
511
+ register_op op, noop: true do |context, tensor, inputs|
512
+ reduction(context, tensor, inputs[0], inputs[1], op.to_sym)
513
+ end
514
+ end
515
+
516
+ register_op :prod, noop: true do |context, tensor, inputs|
517
+ input_a = complete_eval(inputs[0], context)
518
+ if input_a.buffer.empty?
519
+ convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
520
+ else
521
+ reduction(context, tensor, inputs[0], inputs[1], :prod)
522
+ end
523
+ end
524
+
525
+ register_op :argmin, buffer: true do |_context, tensor, inputs|
526
+ axis = tensor.options[:axis] || 0
527
+ arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
528
+ op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
529
+ convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
530
+ end
531
+
532
+ register_op :argmax, buffer: true do |_context, tensor, inputs|
533
+ axis = tensor.options[:axis] || 0
534
+ arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
535
+ op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
536
+ convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
537
+ end
538
+
539
+ def eval_operation(tensor, child_context)
540
+ return @context[tensor.name] if @context.key?(tensor.name)
541
+ cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
542
+ return @context[cache_key] if @context.key?(cache_key)
543
+ # puts tensor.name
544
+ invoke(tensor, child_context).tap do |result|
597
545
  # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
598
546
  if tensor.breakpoint
547
+ a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
548
+ b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
599
549
  a = read_final_result(complete_eval(a, child_context))
600
550
  b = read_final_result(complete_eval(b, child_context))
601
551
  result = read_final_result(complete_eval(result, child_context))
@@ -642,7 +592,7 @@ module TensorStream
642
592
  def eval_tensor(tensor, child_context)
643
593
  return tensor unless tensor.is_a?(Tensor)
644
594
 
645
- cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
595
+ cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
646
596
  return @context[cache_key] if @context.key?(cache_key)
647
597
  return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]
648
598
  @context[cache_key] = if tensor.value.is_a?(Tensor)
@@ -656,7 +606,7 @@ module TensorStream
656
606
  private
657
607
 
658
608
  def assign_var(tensor, b, child_context)
659
- assign = tensor.items[0] || tensor
609
+ assign = tensor.inputs[0] || tensor
660
610
  buffer = complete_eval(b, child_context)
661
611
 
662
612
  if assign.buffer
@@ -678,7 +628,7 @@ module TensorStream
678
628
  dtype = tensor.data_type
679
629
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
680
630
 
681
- output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
631
+ output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
682
632
  a, b, prog, switch_operands = select_program(a, b, op_name)
683
633
  m, n = result_shape
684
634
  work_group = [m || 1, n || 1]
@@ -688,6 +638,7 @@ module TensorStream
688
638
 
689
639
  event_wait_list = [a.op, b.op].compact # add dependency wait list
690
640
 
641
+ method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
691
642
  event = if prog == "#{op_name}_b"
692
643
  cl_m_b, cl_n_b = if b.shape.size == 2
693
644
  [ OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1]) ]
@@ -696,9 +647,9 @@ module TensorStream
696
647
  else
697
648
  raise "rank > 2 not supported!"
698
649
  end
699
- _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
650
+ _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
700
651
  else
701
- _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
652
+ _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
702
653
  end
703
654
 
704
655
  output_buffer.op = event
@@ -784,7 +735,7 @@ module TensorStream
784
735
  value = [value]
785
736
  end
786
737
 
787
- cache_key = "_cl_object_#{name}_#{shape.join('_')}"
738
+ cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
788
739
  cl_object = if name && @context[:_cache][cache_key]
789
740
  @context[:_cache][cache_key]
790
741
  else
@@ -813,13 +764,13 @@ module TensorStream
813
764
  if element.is_a?(Tensor)
814
765
  cl_object.buffer[index] = read_final_result(complete_eval(element, {}))
815
766
  else
816
- cl_object.buffer[index] = Tensor.cast_dtype(element, data_type)
767
+ cl_object.buffer[index] = ( data_type == :boolean ? ( element ? 1 : 0 ) : Tensor.cast_dtype(element, data_type))
817
768
  end
818
769
  end
819
770
  elsif value.is_a?(NArray)
820
771
  cl_object.buffer = value
821
772
  else
822
- cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
773
+ cl_object.buffer[0] = ( data_type == :boolean ? ( value ? 1 : 0 ) : Tensor.cast_dtype(value, data_type)) # scalar branch: test `value` itself; `element` is only used in the Array branch above
823
774
  end
824
775
 
825
776
  write_op = if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
@@ -840,14 +791,14 @@ module TensorStream
840
791
  when :int16
841
792
  NArray.sint(narray_size)
842
793
  when :boolean
843
- NArray.int(narray_size)
794
+ NArray.sint(narray_size)
844
795
  else
845
796
  raise "unsupported type #{data_type}"
846
797
  end
847
798
  end
848
799
 
849
800
  def _create_result_buffer(data_type, shape, name)
850
- @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}"] ||= begin
801
+ @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}:#{object_id}"] ||= begin
851
802
  size = shape.empty? ? 1 : shape.reduce(:*)
852
803
  buffer = allocate_narray_for_type(data_type, size)
853
804
  cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
@@ -1029,7 +980,6 @@ module TensorStream
1029
980
 
1030
981
  def resolve_placeholder(placeholder, _execution_context = {})
1031
982
  return nil if placeholder.nil?
1032
- return placeholder if retain.include?(placeholder)
1033
983
 
1034
984
  var = if placeholder.is_a?(Placeholder)
1035
985
  @context[placeholder.name.to_sym].tap do |c|
@@ -1056,7 +1006,7 @@ module TensorStream
1056
1006
  reduced_val = r[0]
1057
1007
  if r.size > 1
1058
1008
  reduced_val = f.call(r[0..val.size])
1059
- elsif r.size == 0
1009
+ elsif r.size.zero?
1060
1010
  reduced_val = f.call(nil)
1061
1011
  end
1062
1012
  keep_dims ? [ reduced_val ] : reduced_val
@@ -1143,3 +1093,5 @@ module TensorStream
1143
1093
  end
1144
1094
  end
1145
1095
  end
1096
+
1097
+ TensorStream::Evaluator.register_evaluator(TensorStream::Evaluator::OpenclEvaluator, 'opencl', 1)