tensor_stream 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +5 -5
  2. data/.rubocop.yml +6 -1
  3. data/CHANGELOG.md +10 -0
  4. data/README.md +35 -0
  5. data/lib/tensor_stream.rb +2 -2
  6. data/lib/tensor_stream/debugging/debugging.rb +2 -1
  7. data/lib/tensor_stream/dynamic_stitch.rb +23 -24
  8. data/lib/tensor_stream/evaluator/base_evaluator.rb +27 -18
  9. data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +16 -0
  10. data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +24 -0
  11. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +6 -1
  12. data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +6 -6
  13. data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +237 -107
  14. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +97 -7
  15. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +230 -123
  16. data/lib/tensor_stream/exceptions.rb +1 -0
  17. data/lib/tensor_stream/graph_builder.rb +2 -3
  18. data/lib/tensor_stream/graph_deserializers/protobuf.rb +22 -23
  19. data/lib/tensor_stream/graph_serializers/graphml.rb +26 -29
  20. data/lib/tensor_stream/graph_serializers/pbtext.rb +22 -19
  21. data/lib/tensor_stream/helpers/string_helper.rb +4 -5
  22. data/lib/tensor_stream/math_gradients.rb +141 -77
  23. data/lib/tensor_stream/nn/nn_ops.rb +4 -6
  24. data/lib/tensor_stream/operation.rb +139 -120
  25. data/lib/tensor_stream/ops.rb +36 -3
  26. data/lib/tensor_stream/session.rb +7 -11
  27. data/lib/tensor_stream/tensor.rb +3 -3
  28. data/lib/tensor_stream/tensor_shape.rb +5 -0
  29. data/lib/tensor_stream/train/gradient_descent_optimizer.rb +4 -37
  30. data/lib/tensor_stream/train/momentum_optimizer.rb +48 -0
  31. data/lib/tensor_stream/train/optimizer.rb +129 -0
  32. data/lib/tensor_stream/train/saver.rb +0 -1
  33. data/lib/tensor_stream/train/slot_creator.rb +62 -0
  34. data/lib/tensor_stream/train/utils.rb +11 -12
  35. data/lib/tensor_stream/trainer.rb +3 -0
  36. data/lib/tensor_stream/utils.rb +18 -11
  37. data/lib/tensor_stream/variable.rb +19 -12
  38. data/lib/tensor_stream/variable_scope.rb +1 -1
  39. data/lib/tensor_stream/version.rb +1 -1
  40. data/samples/iris.rb +2 -1
  41. data/samples/linear_regression.rb +3 -1
  42. data/samples/nearest_neighbor.rb +2 -0
  43. data/test_samples/neural_network_raw.py +101 -0
  44. data/test_samples/raw_neural_net_sample.rb +6 -4
  45. data/test_samples/test2.py +73 -27
  46. metadata +9 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA256:
-   metadata.gz: d42a81e850271f080d408c52f2bea15a07c6d41ee3c6790dc04e48f2ab485364
-   data.tar.gz: a4aedfd3c9a532f31ea195c58124644fcae143726d37daa7a4a6afc6b39f439b
+ SHA1:
+   metadata.gz: f84c2b9852fcf4931c47c0130b67497a50a87b0f
+   data.tar.gz: 524e1105da4e06e3472cbcfa0e6f764ae4512d37
  SHA512:
-   metadata.gz: 7d9fff1a8af14878c50469cfcee7942d2800906fe388504261290958628861b29d973ea65a3cf986cd1657acf490d2bc7164ec9ec2c14dc7c5d6c25121c6737f
-   data.tar.gz: 42ef6af8fafd1a7f7f069e03f8c344bec87bd15217def07d859d4b33374a038e1b7ed54ac685b901d3ed9b51fd351300cc61553ca45f0591fc4a7c5e50bcee53
+   metadata.gz: 420e2675ab67d4c8462534bdf8c703671656f7852d984579e22ee57f1425dd5740fcb64a1e52363bf337cd7c691d87a75c76bd868b13c8a7f06d78e0eb00aa73
+   data.tar.gz: 24fe1022741883d46cdd5af51309da33d421d72874f0cc84bf2e0ed14a62602f1830c6060bd86e42359b7962b4a57727c9a48ce13d5950d5ba02f6a9cdfd719f
data/.rubocop.yml CHANGED
@@ -6,6 +6,10 @@ AllCops:
    - tensor_stream.gemspec
    - Rakefile
 
+ Naming/AccessorMethodName:
+   Exclude:
+     - lib/tensor_stream/utils.rb
+
  Style/StringLiterals:
    Enabled: false
 
@@ -81,4 +85,5 @@ Style/TrailingCommaInHashLiteral:
 
  Naming/UncommunicativeMethodParamName:
    Exclude:
-   - lib/tensor_stream/evaluator/ruby_evaluator.rb
+   - lib/tensor_stream/evaluator/ruby_evaluator.rb
+   - lib/tensor_stream/ops.rb
data/CHANGELOG.md CHANGED
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+ ## [0.8.0] - 2018-08-29
+ ### Added
+ - [TRAINING] Added a new supported optimizer, MomentumOptimizer, loosely based on TensorFlow's implementation (with Nesterov support)
+ - [NEW OP] fill, stack, atan, cumprod, gather, invert_permutation, setdiff1d
+
+ ### Fixes
+ - Fixed the device delegator not picking the correct evaluator to use in some cases
+ - [GRADIENTS] Properly implemented gradient computation for prod, tile, transpose
+ - Fixed gradient computation for softmax_cross_entropy_with_logits_v2 (now based on TensorFlow's implementation)
+
  ## [0.7.0] - 2018-08-08
  ### Added
  - [NEW OP] expand_dims, min, acos, asin, add_n
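
For context on the MomentumOptimizer entry above, a minimal usage sketch against the 0.8.0 API. This is illustrative only: the toy graph is made up, and helper names such as `tf.square` and `global_variables_initializer` follow the TensorFlow-style API the project advertises rather than anything shown in this diff.

```ruby
require 'tensor_stream'

tf = TensorStream

# Minimize a toy quadratic f(x) = (x - 3)^2 with momentum updates.
x = tf.variable(0.0, dtype: :float32, name: 'x')
loss = tf.square(x - 3.0)

# learning_rate = 0.1, momentum = 0.9; Nesterov support per the changelog.
optimizer = TensorStream::Train::MomentumOptimizer.new(0.1, 0.9, use_nesterov: true)
train_op = optimizer.minimize(loss)

sess = tf.session
sess.run(tf.global_variables_initializer)
50.times { sess.run(train_op) }
puts sess.run(x) # should approach 3.0
```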
data/README.md CHANGED
@@ -324,6 +324,41 @@ result = a + b
  File.write("model.pbtext", result.graph.as_graph_def)
  ```
 
+ ## Performance notes
+
+ Comparative benchmarks against other Ruby libraries have not yet been performed. However, it is
+ notable that TruffleRuby and ruby 2.6.0-preview2 with the --jit flag perform considerably better
+ than previous versions of Ruby (< 2.6).
+
+ Benchmarks from running samples/linear_regression.rb on an Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz:
+
+ ruby 2.4
+
+ ```
+ $ ruby -v
+ ruby 2.4.0p0 (2016-12-24 revision 57164) [x86_64-linux]
+ $ ruby samples/linear_regression.rb
+ 495 seconds 1000 epochs
+ ```
+
+ ruby 2.6.0-preview2
+
+ ```
+ $ ruby -v
+ ruby 2.6.0preview2 (2018-05-31 trunk 63539) [x86_64-linux]
+ $ ruby --jit samples/linear_regression.rb
+ 394 seconds 10000 epochs
+ ```
+
+ truffleruby
+
+ ```
+ $ ruby -v
+ truffleruby 1.0.0-rc5, like ruby 2.4.4, GraalVM CE Native [x86_64-linux]
+ 219 seconds 10000 epochs
+ ```
+
+ For training large networks that work on images, the OpenCL evaluator is the only way to go.
+
  ## Roadmap
 
  - Docs
data/lib/tensor_stream.rb CHANGED
@@ -20,8 +20,6 @@ require 'tensor_stream/operation'
  require 'tensor_stream/placeholder'
  require 'tensor_stream/control_flow'
  require 'tensor_stream/dynamic_stitch'
- require 'tensor_stream/train/utils'
- require 'tensor_stream/trainer'
  require 'tensor_stream/nn/nn_ops'
  require 'tensor_stream/evaluator/evaluator'
  require 'tensor_stream/graph_serializers/serializer'
@@ -31,6 +29,8 @@ require 'tensor_stream/graph_serializers/graphml'
  require 'tensor_stream/math_gradients'
  require "tensor_stream/debugging/debugging"
  require 'tensor_stream/utils'
+ require 'tensor_stream/train/utils'
+ require 'tensor_stream/trainer'
 
  # require 'tensor_stream/libraries/layers'
  require 'tensor_stream/monkey_patches/integer'
data/lib/tensor_stream/debugging/debugging.rb CHANGED
@@ -9,8 +9,9 @@ module TensorStream
        nodes_to_process.each do |node|
          node.inputs = node.inputs.collect do |input|
            next if input.nil?
+           next input if input.is_a?(Variable)
 
-           if TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
+           if input.is_a?(Tensor) && TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
              TensorStream.check_numerics(input, "#{node.name}/#{input.name}", name: "check/#{node.name}/#{input.name}")
            else
              input
data/lib/tensor_stream/dynamic_stitch.rb CHANGED
@@ -1,28 +1,27 @@
  module TensorStream
-   # Defines a TensorStream controlflow op
-   class DynamicStitch < Operation
-     attr_accessor :ops
-
-     def initialize(flow_type, inputs, ops = nil, options = {})
-       setup_initial_state(options)
-
-       @operation = :"flow_#{flow_type}"
-       @inputs = inputs
+   # Defines a TensorStream controlflow op
+   class DynamicStitch < Operation
+     attr_accessor :ops
 
-       @data_type = Tensor.detect_type(inputs[1])
-       @name = [@graph.get_name_scope, options[:name] || set_name].compact.join('/')
-       @ops = ops
-       @shape = TensorShape.new([inputs.size])
-       @graph.add_node(self)
-     end
-
-     def set_data_type(_passed_data_type)
-       :unknown
-     end
-
-     def run
-       eval
-     end
+     def initialize(flow_type, inputs, ops = nil, options = {})
+       setup_initial_state(options)
+
+       @operation = :"flow_#{flow_type}"
+       @inputs = inputs
+
+       @data_type = Tensor.detect_type(inputs[1])
+       @name = [@graph.get_name_scope, options[:name] || set_name].compact.join('/')
+       @ops = ops
+       @shape = TensorShape.new([inputs.size])
+       @graph.add_node(self)
+     end
+
+     def set_data_type(_passed_data_type)
+       :unknown
+     end
+
+     def run
+       eval
      end
    end
-
+ end
data/lib/tensor_stream/evaluator/base_evaluator.rb CHANGED
@@ -2,13 +2,14 @@ module TensorStream
    # Evaluator base module
    module Evaluator
      class OutputGroup
-       attr_accessor :outputs
-       def initialize(outputs = [])
+       attr_accessor :outputs, :data_types
+       def initialize(outputs = [], data_types = [])
          @outputs = outputs
+         @data_types = data_types
        end
      end
 
-     class UnsupportedOp < Exception
+     class UnsupportedOp < RuntimeError
        def initialize(tensor)
          @tensor = tensor
        end
@@ -111,22 +112,13 @@ module TensorStream
 
        resolved_inputs = tensor.inputs.map do |i|
          next if i.nil?
+         next i if op_options[:noop]
 
          if i.is_a?(Array)
-           next i.collect { |sub_item| sub_item.is_a?(Tensor) ? invoke(sub_item, execution_context) : sub_item }
+           next i.collect { |sub_item| sub_item.is_a?(Tensor) ? global_eval(tensor, sub_item, execution_context) : sub_item }
          end
 
-         if !op_options[:noop] && @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
-           cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
-           next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
-
-           result = @session.delegate_to_evaluator(i, @context, execution_context)
-           convert_from_buffer(i, result).tap do |buffer|
-             @context[:_cache][cache_key] = buffer if i.is_const
-           end
-         else
-           prepare_input(i, execution_context, op_options)
-         end
+         global_eval(tensor, i, execution_context, op_options)
        end
 
        instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
@@ -134,6 +126,23 @@ module TensorStream
 
      protected
 
+     def global_eval(tensor, input, execution_context, op_options = {})
+       return nil unless input
+       return input unless input.is_a?(Tensor)
+
+       if object_id != @context[:_cache][:placement][input.name][1].object_id # tensor is on another device or evaluator
+         cache_key = "#{tensor.graph.object_id}_#{input.name}:#{object_id}"
+         return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
+
+         result = @session.delegate_to_evaluator(input, @context, execution_context)
+         convert_from_buffer(input, result).tap do |buffer|
+           @context[:_cache][cache_key] = buffer if input.is_const
+         end
+       else
+         prepare_input(input, execution_context, op_options)
+       end
+     end
+
      def get_broadcast_gradient_args(input_a, input_b)
        return [[], []] if input_a == input_b
 
@@ -153,16 +162,16 @@ module TensorStream
        end
      end
 
-       [input_a_args.reverse, input_b_args.reverse]
+       [input_a_args.reverse, input_b_args.reverse]
      end
 
      ##
      # converts from a ruby Buffer object to the evaluator's native buffer format
-     def convert_from_buffer(tensor, result)
+     def convert_from_buffer(_tensor, _result)
        raise "need implementation"
      end
 
-     def prepare_input(tensor, context, options = {})
+     def prepare_input(_tensor, _context, _options = {})
        raise "need implementation"
      end
    end
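
A note on the OutputGroup change above: multi-output ops (such as the reworked softmax_cross_entropy_with_logits_v2 later in this diff) now carry a per-output list of data types alongside the outputs themselves, so `convert_from_buffer` implementations can convert each output with its own dtype when a result crosses evaluators. A minimal sketch, where `loss_buf` and `backprop_buf` are placeholders standing in for evaluator-native buffers:

```ruby
group = TensorStream::Evaluator::OutputGroup.new([loss_buf, backprop_buf], [:float32, :float32])

group.outputs.zip(group.data_types).each do |output, data_type|
  # convert `output` using its own `data_type` instead of assuming the op's dtype
end
```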
data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl ADDED
@@ -0,0 +1,16 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ // same dimension add floating point op
+ __kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
+                                           __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+     const int index = globalRow * N + globalCol;
+     <%= c_dtype %> acc_m = acc[index];
+     acc[index] = acc_m * momentum[0] + grad[index];
+     <% if nesterov %>
+       output[index] -= grad[index] * learning_rate[0] + acc_m * momentum[0] * learning_rate[0];
+     <% else %>
+       output[index] -= acc_m * learning_rate[0];
+     <% end %>
+ }
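
In update-rule form, writing g for the gradient, eta for the learning rate, mu for the momentum coefficient, theta for the variable and a for the accumulator slot, the kernel above computes the following (note that both variable updates use the accumulator value read before the in-place update):

```latex
a'      = \mu a + g
\theta' = \theta - \eta a               % standard momentum
\theta' = \theta - \eta g - \eta \mu a  % with use_nesterov
```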
data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl ADDED
@@ -0,0 +1,24 @@
+ % ctype = dtype_to_c_type(data_type)
+
+ __kernel void pack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
+     // Get the index of the current element to be processed
+     const int globalCol = get_global_id(0); // Col ID of C (0..N)
+
+     int start = index * <%= divisors[0] %>;
+     int ptr = start + globalCol;
+     int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
+
+     // compute effective coordinates
+     <% divisors.each_with_index do |div, index| %>
+     index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1 %>ptr = ptr % <%= div %>;<% end %><% end %>
+
+     // Apply axis translation if needed
+     <% if axis > 0 %>
+     int first = index_map[0];
+     <% axis.times do |i| %>
+     index_map[<%= i %>] = index_map[<%= (i + 1) %>];<% end %>
+     index_map[<%= axis %>] = first;
+     <% end %>
+
+     C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
+ }
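
The `divisors`, `multipliers` and `axis` bindings here are supplied by the new :stack op in opencl_evaluator.rb further down: `divisors` are row-major strides of the pre-rotation shape [inputs.size, *input_shape], and `multipliers` are strides of the final (axis-rotated) output shape. A hypothetical CPU-side mirror of the same index arithmetic, handy for sanity-checking the kernel logic (the helper name `pack_offset` is made up for illustration):

```ruby
# Map element `col` of input number `input_index` to its flat offset
# in the stacked output buffer.
def pack_offset(input_index, col, divisors, multipliers, axis)
  ptr = input_index * divisors[0] + col
  coords = divisors.map { |d| c = ptr / d; ptr %= d; c }  # flat index -> coordinates
  # rotate the leading (stack) coordinate into position `axis`, as the kernel does
  coords = coords[1..axis] + [coords[0]] + coords[(axis + 1)..-1] if axis > 0
  coords.zip(multipliers).sum { |c, m| c * m }            # coordinates -> output offset
end
```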
data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl CHANGED
@@ -1,9 +1,10 @@
+
  // First naive implementation
  % c_dtype = dtype_to_c_type(dtype)
  __kernel void softmax_cross_<%= dtype %>(const int N,
                                           const __global <%= c_dtype %>* A,
                                           const __global <%= c_dtype %>* L,
-                                          __global <%= c_dtype %>* C) {
+                                          __global <%= c_dtype %>* C, __global <%= c_dtype %>* P) {
 
    // Get the index of the current element to be processed
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
@@ -24,4 +25,8 @@ __kernel void softmax_cross_<%= dtype %>(const int N,
    for (int k=0; k < N; k++) {
      C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
    }
+
+   for (int k=0; k < N; k++) {
+     P[globalRow*N + k] = (exp(A[globalRow*N + k] - max) / acc) - L[globalRow*N + k];
+   }
  }
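
With logits a_k, labels l_k, row max m and acc = sum_j exp(a_j - m), the two loops above compute, per row element:

```latex
s_k = \frac{e^{a_k - m}}{\sum_j e^{a_j - m}}, \qquad
C_k = \Bigl(\log\sum_j e^{a_j - m} - (a_k - m)\Bigr)\, l_k = -\log(s_k)\, l_k, \qquad
P_k = s_k - l_k
```

The new P buffer is exactly the softmax-minus-labels backprop term that the fixed softmax_cross_entropy_with_logits_v2 gradient (see the evaluator changes below) consumes.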
data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb CHANGED
@@ -16,10 +16,6 @@ module TensorStream
 
      def to_ruby
        return [] if buffer.empty?
-       if shape.empty?
-         return buffer[0] != 0 if data_type == :boolean
-         return buffer[0]
-       end
 
        if dirty
          op.command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
@@ -27,9 +23,13 @@ module TensorStream
          self.dirty = false
        end
 
+       if shape.empty?
+         return buffer[0] != 0 if data_type == :boolean
+         return buffer[0]
+       end
+
        result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
-       result = process_function_op(result, ->(a, _b) { a != 0 }) if data_type == :boolean
-       result
+       data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
      end
    end
  end
data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb CHANGED
@@ -30,6 +30,7 @@ module TensorStream
    ## PURE ruby evaluator used for testing and development
    class OpenclEvaluator < BaseEvaluator
      attr_accessor :retain
+     attr_reader :opencl_device
 
      include TensorStream::OpHelper
      include TensorStream::ArrayOpsHelper
@@ -51,20 +52,20 @@ module TensorStream
 
      def self.fetch_device(query = [])
        devices = query_devices_with_score
-       platform_devices = devices.select { |d| d[0].platform.to_s.gsub(' ','_').downcase =~ /#{query[0].downcase}/ }
+       platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
        opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
      end
 
      def self.opencl_to_device(d)
        device = d[0]
        index = d[3]
-       platform_name = device.platform.name.gsub(' ', '_').downcase
+       platform_name = device.platform.name.tr(' ', '_').downcase
        uri = [platform_name, index].join(':')
 
        device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
 
-       OpenclDevice.new(uri, device_type, self).tap do |d|
-         d.native_device = device
+       OpenclDevice.new(uri, device_type, self).tap do |devide|
+         devide.native_device = device
        end
      end
 
@@ -96,8 +97,14 @@ module TensorStream
        end
      end
 
+     # buffer comes from non-opencl evaluator
      def convert_from_buffer(tensor, result)
-       convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
+       if result.buffer.is_a?(TensorStream::Evaluator::OutputGroup)
+         converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map { |output, data_type| convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name) }
+         TensorStream::Evaluator::OutputGroup.new(converted_outputs, result.buffer.data_types)
+       else
+         convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
+       end
      end
 
      def complete_eval(tensor, context)
@@ -106,7 +113,7 @@ module TensorStream
        if buffer.is_a?(Array)
          buffer = buffer.collect do |b|
            next b if b.buffer.size.zero?
-           _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: [b.op].compact)
+           _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
            b
          end
        else
@@ -114,14 +121,30 @@ module TensorStream
          return buffer if buffer.nil?
          return [] if buffer.buffer.nil?
          return buffer if buffer.buffer.size.zero?
-         _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
+         _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
        end
        _opencl_queue.finish
        buffer
      end
 
-     def opencl_device
-       @opencl_device
+     def self.query_devices_with_score
+       OpenCL.platforms.flat_map do |p|
+         p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
+           score = 0
+           if d.type.to_s == 'CPU'
+             score += 1
+           elsif d.type.to_s == 'GPU'
+             score += 4
+           end
+
+           score += 1000 if d.platform.name == 'NVIDIA CUDA'
+
+           score += d.max_compute_units
+           score += d.max_clock_frequency
+
+           [d, score, p.name, index]
+         end
+       end
      end
 
      protected
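
Since `query_devices_with_score` now sits with `fetch_device` above the protected section, it can be used directly to inspect what the device delegator will choose from. A hedged sketch (actual platform and device names depend on the machine):

```ruby
# Each entry is [device, score, platform_name, device_index].
TensorStream::Evaluator::OpenclEvaluator.query_devices_with_score.each do |device, score, platform, index|
  puts format('%s #%d (%s) score=%d', platform, index, device.type, score)
end

# Query format: [platform_substring, device_index], spaces written as underscores.
device = TensorStream::Evaluator::OpenclEvaluator.fetch_device(['nvidia_cuda', 0])
```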
@@ -152,31 +175,9 @@ module TensorStream
        @opencl_context = OpenCL.create_context(opencl_device)
      end
 
-     def self.query_devices_with_score
-       OpenCL.platforms.flat_map do |p|
-
-         p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
-           score = 0
-           if d.type.to_s == 'CPU'
-             score += 1
-           elsif d.type.to_s == 'GPU'
-             score += 4
-           end
-
-           if d.platform.name == 'NVIDIA CUDA'
-             score += 1000
-           end
-
-           score += d.max_compute_units
-           score += d.max_clock_frequency
-
-           [d, score, p.name, index]
-         end
-       end
-     end
-
      def create_command_queue
        supported_proprties = opencl_device.queue_properties.names
+
        properties = []
        properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
        properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
@@ -196,7 +197,7 @@ module TensorStream
      end
 
      def _cl_program(kernel, args = {})
-       suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
+       suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
        @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
          filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
          raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
@@ -211,6 +212,13 @@ module TensorStream
        end
      end
 
+     def escape_arg_content(value)
+       return value.tr(' ', '_') if value.is_a?(String)
+       return value.join('-') if value.is_a?(Array)
+
+       value
+     end
+
      def _run(tensor, execution_context)
        return tensor if tensor.is_a?(OpenCLBuffer)
        return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array) && !tensor.size.empty? && tensor[0].is_a?(Tensor)
@@ -236,7 +244,7 @@ module TensorStream
        res
      end
 
-     def eval_variable(tensor, child_context)
+     def eval_variable(tensor, _child_context)
        raise "variable #{tensor.name} not initalized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
        tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
        tensor.buffer
@@ -259,7 +267,10 @@ module TensorStream
        end
      end
 
-     register_op :identity do |_context, _tensor, inputs|
+     register_op :identity do |context, tensor, inputs|
+       if tensor.inputs.size > 1
+         tensor.inputs[1..inputs.size].each { |input| complete_eval(input, context) }
+       end
        inputs[0]
      end
 
@@ -277,18 +288,19 @@ module TensorStream
        assign_var(tensor, value, context)
      end
 
+     register_op :variable, noop: true do |context, tensor, inputs|
+       variable = tensor.inputs[0]
+       raise "variable #{tensor.name} not initalized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
+       variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
+       variable.buffer
+     end
+
      # Fast in place multiply subtract assign
      register_op :apply_gradient_descent do |_context, tensor, inputs|
        _target_var, learning_rate, delta = inputs
 
        assign = tensor.inputs[0] || tensor
 
-       unless assign.buffer
-         value = read_final_result(buffer)
-         assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
-         assign.value = value
-       end
-
        assign.buffer.dirty = true # force buffer copy when variable is read externally
        output_buffer = assign.buffer
 
@@ -297,13 +309,39 @@ module TensorStream
        cl_m = OpenCL::Int1.new(m || 1)
        cl_n = OpenCL::Int1.new(n || 1)
 
-       event_wait_list = [assign.buffer.op, learning_rate.op, delta.op].compact # add dependency wait list
+       event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
        method_call = :"apply_gradient_#{output_buffer.data_type}"
        event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
        output_buffer.op = event
        output_buffer
      end
 
+     # Fast in place multiply subtract assign
+     register_op :apply_momentum do |_context, tensor, inputs|
+       target_var, momentum_var, learning_rate, grad, momentum = inputs
+
+       assign = tensor.inputs[0] || tensor
+       assign_acc = tensor.inputs[1]
+       assign.buffer.dirty = true # force buffer copy when variable is read externally
+       assign_acc.buffer.dirty = true # force buffer copy when variable is read externally
+
+       output_buffer = assign.buffer
+
+       m, n = output_buffer.shape
+       work_group = [m || 1, n || 1]
+       cl_m = OpenCL::Int1.new(m || 1)
+       cl_n = OpenCL::Int1.new(n || 1)
+
+       event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
+       method_call = :"apply_momentum_#{output_buffer.data_type}"
+       event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
+               send(method_call, _opencl_queue, work_group, cl_m, cl_n, grad.cl_buffer,
+                    learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
+                    assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
+       output_buffer.op = event
+       output_buffer
+     end
+
      %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
        register_op op, noop: true do |context, tensor, inputs|
          execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
@@ -332,7 +370,7 @@ module TensorStream
        a = inputs_queue.pop
        until inputs_queue.empty?
          b = inputs_queue.pop
-         event_wait_list = [a.op, b.op].compact
+         event_wait_list = build_event_wait_list([a, b])
          method_call = :"add_#{a.data_type}_#{b.data_type}"
          event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
          a = output_buffer
@@ -353,6 +391,23 @@ module TensorStream
        convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
      end
 
+     register_op :fill, buffer: true do |_context, tensor, inputs|
+       shape = inputs[0]
+       value = inputs[1]
+
+       narray_size = shape.buffer.to_a.reduce(:*) || 1
+       cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
+
+       buffer = if cl_buffer
+                  cl_buffer.buffer
+                else
+                  allocate_narray_for_type(tensor.data_type, narray_size)
+                end
+
+       buffer.fill!(value.buffer[0])
+       convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
+     end
+
      register_op :floor_div, noop: true do |context, tensor, inputs|
        if fp_type?(tensor.data_type)
          execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
@@ -374,8 +429,15 @@ module TensorStream
        v = b.shape[0]
        k = a.shape[1]
 
-       m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
-       n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]
+       if tensor.options[:transpose_a]
+         m = a.shape[1]
+         k = a.shape[0]
+       end
+
+       if tensor.options[:transpose_b]
+         n = b.shape[0]
+         v = b.shape[1]
+       end
 
        result_shape = [m, n]
 
@@ -393,8 +455,8 @@ module TensorStream
 
        transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
        transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
-
-       output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
+       event_wait_list = build_event_wait_list(inputs)
+       output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
        output_buffer
      end
 
@@ -406,14 +468,47 @@ module TensorStream
        cl_m = OpenCL::Int1.new(m || 1)
        cl_n = OpenCL::Int1.new(n || 1)
        work_group = [m || 1, n || 1]
-
-       buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
+       event_wait_list = build_event_wait_list(inputs)
+       buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
        buffer
      else
        a
      end
    end
 
+     register_op :stack do |_context, tensor, inputs|
+       axis = tensor.options[:axis] || 0
+       shape = inputs[0].shape
+       rank = shape.size + 1
+       elem_size = shape.empty? ? 1 : shape.reduce(:*)
+
+       new_shape = [inputs.size]
+       shape.inject(new_shape) { |ns, s| ns << s }
+
+       divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
+         a << s * a.last
+       end.reverse
+
+       axis = rank + axis if axis < 0
+       rotated_shape = Array.new(axis + 1) { new_shape.shift }
+       new_shape = rotated_shape.rotate! + new_shape
+
+       output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
+       multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
+         a << s * a.last
+       end.reverse
+
+       cl_n = OpenCL::Int1.new(elem_size)
+       work_group = [elem_size]
+       event_wait_list = build_event_wait_list(inputs)
+       ops = inputs.each_with_index.map do |input, index|
+         cl_index = OpenCL::Int1.new(index)
+         _cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+       end
+       output_buffer.op = ops
+       output_buffer
+     end
+
      %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil].each do |op|
        register_op op, noop: true do |context, tensor, inputs|
          execute_func(op.to_s, tensor, inputs[0], context)
@@ -422,7 +517,7 @@ module TensorStream
 
      register_op :softmax do |_context, tensor, inputs|
        a = inputs[0]
-       event_wait_list = [a.op].compact
+       event_wait_list = build_event_wait_list(inputs)
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -438,7 +533,7 @@ module TensorStream
 
      register_op :log_softmax do |_context, tensor, inputs|
        a = inputs[0] # logits
-       event_wait_list = [a.op].compact
+       event_wait_list = build_event_wait_list(inputs)
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -452,28 +547,33 @@ module TensorStream
        output_buffer
      end
 
-     register_op :softmax_cross_entropy_with_logits_v2 do |_context, tensor, inputs|
+     register_op :softmax_cross_entropy_with_logits_v2 do |context, tensor, inputs|
        a = inputs[0] # logits
        b = inputs[1] # labels
-       event_wait_list = [a.op, b.op].compact
+       event_wait_list = build_event_wait_list(inputs)
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
-
+       output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
+       rank = a.shape.size - 1
        m, n = a.shape
        work_group = [m]
        n = m if n.nil?
        cl_n = OpenCL::Int1.new(n || 1)
 
-       event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+       event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer,
+                                                               output_buffer.cl_buffer, output_buffer_backprop.cl_buffer, event_wait_list: event_wait_list)
        output_buffer.op = event
-       output_buffer
+       output_buffer_backprop.op = event
+
+       loss = reduction(context, tensor, output_buffer, rank, :sum)
+       OutputGroup.new([loss, output_buffer_backprop], [tensor.inputs[0].data_type, tensor.inputs[0].data_type])
      end
 
      register_op :softmax_cross_entropy_with_logits_v2_grad do |_context, tensor, inputs|
        a = inputs[0] # logits
        b = inputs[1] # labels
        c = inputs[2] # grads
-       event_wait_list = [a.op, b.op, c.op].compact
+       event_wait_list = build_event_wait_list(inputs)
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -490,7 +590,7 @@ module TensorStream
      register_op :softmax_grad do |_context, tensor, inputs|
        a, grad = inputs
 
-       event_wait_list = [a.op].compact
+       event_wait_list = build_event_wait_list(inputs)
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -508,7 +608,7 @@ module TensorStream
        name = tensor.options[:name]
 
        a.buffer.each do |input|
-         raise "#{name} Invalid Argument" if input.nan? || input.infinite?
+         raise TensorStream::InvalidArgumentError, "#{name} Invalid Argument" if input.nan? || input.infinite?
        end
        a
      end
@@ -522,8 +622,8 @@ module TensorStream
        input_a = read_final_result(complete_eval(a, context))
        input_b = read_final_result(complete_eval(b, context))
        b_a, b_b = broadcast(input_a, input_b)
-       [ wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
-         wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
+       [wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
+        wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
      end
    end
 
@@ -557,8 +657,22 @@ module TensorStream
 
      register_op :transpose, buffer: true do |_context, tensor, inputs|
        t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
-       transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
-       convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
+
+       if inputs[0].shape.size == 2 && inputs[1].nil?
+         transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
+         res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
+         res
+       else
+         rank = inputs[0].shape.size
+         perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
+         new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
+         output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
+         transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
+
+         write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
+         output_buffer.op = write_op
+         output_buffer
+       end
      end
 
      register_op :index, noop: true do |context, tensor, inputs|
@@ -567,39 +681,36 @@ module TensorStream
 
        if a.is_a?(OutputGroup)
          a.outputs[index]
+       elsif a.is_a?(Array)
+         a[index]
        else
-         if a.is_a?(Array)
-           a[index]
-         else
-           new_shape = a.shape.dup
-           new_shape.shift
-           input_a = read_final_result(a)
-           convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
-         end
+         new_shape = a.shape.dup
+         new_shape.shift
+         input_a = read_final_result(a)
+         convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
        end
      end
 
      register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
        rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
-       OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: "#{tensor.name}"), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")])
+       OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: tensor.name), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")], tensor.inputs.map(&:data_type))
      end
 
      register_op :shape do |_context, tensor, inputs|
        wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.data_type)
      end
 
-     register_op :reshape, buffer: true do |_context, _tensor, inputs|
+     register_op :reshape, buffer: true do |_context, tensor, inputs|
        arr = inputs[0]
        new_shape = read_final_result(inputs[1])
 
-       if new_shape.size.zero? && arr.buffer.size == 1
-         arr.shape = new_shape
-         arr
-       else
-         new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
-         arr.shape = new_shape
-         arr
-       end
+       shape = if new_shape.size.zero? && arr.buffer.size == 1
+                 new_shape
+               else
+                 TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
+               end
+
+       convert_to_opencl(arr.buffer, shape, data_type: arr.data_type, name: tensor.name)
      end
 
      register_op :flow_group do |_context, _tensor, inputs|
@@ -618,6 +729,7 @@ module TensorStream
 
      register_op :prod, noop: true do |context, tensor, inputs|
        input_a = complete_eval(inputs[0], context)
+
        if input_a.buffer.empty?
          convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
        else
@@ -646,13 +758,11 @@ module TensorStream
      end
 
      def eval_operation(tensor, child_context)
-
        cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
        return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
        return @context[cache_key] if @context.key?(cache_key)
-       # puts tensor.name
+       # puts "opencl: #{tensor.name}"
        invoke(tensor, child_context).tap do |result|
-         # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
          if tensor.breakpoint
            a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
            b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -676,9 +786,11 @@ module TensorStream
          @context[:_cache][cache_key] = result if tensor.is_const
        end
      rescue EvaluatorExcecutionException => e
-       raise e
+       _opencl_queue.finish # dump queue
+       raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
      rescue TensorStreamError => e
-       raise e
+       _opencl_queue.finish # dump queue
+       raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
      rescue StandardError => e
        _opencl_queue.finish # dump queue
        puts e.message
@@ -698,7 +810,7 @@ module TensorStream
        # File.write('/home/jedld/workspace/tensor_stream/samples/error.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
 
        # File.write('/Users/josephemmanueldayo/workspace/gradients.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
-       raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true,1)} defined at #{tensor.source}"
+       raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
      end
 
      def eval_tensor(tensor, child_context)
@@ -724,8 +836,9 @@ module TensorStream
 
        if assign.buffer
          # buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
+         event_wait_list = build_event_wait_list([buffer, assign.buffer])
          assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
-                              _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: [buffer.op, assign.buffer.op])
+                              _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: event_wait_list)
                             else
                               buffer.op
                             end
@@ -745,7 +858,6 @@ module TensorStream
        dtype = tensor.data_type
        result_shape = TensorShape.infer_shape(a.shape, b.shape)
        return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]
-
        output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
        a, b, prog, switch_operands = select_program(a, b, op_name)
        m, n = result_shape
@@ -754,21 +866,26 @@ module TensorStream
        cl_n = OpenCL::Int1.new(n || 1)
        cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition
 
-       event_wait_list = [a.op, b.op].compact # add dependency wait list
+       event_wait_list = build_event_wait_list([a, b]) # add dependency wait list
 
        method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
+       prog_name ||= op_name
        event = if prog == "#{op_name}_b"
-                 cl_m_b, cl_n_b = if b.shape.size == 2
-                   [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
-                 elsif b.shape.size == 1
-                   [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
-                 else
-                   raise "rank > 2 not supported!"
-                 end
-                 _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
-               else
-                 _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
-               end
+                 cl_m_b, cl_n_b = if b.shape.size == 2
+                                    [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
+                                  elsif b.shape.size == 1
+                                    [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
+                                  else
+                                    raise "rank > 2 not supported!"
+                                  end
+                 _cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype).
+                   send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b,
+                        cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+               else
+                 _cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype).
+                   send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch,
+                        a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+               end
 
        output_buffer.op = event
        output_buffer
@@ -789,14 +906,14 @@ module TensorStream
        cl_m = OpenCL::Int1.new(m || 1)
        cl_n = OpenCL::Int1.new(n || 1)
 
-       event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
+       event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
        output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
        output_buffer
      end
 
      def execute_func(op_name, tensor, a, child_context)
        a = _run(a, child_context)
-       event_wait_list = [a.op].compact
+       event_wait_list = build_event_wait_list([a])
        dtype = tensor.data_type
        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -814,7 +931,7 @@ module TensorStream
        return [a, b] if a.data_type == b.data_type
        m, n = b.shape
        work_group = [m || 1, n || 1]
-       event_wait_list = [b.op].compact
+       event_wait_list = build_event_wait_list([b])
        buffer = _create_result_buffer(b.data_type, b.shape, name)
 
        cl_m = OpenCL::Int1.new(m || 1)
@@ -848,6 +965,11 @@ module TensorStream
        convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
      end
 
+     def get_cached_buffer(name, shape)
+       cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
+       @context[:_cache][cache_key]
+     end
+
      def convert_to_opencl(value, shape, data_type: nil, name: nil)
        value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)
 
@@ -863,6 +985,8 @@ module TensorStream
          allocate_narray_for_type(data_type, narray_size)
        end
 
+       return nil if buffer.nil?
+
        cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
 
        cl_buffer = unless value.flatten.empty?
@@ -908,18 +1032,20 @@ module TensorStream
          NArray.sint(narray_size)
        when :boolean
          NArray.sint(narray_size)
+       when :unknown
+         nil
        else
          raise "unsupported type #{data_type}"
        end
      end
 
      def _create_result_buffer(data_type, shape, name)
-       return OpenCLBuffer.new(data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
+       return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
        @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}:#{object_id}"] ||= begin
          size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
          buffer = allocate_narray_for_type(data_type, size)
          cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
-         OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+         OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
        end
      end
 
@@ -969,7 +1095,7 @@ module TensorStream
 
      def reduction(child_context, tensor, a, b, func)
        input = complete_eval(a, child_context)
-       axis = read_final_result(complete_eval(b, child_context))
+       axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b
        if axis.nil?
          red = input.buffer.send(func)
          convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
@@ -1021,6 +1147,10 @@ module TensorStream
        shape.is_a?(Array) ? shape.size : 0
      end
 
+     def build_event_wait_list(inputs)
+       inputs.compact.map(&:op).flatten
+     end
+
      def resolve_placeholder(placeholder, _execution_context = {})
        return nil if placeholder.nil?