tensor_stream 0.7.0 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +5 -5
  2. data/.rubocop.yml +6 -1
  3. data/CHANGELOG.md +10 -0
  4. data/README.md +35 -0
  5. data/lib/tensor_stream.rb +2 -2
  6. data/lib/tensor_stream/debugging/debugging.rb +2 -1
  7. data/lib/tensor_stream/dynamic_stitch.rb +23 -24
  8. data/lib/tensor_stream/evaluator/base_evaluator.rb +27 -18
  9. data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +16 -0
  10. data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +24 -0
  11. data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +6 -1
  12. data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +6 -6
  13. data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +237 -107
  14. data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +97 -7
  15. data/lib/tensor_stream/evaluator/ruby_evaluator.rb +230 -123
  16. data/lib/tensor_stream/exceptions.rb +1 -0
  17. data/lib/tensor_stream/graph_builder.rb +2 -3
  18. data/lib/tensor_stream/graph_deserializers/protobuf.rb +22 -23
  19. data/lib/tensor_stream/graph_serializers/graphml.rb +26 -29
  20. data/lib/tensor_stream/graph_serializers/pbtext.rb +22 -19
  21. data/lib/tensor_stream/helpers/string_helper.rb +4 -5
  22. data/lib/tensor_stream/math_gradients.rb +141 -77
  23. data/lib/tensor_stream/nn/nn_ops.rb +4 -6
  24. data/lib/tensor_stream/operation.rb +139 -120
  25. data/lib/tensor_stream/ops.rb +36 -3
  26. data/lib/tensor_stream/session.rb +7 -11
  27. data/lib/tensor_stream/tensor.rb +3 -3
  28. data/lib/tensor_stream/tensor_shape.rb +5 -0
  29. data/lib/tensor_stream/train/gradient_descent_optimizer.rb +4 -37
  30. data/lib/tensor_stream/train/momentum_optimizer.rb +48 -0
  31. data/lib/tensor_stream/train/optimizer.rb +129 -0
  32. data/lib/tensor_stream/train/saver.rb +0 -1
  33. data/lib/tensor_stream/train/slot_creator.rb +62 -0
  34. data/lib/tensor_stream/train/utils.rb +11 -12
  35. data/lib/tensor_stream/trainer.rb +3 -0
  36. data/lib/tensor_stream/utils.rb +18 -11
  37. data/lib/tensor_stream/variable.rb +19 -12
  38. data/lib/tensor_stream/variable_scope.rb +1 -1
  39. data/lib/tensor_stream/version.rb +1 -1
  40. data/samples/iris.rb +2 -1
  41. data/samples/linear_regression.rb +3 -1
  42. data/samples/nearest_neighbor.rb +2 -0
  43. data/test_samples/neural_network_raw.py +101 -0
  44. data/test_samples/raw_neural_net_sample.rb +6 -4
  45. data/test_samples/test2.py +73 -27
  46. metadata +9 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: d42a81e850271f080d408c52f2bea15a07c6d41ee3c6790dc04e48f2ab485364
4
- data.tar.gz: a4aedfd3c9a532f31ea195c58124644fcae143726d37daa7a4a6afc6b39f439b
2
+ SHA1:
3
+ metadata.gz: f84c2b9852fcf4931c47c0130b67497a50a87b0f
4
+ data.tar.gz: 524e1105da4e06e3472cbcfa0e6f764ae4512d37
5
5
  SHA512:
6
- metadata.gz: 7d9fff1a8af14878c50469cfcee7942d2800906fe388504261290958628861b29d973ea65a3cf986cd1657acf490d2bc7164ec9ec2c14dc7c5d6c25121c6737f
7
- data.tar.gz: 42ef6af8fafd1a7f7f069e03f8c344bec87bd15217def07d859d4b33374a038e1b7ed54ac685b901d3ed9b51fd351300cc61553ca45f0591fc4a7c5e50bcee53
6
+ metadata.gz: 420e2675ab67d4c8462534bdf8c703671656f7852d984579e22ee57f1425dd5740fcb64a1e52363bf337cd7c691d87a75c76bd868b13c8a7f06d78e0eb00aa73
7
+ data.tar.gz: 24fe1022741883d46cdd5af51309da33d421d72874f0cc84bf2e0ed14a62602f1830c6060bd86e42359b7962b4a57727c9a48ce13d5950d5ba02f6a9cdfd719f
data/.rubocop.yml CHANGED
@@ -6,6 +6,10 @@ AllCops:
6
6
  - tensor_stream.gemspec
7
7
  - Rakefile
8
8
 
9
+ Naming/AccessorMethodName:
10
+ Exclude:
11
+ - lib/tensor_stream/utils.rb
12
+
9
13
  Style/StringLiterals:
10
14
  Enabled: false
11
15
 
@@ -81,4 +85,5 @@ Style/TrailingCommaInHashLiteral:
81
85
 
82
86
  Naming/UncommunicativeMethodParamName:
83
87
  Exclude:
84
- - lib/tensor_stream/evaluator/ruby_evaluator.rb
88
+ - lib/tensor_stream/evaluator/ruby_evaluator.rb
89
+ - lib/tensor_stream/ops.rb
data/CHANGELOG.md CHANGED
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
6
 
7
+ ## [0.8.0] - 2018-08-29
8
+ ### Added
9
+ - [TRAINING] Added a new supported optimizer, MomentumOptimizer, loosely based on TensorFlow's implementation (with Nesterov support)
10
+ - [NEW OP] fill, stack, atan, cumprod, gather, invert_permutation, setdiff1d
11
+
12
+ ### Fixes
13
+ - Fixed the device delegator, which did not pick the correct evaluator in some cases
14
+ - [GRADIENTS] Properly implement gradient computation for prod, tile, transpose
15
+ - Fixed gradient computation for softmax_cross_entropy_with_logits_v2 (now based on tensorflow's implementation)
16
+
7
17
  ## [0.7.0] - 2018-08-08
8
18
  ### Added
9
19
  - [NEW OP] expand_dims, min, acos, asin, add_n
data/README.md CHANGED
@@ -324,6 +324,41 @@ result = a + b
324
324
  File.write("model.pbtext", result.graph.as_graph_def)
325
325
  ```
326
326
 
327
+ ## Performance notes
328
+
329
+ Comparative performance benchmarks against other Ruby libraries have not yet been performed. However, it is
330
+ notable that TruffleRuby and ruby-2.6.0-preview2 with the --jit flag perform considerably better with respect
331
+ to previous versions of Ruby (< 2.6).
332
+
333
+ Benchmarks running samples/linear_regression.rb on an Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz
334
+
335
+ ruby 2.4
336
+
337
+ ```
338
+ $ ruby -v
339
+ ruby 2.4.0p0 (2016-12-24 revision 57164) [x86_64-linux]
340
+ $ ruby samples/linear_regression.rb
341
+ 495 seconds 1000 epochs
342
+ ```
343
+
344
+ ruby 2.6.0-preview2
345
+
346
+ ```
347
+ $ ruby -v
348
+ ruby 2.6.0preview2 (2018-05-31 trunk 63539) [x86_64-linux]
349
+ $ ruby --jit samples/linear_regression.rb
350
+ 394 seconds 10000 epochs
351
+ ```
352
+
353
+ truffleruby
354
+ ```
355
+ $ ruby -v
356
+ truffleruby 1.0.0-rc5, like ruby 2.4.4, GraalVM CE Native [x86_64-linux]
357
+ 219 seconds 10000 epochs
358
+ ```
359
+
360
+ For training large networks that work on images, the OpenCL evaluator is the only way to go.
361
+
327
362
  ## Roadmap
328
363
 
329
364
  - Docs
data/lib/tensor_stream.rb CHANGED
@@ -20,8 +20,6 @@ require 'tensor_stream/operation'
20
20
  require 'tensor_stream/placeholder'
21
21
  require 'tensor_stream/control_flow'
22
22
  require 'tensor_stream/dynamic_stitch'
23
- require 'tensor_stream/train/utils'
24
- require 'tensor_stream/trainer'
25
23
  require 'tensor_stream/nn/nn_ops'
26
24
  require 'tensor_stream/evaluator/evaluator'
27
25
  require 'tensor_stream/graph_serializers/serializer'
@@ -31,6 +29,8 @@ require 'tensor_stream/graph_serializers/graphml'
31
29
  require 'tensor_stream/math_gradients'
32
30
  require "tensor_stream/debugging/debugging"
33
31
  require 'tensor_stream/utils'
32
+ require 'tensor_stream/train/utils'
33
+ require 'tensor_stream/trainer'
34
34
 
35
35
  # require 'tensor_stream/libraries/layers'
36
36
  require 'tensor_stream/monkey_patches/integer'
@@ -9,8 +9,9 @@ module TensorStream
9
9
  nodes_to_process.each do |node|
10
10
  node.inputs = node.inputs.collect do |input|
11
11
  next if input.nil?
12
+ next input if input.is_a?(Variable)
12
13
 
13
- if TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
14
+ if input.is_a?(Tensor) && TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
14
15
  TensorStream.check_numerics(input, "#{node.name}/#{input.name}", name: "check/#{node.name}/#{input.name}" )
15
16
  else
16
17
  input
@@ -1,28 +1,27 @@
1
1
  module TensorStream
2
- # Defines a TensorStream controlflow op
3
- class DynamicStitch < Operation
4
- attr_accessor :ops
5
-
6
- def initialize(flow_type, inputs, ops = nil, options = {})
7
- setup_initial_state(options)
8
-
9
- @operation = :"flow_#{flow_type}"
10
- @inputs = inputs
2
+ # Defines a TensorStream controlflow op
3
+ class DynamicStitch < Operation
4
+ attr_accessor :ops
11
5
 
12
- @data_type = Tensor.detect_type(inputs[1])
13
- @name = [@graph.get_name_scope, options[:name] || set_name].compact.join('/')
14
- @ops = ops
15
- @shape = TensorShape.new([inputs.size])
16
- @graph.add_node(self)
17
- end
18
-
19
- def set_data_type(_passed_data_type)
20
- :unknown
21
- end
22
-
23
- def run
24
- eval
25
- end
6
+ def initialize(flow_type, inputs, ops = nil, options = {})
7
+ setup_initial_state(options)
8
+
9
+ @operation = :"flow_#{flow_type}"
10
+ @inputs = inputs
11
+
12
+ @data_type = Tensor.detect_type(inputs[1])
13
+ @name = [@graph.get_name_scope, options[:name] || set_name].compact.join('/')
14
+ @ops = ops
15
+ @shape = TensorShape.new([inputs.size])
16
+ @graph.add_node(self)
17
+ end
18
+
19
+ def set_data_type(_passed_data_type)
20
+ :unknown
21
+ end
22
+
23
+ def run
24
+ eval
26
25
  end
27
26
  end
28
-
27
+ end
@@ -2,13 +2,14 @@ module TensorStream
2
2
  # Evaluator base module
3
3
  module Evaluator
4
4
  class OutputGroup
5
- attr_accessor :outputs
6
- def initialize(outputs = [])
5
+ attr_accessor :outputs, :data_types
6
+ def initialize(outputs = [], data_types = [])
7
7
  @outputs = outputs
8
+ @data_types = data_types
8
9
  end
9
10
  end
10
11
 
11
- class UnsupportedOp < Exception
12
+ class UnsupportedOp < RuntimeError
12
13
  def initialize(tensor)
13
14
  @tensor = tensor
14
15
  end
@@ -111,22 +112,13 @@ module TensorStream
111
112
 
112
113
  resolved_inputs = tensor.inputs.map do |i|
113
114
  next if i.nil?
115
+ next i if op_options[:noop]
114
116
 
115
117
  if i.is_a?(Array)
116
- next i.collect { |sub_item| sub_item.is_a?(Tensor) ? invoke(sub_item, execution_context) : sub_item }
118
+ next i.collect { |sub_item| sub_item.is_a?(Tensor) ? global_eval(tensor, sub_item, execution_context) : sub_item }
117
119
  end
118
120
 
119
- if !op_options[:noop] && @context[:_cache][:placement][tensor.name] != @context[:_cache][:placement][i.name] # tensor is on another device or evaluator
120
- cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
121
- next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
122
-
123
- result = @session.delegate_to_evaluator(i, @context, execution_context)
124
- convert_from_buffer(i, result).tap do |buffer|
125
- @context[:_cache][cache_key] = buffer if i.is_const
126
- end
127
- else
128
- prepare_input(i, execution_context, op_options)
129
- end
121
+ global_eval(tensor, i, execution_context, op_options)
130
122
  end
131
123
 
132
124
  instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
@@ -134,6 +126,23 @@ module TensorStream
134
126
 
135
127
  protected
136
128
 
129
+ def global_eval(tensor, input, execution_context, op_options = {})
130
+ return nil unless input
131
+ return input unless input.is_a?(Tensor)
132
+
133
+ if object_id != @context[:_cache][:placement][input.name][1].object_id # tensor is on another device or evaluator
134
+ cache_key = "#{tensor.graph.object_id}_#{input.name}:#{object_id}"
135
+ return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
136
+
137
+ result = @session.delegate_to_evaluator(input, @context, execution_context)
138
+ convert_from_buffer(input, result).tap do |buffer|
139
+ @context[:_cache][cache_key] = buffer if input.is_const
140
+ end
141
+ else
142
+ prepare_input(input, execution_context, op_options)
143
+ end
144
+ end
145
+
137
146
  def get_broadcast_gradient_args(input_a, input_b)
138
147
  return [[], []] if input_a == input_b
139
148
 
@@ -153,16 +162,16 @@ module TensorStream
153
162
  end
154
163
  end
155
164
 
156
- [input_a_args.reverse, input_b_args.reverse]
165
+ [input_a_args.reverse, input_b_args.reverse]
157
166
  end
158
167
 
159
168
  ##
160
169
  # converts from a ruby Buffer object to the evaluator's native buffer format
161
- def convert_from_buffer(tensor, result)
170
+ def convert_from_buffer(_tensor, _result)
162
171
  raise "need implementation"
163
172
  end
164
173
 
165
- def prepare_input(tensor, context, options = {})
174
+ def prepare_input(_tensor, _context, _options = {})
166
175
  raise "need implementation"
167
176
  end
168
177
  end
@@ -0,0 +1,16 @@
1
+ % c_dtype = dtype_to_c_type(dtype)
2
+ // same dimension add floating point op
3
+ __kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
4
+ __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
5
+ // Get the index of the current element to be processed
6
+ const int globalRow = get_global_id(0); // Row ID of C (0..M)
7
+ const int globalCol = get_global_id(1); // Col ID of C (0..N)
8
+ const int index = globalRow * N + globalCol;
9
+ <%= c_dtype %> acc_m = acc[index];
10
+ acc[index] = acc_m * momentum[0] + grad[index];
11
+ <% if nesterov %>
12
+ output[index] -= grad[index] * learning_rate[0] + acc_m * momentum[0] * learning_rate[0];
13
+ <% else %>
14
+ output[index] -= acc_m * learning_rate[0];
15
+ <% end %>
16
+ }
@@ -0,0 +1,24 @@
1
+ % ctype = dtype_to_c_type(data_type)
2
+
3
+ __kernel void pack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
4
+ // Get the index of the current element to be processed
5
+ const int globalCol = get_global_id(0); // Col ID of C (0..N)
6
+
7
+ int start = index * <%= divisors[0] %>;
8
+ int ptr = start + globalCol;
9
+ int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
10
+
11
+ // compute effective coordinates
12
+ <% divisors.each_with_index do |div, index| %>
13
+ index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
14
+
15
+ // Apply axis translation if needed
16
+ <% if axis > 0 %>
17
+ int first = index_map[0];
18
+ <% axis.times do |i| %>
19
+ index_map[<%= i %>] = index_map[<%= (i + 1) %>];<% end %>
20
+ index_map[<%= axis %>] = first;
21
+ <% end%>
22
+
23
+ C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
24
+ }
@@ -1,9 +1,10 @@
1
+
1
2
  // First naive implementation
2
3
  % c_dtype = dtype_to_c_type(dtype)
3
4
  __kernel void softmax_cross_<%= dtype %>(const int N,
4
5
  const __global <%= c_dtype %>* A,
5
6
  const __global <%= c_dtype %>* L,
6
- __global <%= c_dtype %>* C) {
7
+ __global <%= c_dtype %>* C, __global <%= c_dtype %>* P) {
7
8
 
8
9
  // Get the index of the current element to be processed
9
10
  const int globalRow = get_global_id(0); // Row ID of C (0..M)
@@ -24,4 +25,8 @@ __kernel void softmax_cross_<%= dtype %>(const int N,
24
25
  for (int k=0; k < N; k++) {
25
26
  C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
26
27
  }
28
+
29
+ for (int k=0; k < N; k++) {
30
+ P[globalRow*N + k] = (exp(A[globalRow*N + k] - max) / acc) - L[globalRow*N + k];
31
+ }
27
32
  }
@@ -16,10 +16,6 @@ module TensorStream
16
16
 
17
17
  def to_ruby
18
18
  return [] if buffer.empty?
19
- if shape.empty?
20
- return buffer[0] != 0 if data_type == :boolean
21
- return buffer[0]
22
- end
23
19
 
24
20
  if dirty
25
21
  op.command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
@@ -27,9 +23,13 @@ module TensorStream
27
23
  self.dirty = false
28
24
  end
29
25
 
26
+ if shape.empty?
27
+ return buffer[0] != 0 if data_type == :boolean
28
+ return buffer[0]
29
+ end
30
+
30
31
  result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
31
- result = process_function_op(result, ->(a, _b) { a != 0 }) if data_type == :boolean
32
- result
32
+ data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
33
33
  end
34
34
  end
35
35
  end
@@ -30,6 +30,7 @@ module TensorStream
30
30
  ## PURE ruby evaluator used for testing and development
31
31
  class OpenclEvaluator < BaseEvaluator
32
32
  attr_accessor :retain
33
+ attr_reader :opencl_device
33
34
 
34
35
  include TensorStream::OpHelper
35
36
  include TensorStream::ArrayOpsHelper
@@ -51,20 +52,20 @@ module TensorStream
51
52
 
52
53
  def self.fetch_device(query = [])
53
54
  devices = query_devices_with_score
54
- platform_devices = devices.select { |d| d[0].platform.to_s.gsub(' ','_').downcase =~ /#{query[0].downcase}/ }
55
+ platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
55
56
  opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
56
57
  end
57
58
 
58
59
  def self.opencl_to_device(d)
59
60
  device = d[0]
60
61
  index = d[3]
61
- platform_name = device.platform.name.gsub(' ', '_').downcase
62
+ platform_name = device.platform.name.tr(' ', '_').downcase
62
63
  uri = [platform_name, index].join(':')
63
64
 
64
65
  device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
65
66
 
66
- OpenclDevice.new(uri, device_type, self).tap do |d|
67
- d.native_device = device
67
+ OpenclDevice.new(uri, device_type, self).tap do |devide|
68
+ devide.native_device = device
68
69
  end
69
70
  end
70
71
 
@@ -96,8 +97,14 @@ module TensorStream
96
97
  end
97
98
  end
98
99
 
100
+ # buffer comes from non-opencl evaluator
99
101
  def convert_from_buffer(tensor, result)
100
- convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
102
+ if result.buffer.is_a?(TensorStream::Evaluator::OutputGroup)
103
+ converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map { |output, data_type| convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name) }
104
+ TensorStream::Evaluator::OutputGroup.new(converted_outputs, result.buffer.data_types)
105
+ else
106
+ convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
107
+ end
101
108
  end
102
109
 
103
110
  def complete_eval(tensor, context)
@@ -106,7 +113,7 @@ module TensorStream
106
113
  if buffer.is_a?(Array)
107
114
  buffer = buffer.collect do |b|
108
115
  next b if b.buffer.size.zero?
109
- _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: [b.op].compact)
116
+ _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
110
117
  b
111
118
  end
112
119
  else
@@ -114,14 +121,30 @@ module TensorStream
114
121
  return buffer if buffer.nil?
115
122
  return [] if buffer.buffer.nil?
116
123
  return buffer if buffer.buffer.size.zero?
117
- _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
124
+ _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
118
125
  end
119
126
  _opencl_queue.finish
120
127
  buffer
121
128
  end
122
129
 
123
- def opencl_device
124
- @opencl_device
130
+ def self.query_devices_with_score
131
+ OpenCL.platforms.flat_map do |p|
132
+ p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
133
+ score = 0
134
+ if d.type.to_s == 'CPU'
135
+ score += 1
136
+ elsif d.type.to_s == 'GPU'
137
+ score += 4
138
+ end
139
+
140
+ score += 1000 if d.platform.name == 'NVIDIA CUDA'
141
+
142
+ score += d.max_compute_units
143
+ score += d.max_clock_frequency
144
+
145
+ [d, score, p.name, index]
146
+ end
147
+ end
125
148
  end
126
149
 
127
150
  protected
@@ -152,31 +175,9 @@ module TensorStream
152
175
  @opencl_context = OpenCL.create_context(opencl_device)
153
176
  end
154
177
 
155
- def self.query_devices_with_score
156
- OpenCL.platforms.flat_map do |p|
157
-
158
- p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
159
- score = 0
160
- if d.type.to_s == 'CPU'
161
- score += 1
162
- elsif d.type.to_s == 'GPU'
163
- score += 4
164
- end
165
-
166
- if d.platform.name == 'NVIDIA CUDA'
167
- score += 1000
168
- end
169
-
170
- score += d.max_compute_units
171
- score += d.max_clock_frequency
172
-
173
- [d, score, p.name, index]
174
- end
175
- end
176
- end
177
-
178
178
  def create_command_queue
179
179
  supported_proprties = opencl_device.queue_properties.names
180
+
180
181
  properties = []
181
182
  properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
182
183
  properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
@@ -196,7 +197,7 @@ module TensorStream
196
197
  end
197
198
 
198
199
  def _cl_program(kernel, args = {})
199
- suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
200
+ suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
200
201
  @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
201
202
  filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
202
203
  raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
@@ -211,6 +212,13 @@ module TensorStream
211
212
  end
212
213
  end
213
214
 
215
+ def escape_arg_content(value)
216
+ return value.tr(' ','_') if value.is_a?(String)
217
+ return value.join('-') if value.is_a?(Array)
218
+
219
+ value
220
+ end
221
+
214
222
  def _run(tensor, execution_context)
215
223
  return tensor if tensor.is_a?(OpenCLBuffer)
216
224
  return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array) && !tensor.size.empty? && tensor[0].is_a?(Tensor)
@@ -236,7 +244,7 @@ module TensorStream
236
244
  res
237
245
  end
238
246
 
239
- def eval_variable(tensor, child_context)
247
+ def eval_variable(tensor, _child_context)
240
248
  raise "variable #{tensor.name} not initalized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
241
249
  tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
242
250
  tensor.buffer
@@ -259,7 +267,10 @@ module TensorStream
259
267
  end
260
268
  end
261
269
 
262
- register_op :identity do |_context, _tensor, inputs|
270
+ register_op :identity do |context, tensor, inputs|
271
+ if tensor.inputs.size > 1
272
+ tensor.inputs[1..inputs.size].each { |input| complete_eval(input, context) }
273
+ end
263
274
  inputs[0]
264
275
  end
265
276
 
@@ -277,18 +288,19 @@ module TensorStream
277
288
  assign_var(tensor, value, context)
278
289
  end
279
290
 
291
+ register_op :variable, noop: true do |context, tensor, inputs|
292
+ variable = tensor.inputs[0]
293
+ raise "variable #{tensor.name} not initalized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
294
+ variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
295
+ variable.buffer
296
+ end
297
+
280
298
  # Fast in place multiply subtract assign
281
299
  register_op :apply_gradient_descent do |_context, tensor, inputs|
282
300
  _target_var, learning_rate, delta = inputs
283
301
 
284
302
  assign = tensor.inputs[0] || tensor
285
303
 
286
- unless assign.buffer
287
- value = read_final_result(buffer)
288
- assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
289
- assign.value = value
290
- end
291
-
292
304
  assign.buffer.dirty = true # force buffer copy when variable is read externally
293
305
  output_buffer = assign.buffer
294
306
 
@@ -297,13 +309,39 @@ module TensorStream
297
309
  cl_m = OpenCL::Int1.new(m || 1)
298
310
  cl_n = OpenCL::Int1.new(n || 1)
299
311
 
300
- event_wait_list = [assign.buffer.op, learning_rate.op, delta.op].compact # add dependency wait list
312
+ event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
301
313
  method_call = :"apply_gradient_#{output_buffer.data_type}"
302
314
  event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
303
315
  output_buffer.op = event
304
316
  output_buffer
305
317
  end
306
318
 
319
+ # Fast in place multiply subtract assign
320
+ register_op :apply_momentum do |_context, tensor, inputs|
321
+ target_var, momentum_var, learning_rate, grad, momentum = inputs
322
+
323
+ assign = tensor.inputs[0] || tensor
324
+ assign_acc = tensor.inputs[1]
325
+ assign.buffer.dirty = true # force buffer copy when variable is read externally
326
+ assign_acc.buffer.dirty = true # force buffer copy when variable is read externally
327
+
328
+ output_buffer = assign.buffer
329
+
330
+ m, n = output_buffer.shape
331
+ work_group = [m || 1, n || 1]
332
+ cl_m = OpenCL::Int1.new(m || 1)
333
+ cl_n = OpenCL::Int1.new(n || 1)
334
+
335
+ event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
336
+ method_call = :"apply_momentum_#{output_buffer.data_type}"
337
+ event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
338
+ send(method_call, _opencl_queue, work_group, cl_m, cl_n, grad.cl_buffer,
339
+ learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
340
+ assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
341
+ output_buffer.op = event
342
+ output_buffer
343
+ end
344
+
307
345
  %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
308
346
  register_op op, noop: true do |context, tensor, inputs|
309
347
  execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
@@ -332,7 +370,7 @@ module TensorStream
332
370
  a = inputs_queue.pop
333
371
  until inputs_queue.empty?
334
372
  b = inputs_queue.pop
335
- event_wait_list = [a.op, b.op].compact
373
+ event_wait_list = build_event_wait_list([a, b])
336
374
  method_call = :"add_#{a.data_type}_#{b.data_type}"
337
375
  event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
338
376
  a = output_buffer
@@ -353,6 +391,23 @@ module TensorStream
353
391
  convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
354
392
  end
355
393
 
394
+ register_op :fill, buffer: true do |_context, tensor, inputs|
395
+ shape = inputs[0]
396
+ value = inputs[1]
397
+
398
+ narray_size = shape.buffer.to_a.reduce(:*) || 1
399
+ cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
400
+
401
+ buffer = if cl_buffer
402
+ cl_buffer.buffer
403
+ else
404
+ allocate_narray_for_type(tensor.data_type, narray_size)
405
+ end
406
+
407
+ buffer.fill!(value.buffer[0])
408
+ convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
409
+ end
410
+
356
411
  register_op :floor_div, noop: true do |context, tensor, inputs|
357
412
  if fp_type?(tensor.data_type)
358
413
  execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
@@ -374,8 +429,15 @@ module TensorStream
374
429
  v = b.shape[0]
375
430
  k = a.shape[1]
376
431
 
377
- m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
378
- n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]
432
+ if tensor.options[:transpose_a]
433
+ m = a.shape[1]
434
+ k = a.shape[0]
435
+ end
436
+
437
+ if tensor.options[:transpose_b]
438
+ n = b.shape[0]
439
+ v = b.shape[1]
440
+ end
379
441
 
380
442
  result_shape = [m, n]
381
443
 
@@ -393,8 +455,8 @@ module TensorStream
393
455
 
394
456
  transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
395
457
  transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
396
-
397
- output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
458
+ event_wait_list = build_event_wait_list(inputs)
459
+ output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
398
460
  output_buffer
399
461
  end
400
462
 
@@ -406,14 +468,47 @@ module TensorStream
406
468
  cl_m = OpenCL::Int1.new(m || 1)
407
469
  cl_n = OpenCL::Int1.new(n || 1)
408
470
  work_group = [m || 1, n || 1]
409
-
410
- buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
471
+ event_wait_list = build_event_wait_list(inputs)
472
+ buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
411
473
  buffer
412
474
  else
413
475
  a
414
476
  end
415
477
  end
416
478
 
479
+ register_op :stack do |_context, tensor, inputs|
480
+ axis = tensor.options[:axis] || 0
481
+ shape = inputs[0].shape
482
+ rank = shape.size + 1
483
+ elem_size = shape.empty? ? 1 : shape.reduce(:*)
484
+
485
+ new_shape = [inputs.size]
486
+ shape.inject(new_shape) { |ns, s| ns << s }
487
+
488
+ divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
489
+ a << s * a.last
490
+ end.reverse
491
+
492
+ axis = rank + axis if axis < 0
493
+ rotated_shape = Array.new(axis + 1) { new_shape.shift }
494
+ new_shape = rotated_shape.rotate! + new_shape
495
+
496
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
497
+ multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
498
+ a << s * a.last
499
+ end.reverse
500
+
501
+ cl_n = OpenCL::Int1.new(elem_size)
502
+ work_group = [elem_size]
503
+ event_wait_list = build_event_wait_list(inputs)
504
+ ops = inputs.each_with_index.map do |input, index|
505
+ cl_index = OpenCL::Int1.new(index)
506
+ _cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
507
+ end
508
+ output_buffer.op = ops
509
+ output_buffer
510
+ end
511
+
417
512
  %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil].each do |op|
418
513
  register_op op, noop: true do |context, tensor, inputs|
419
514
  execute_func(op.to_s, tensor, inputs[0], context)
@@ -422,7 +517,7 @@ module TensorStream
422
517
 
423
518
  register_op :softmax do |_context, tensor, inputs|
424
519
  a = inputs[0]
425
- event_wait_list = [a.op].compact
520
+ event_wait_list = build_event_wait_list(inputs)
426
521
  dtype = tensor.data_type
427
522
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
428
523
 
@@ -438,7 +533,7 @@ module TensorStream
438
533
 
439
534
  register_op :log_softmax do |_context, tensor, inputs|
440
535
  a = inputs[0] # logits
441
- event_wait_list = [a.op].compact
536
+ event_wait_list = build_event_wait_list(inputs)
442
537
  dtype = tensor.data_type
443
538
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
444
539
 
@@ -452,28 +547,33 @@ module TensorStream
452
547
  output_buffer
453
548
  end
454
549
 
455
- register_op :softmax_cross_entropy_with_logits_v2 do |_context, tensor, inputs|
550
+ register_op :softmax_cross_entropy_with_logits_v2 do |context, tensor, inputs|
456
551
  a = inputs[0] # logits
457
552
  b = inputs[1] # labels
458
- event_wait_list = [a.op, b.op].compact
553
+ event_wait_list = build_event_wait_list(inputs)
459
554
  dtype = tensor.data_type
460
555
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
461
-
556
+ output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
557
+ rank = a.shape.size - 1
462
558
  m, n = a.shape
463
559
  work_group = [m]
464
560
  n = m if n.nil?
465
561
  cl_n = OpenCL::Int1.new(n || 1)
466
562
 
467
- event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
563
+ event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer,
564
+ output_buffer.cl_buffer, output_buffer_backprop.cl_buffer, event_wait_list: event_wait_list)
468
565
  output_buffer.op = event
469
- output_buffer
566
+ output_buffer_backprop.op = event
567
+
568
+ loss = reduction(context, tensor, output_buffer, rank, :sum)
569
+ OutputGroup.new([loss, output_buffer_backprop], [tensor.inputs[0].data_type, tensor.inputs[0].data_type])
470
570
  end
471
571
 
472
572
  register_op :softmax_cross_entropy_with_logits_v2_grad do |_context, tensor, inputs|
473
573
  a = inputs[0] # logits
474
574
  b = inputs[1] # labels
475
575
  c = inputs[2] # grads
476
- event_wait_list = [a.op, b.op, c.op].compact
576
+ event_wait_list = build_event_wait_list(inputs)
477
577
  dtype = tensor.data_type
478
578
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
479
579
 
@@ -490,7 +590,7 @@ module TensorStream
490
590
  register_op :softmax_grad do |_context, tensor, inputs|
491
591
  a, grad = inputs
492
592
 
493
- event_wait_list = [a.op].compact
593
+ event_wait_list = build_event_wait_list(inputs)
494
594
  dtype = tensor.data_type
495
595
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
496
596
 
@@ -508,7 +608,7 @@ module TensorStream
508
608
  name = tensor.options[:name]
509
609
 
510
610
  a.buffer.each do |input|
511
- raise "#{name} Invalid Argument" if input.nan? || input.infinite?
611
+ raise TensorStream::InvalidArgumentError, "#{name} Invalid Argument" if input.nan? || input.infinite?
512
612
  end
513
613
  a
514
614
  end
@@ -522,8 +622,8 @@ module TensorStream
522
622
  input_a = read_final_result(complete_eval(a, context))
523
623
  input_b = read_final_result(complete_eval(b, context))
524
624
  b_a, b_b = broadcast(input_a, input_b)
525
- [ wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
526
- wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
625
+ [wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
626
+ wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
527
627
  end
528
628
  end
529
629
 
@@ -557,8 +657,22 @@ module TensorStream
557
657
 
558
658
  register_op :transpose, buffer: true do |_context, tensor, inputs|
559
659
  t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
560
- transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
561
- convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
660
+
661
+ if inputs[0].shape.size == 2 && inputs[1].nil?
662
+ transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
663
+ res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
664
+ res
665
+ else
666
+ rank = inputs[0].shape.size
667
+ perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
668
+ new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
669
+ output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
670
+ transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
671
+
672
+ write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
673
+ output_buffer.op = write_op
674
+ output_buffer
675
+ end
562
676
  end
563
677
 
564
678
  register_op :index, noop: true do |context, tensor, inputs|
@@ -567,39 +681,36 @@ module TensorStream
567
681
 
568
682
  if a.is_a?(OutputGroup)
569
683
  a.outputs[index]
684
+ elsif a.is_a?(Array)
685
+ a[index]
570
686
  else
571
- if a.is_a?(Array)
572
- a[index]
573
- else
574
- new_shape = a.shape.dup
575
- new_shape.shift
576
- input_a = read_final_result(a)
577
- convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
578
- end
687
+ new_shape = a.shape.dup
688
+ new_shape.shift
689
+ input_a = read_final_result(a)
690
+ convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
579
691
  end
580
692
  end
581
693
 
582
694
  register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
583
695
  rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
584
- OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: "#{tensor.name}"), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")])
696
+ OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: tensor.name), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")], tensor.inputs.map(&:data_type))
585
697
  end
586
698
 
587
699
  register_op :shape do |_context, tensor, inputs|
588
700
  wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.data_type)
589
701
  end
590
702
 
591
- register_op :reshape, buffer: true do |_context, _tensor, inputs|
703
+ register_op :reshape, buffer: true do |_context, tensor, inputs|
592
704
  arr = inputs[0]
593
705
  new_shape = read_final_result(inputs[1])
594
706
 
595
- if new_shape.size.zero? && arr.buffer.size == 1
596
- arr.shape = new_shape
597
- arr
598
- else
599
- new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
600
- arr.shape = new_shape
601
- arr
602
- end
707
+ shape = if new_shape.size.zero? && arr.buffer.size == 1
708
+ new_shape
709
+ else
710
+ TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
711
+ end
712
+
713
+ convert_to_opencl(arr.buffer, shape, data_type: arr.data_type, name: tensor.name)
603
714
  end
604
715
 
605
716
  register_op :flow_group do |_context, _tensor, inputs|
@@ -618,6 +729,7 @@ module TensorStream
618
729
 
619
730
  register_op :prod, noop: true do |context, tensor, inputs|
620
731
  input_a = complete_eval(inputs[0], context)
732
+
621
733
  if input_a.buffer.empty?
622
734
  convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
623
735
  else
@@ -646,13 +758,11 @@ module TensorStream
646
758
  end
647
759
 
648
760
  def eval_operation(tensor, child_context)
649
-
650
761
  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
651
762
  return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
652
763
  return @context[cache_key] if @context.key?(cache_key)
653
- # puts tensor.name
764
+ # puts "opencl: #{tensor.name}"
654
765
  invoke(tensor, child_context).tap do |result|
655
- # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
656
766
  if tensor.breakpoint
657
767
  a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
658
768
  b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -676,9 +786,11 @@ module TensorStream
676
786
  @context[:_cache][cache_key] = result if tensor.is_const
677
787
  end
678
788
  rescue EvaluatorExcecutionException => e
679
- raise e
789
+ _opencl_queue.finish # dump queue
790
+ raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
680
791
  rescue TensorStreamError => e
681
- raise e
792
+ _opencl_queue.finish # dump queue
793
+ raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
682
794
  rescue StandardError => e
683
795
  _opencl_queue.finish # dump queue
684
796
  puts e.message
@@ -698,7 +810,7 @@ module TensorStream
698
810
  # File.write('/home/jedld/workspace/tensor_stream/samples/error.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
699
811
 
700
812
  # File.write('/Users/josephemmanueldayo/workspace/gradients.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
701
- raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true,1)} defined at #{tensor.source}"
813
+ raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
702
814
  end
703
815
 
704
816
  def eval_tensor(tensor, child_context)
@@ -724,8 +836,9 @@ module TensorStream
724
836
 
725
837
  if assign.buffer
726
838
  # buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
839
+ event_wait_list = build_event_wait_list([buffer, assign.buffer])
727
840
  assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
728
- _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: [buffer.op, assign.buffer.op])
841
+ _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: event_wait_list)
729
842
  else
730
843
  buffer.op
731
844
  end
@@ -745,7 +858,6 @@ module TensorStream
745
858
  dtype = tensor.data_type
746
859
  result_shape = TensorShape.infer_shape(a.shape, b.shape)
747
860
  return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]
748
-
749
861
  output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
750
862
  a, b, prog, switch_operands = select_program(a, b, op_name)
751
863
  m, n = result_shape
@@ -754,21 +866,26 @@ module TensorStream
754
866
  cl_n = OpenCL::Int1.new(n || 1)
755
867
  cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition
756
868
 
757
- event_wait_list = [a.op, b.op].compact # add dependency wait list
869
+ event_wait_list = build_event_wait_list([a, b]) # add dependency wait list
758
870
 
759
871
  method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
872
+ prog_name ||= op_name
760
873
  event = if prog == "#{op_name}_b"
761
- cl_m_b, cl_n_b = if b.shape.size == 2
762
- [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
763
- elsif b.shape.size == 1
764
- [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
765
- else
766
- raise "rank > 2 not supported!"
767
- end
768
- _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
769
- else
770
- _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
771
- end
874
+ cl_m_b, cl_n_b = if b.shape.size == 2
875
+ [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
876
+ elsif b.shape.size == 1
877
+ [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
878
+ else
879
+ raise "rank > 2 not supported!"
880
+ end
881
+ _cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype).
882
+ send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b,
883
+ cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
884
+ else
885
+ _cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype).
886
+ send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch,
887
+ a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
888
+ end
772
889
 
773
890
  output_buffer.op = event
774
891
  output_buffer
@@ -789,14 +906,14 @@ module TensorStream
789
906
  cl_m = OpenCL::Int1.new(m || 1)
790
907
  cl_n = OpenCL::Int1.new(n || 1)
791
908
 
792
- event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
909
+ event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
793
910
  output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
794
911
  output_buffer
795
912
  end
796
913
 
797
914
  def execute_func(op_name, tensor, a, child_context)
798
915
  a = _run(a, child_context)
799
- event_wait_list = [a.op].compact
916
+ event_wait_list = build_event_wait_list([a])
800
917
  dtype = tensor.data_type
801
918
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
802
919
 
@@ -814,7 +931,7 @@ module TensorStream
814
931
  return [a, b] if a.data_type == b.data_type
815
932
  m, n = b.shape
816
933
  work_group = [m || 1, n || 1]
817
- event_wait_list = [b.op].compact
934
+ event_wait_list = build_event_wait_list([b])
818
935
  buffer = _create_result_buffer(b.data_type, b.shape, name)
819
936
 
820
937
  cl_m = OpenCL::Int1.new(m || 1)
@@ -848,6 +965,11 @@ module TensorStream
848
965
  convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
849
966
  end
850
967
 
968
+ def get_cached_buffer(name, shape)
969
+ cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
970
+ @context[:_cache][cache_key]
971
+ end
972
+
851
973
  def convert_to_opencl(value, shape, data_type: nil, name: nil)
852
974
  value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)
853
975
 
@@ -863,6 +985,8 @@ module TensorStream
863
985
  allocate_narray_for_type(data_type, narray_size)
864
986
  end
865
987
 
988
+ return nil if buffer.nil?
989
+
866
990
  cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
867
991
 
868
992
  cl_buffer = unless value.flatten.empty?
@@ -908,18 +1032,20 @@ module TensorStream
908
1032
  NArray.sint(narray_size)
909
1033
  when :boolean
910
1034
  NArray.sint(narray_size)
1035
+ when :unknown
1036
+ nil
911
1037
  else
912
1038
  raise "unsupported type #{data_type}"
913
1039
  end
914
1040
  end
915
1041
 
916
1042
  def _create_result_buffer(data_type, shape, name)
917
- return OpenCLBuffer.new(data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
1043
+ return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
918
1044
  @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}:#{object_id}"] ||= begin
919
1045
  size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
920
1046
  buffer = allocate_narray_for_type(data_type, size)
921
1047
  cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
922
- OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
1048
+ OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
923
1049
  end
924
1050
  end
925
1051
 
@@ -969,7 +1095,7 @@ module TensorStream
969
1095
 
970
1096
  def reduction(child_context, tensor, a, b, func)
971
1097
  input = complete_eval(a, child_context)
972
- axis = read_final_result(complete_eval(b, child_context))
1098
+ axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b
973
1099
  if axis.nil?
974
1100
  red = input.buffer.send(func)
975
1101
  convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
@@ -1021,6 +1147,10 @@ module TensorStream
1021
1147
  shape.is_a?(Array) ? shape.size : 0
1022
1148
  end
1023
1149
 
1150
+ def build_event_wait_list(inputs)
1151
+ inputs.compact.map(&:op).flatten
1152
+ end
1153
+
1024
1154
  def resolve_placeholder(placeholder, _execution_context = {})
1025
1155
  return nil if placeholder.nil?
1026
1156