tensor_stream 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.rubocop.yml +6 -1
- data/CHANGELOG.md +10 -0
- data/README.md +35 -0
- data/lib/tensor_stream.rb +2 -2
- data/lib/tensor_stream/debugging/debugging.rb +2 -1
- data/lib/tensor_stream/dynamic_stitch.rb +23 -24
- data/lib/tensor_stream/evaluator/base_evaluator.rb +27 -18
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +16 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +24 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +6 -1
- data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +6 -6
- data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +237 -107
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +97 -7
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +230 -123
- data/lib/tensor_stream/exceptions.rb +1 -0
- data/lib/tensor_stream/graph_builder.rb +2 -3
- data/lib/tensor_stream/graph_deserializers/protobuf.rb +22 -23
- data/lib/tensor_stream/graph_serializers/graphml.rb +26 -29
- data/lib/tensor_stream/graph_serializers/pbtext.rb +22 -19
- data/lib/tensor_stream/helpers/string_helper.rb +4 -5
- data/lib/tensor_stream/math_gradients.rb +141 -77
- data/lib/tensor_stream/nn/nn_ops.rb +4 -6
- data/lib/tensor_stream/operation.rb +139 -120
- data/lib/tensor_stream/ops.rb +36 -3
- data/lib/tensor_stream/session.rb +7 -11
- data/lib/tensor_stream/tensor.rb +3 -3
- data/lib/tensor_stream/tensor_shape.rb +5 -0
- data/lib/tensor_stream/train/gradient_descent_optimizer.rb +4 -37
- data/lib/tensor_stream/train/momentum_optimizer.rb +48 -0
- data/lib/tensor_stream/train/optimizer.rb +129 -0
- data/lib/tensor_stream/train/saver.rb +0 -1
- data/lib/tensor_stream/train/slot_creator.rb +62 -0
- data/lib/tensor_stream/train/utils.rb +11 -12
- data/lib/tensor_stream/trainer.rb +3 -0
- data/lib/tensor_stream/utils.rb +18 -11
- data/lib/tensor_stream/variable.rb +19 -12
- data/lib/tensor_stream/variable_scope.rb +1 -1
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/iris.rb +2 -1
- data/samples/linear_regression.rb +3 -1
- data/samples/nearest_neighbor.rb +2 -0
- data/test_samples/neural_network_raw.py +101 -0
- data/test_samples/raw_neural_net_sample.rb +6 -4
- data/test_samples/test2.py +73 -27
- metadata +9 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
-
-  metadata.gz:
-  data.tar.gz:
+SHA1:
+  metadata.gz: f84c2b9852fcf4931c47c0130b67497a50a87b0f
+  data.tar.gz: 524e1105da4e06e3472cbcfa0e6f764ae4512d37
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 420e2675ab67d4c8462534bdf8c703671656f7852d984579e22ee57f1425dd5740fcb64a1e52363bf337cd7c691d87a75c76bd868b13c8a7f06d78e0eb00aa73
+  data.tar.gz: 24fe1022741883d46cdd5af51309da33d421d72874f0cc84bf2e0ed14a62602f1830c6060bd86e42359b7962b4a57727c9a48ce13d5950d5ba02f6a9cdfd719f
data/.rubocop.yml
CHANGED
@@ -6,6 +6,10 @@ AllCops:
     - tensor_stream.gemspec
     - Rakefile
 
+Naming/AccessorMethodName:
+  Exclude:
+    - lib/tensor_stream/utils.rb
+
 Style/StringLiterals:
   Enabled: false
 
@@ -81,4 +85,5 @@ Style/TrailingCommaInHashLiteral:
 
 Naming/UncommunicativeMethodParamName:
   Exclude:
-    - lib/tensor_stream/evaluator/ruby_evaluator.rb
+    - lib/tensor_stream/evaluator/ruby_evaluator.rb
+    - lib/tensor_stream/ops.rb
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.8.0] - 2018-08-29
+### Added
+- [TRAINING] Added new supported optimizer, MomentumOptimizer loosely based on tensorflow's implementation (with nesterov support)
+- [NEW OP] fill, stack, atan, cumprod, gather, invert_permutation, setdiff1d
+
+### Fixes
+- Fixed device delegator where it does not pick the correct evaluator to use in some cases
+- [GRADIENTS] Properly implement gradient computation for prod, tile, transpose
+- Fixed gradient computation for softmax_cross_entropy_with_logits_v2 (now based on tensorflow's implementation)
+
 ## [0.7.0] - 2018-08-08
 ### Added
 - [NEW OP] expand_dims, min, acos, asin, add_n
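For context, a minimal usage sketch of the MomentumOptimizer added in this release. This is an illustration only: it assumes the optimizer exposes the same `minimize`-style API as the existing GradientDescentOptimizer, and the positional learning-rate/momentum arguments plus the `use_nesterov` option are inferred from the apply_momentum OpenCL kernel further down in this diff, not from documented examples.

```ruby
require 'tensor_stream'

ts = TensorStream

x = ts.placeholder(:float32)
y = ts.placeholder(:float32)
w = ts.variable(0.1, dtype: :float32, name: 'weight')
b = ts.variable(0.0, dtype: :float32, name: 'bias')

pred = x * w + b
cost = ts.reduce_mean(ts.square(pred - y))

# Assumed constructor signature: (learning_rate, momentum, use_nesterov:)
train_op = TensorStream::Train::MomentumOptimizer
           .new(0.01, 0.9, use_nesterov: true)
           .minimize(cost)

ts.session do |sess|
  sess.run(ts.global_variables_initializer)
  100.times do
    sess.run(train_op, feed_dict: { x => [1.0, 2.0, 3.0], y => [2.0, 4.0, 6.0] })
  end
end
```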
data/README.md
CHANGED
@@ -324,6 +324,41 @@ result = a + b
 File.write("model.pbtext", result.graph.as_graph_def)
 ```
 
+## Performance notes
+
+Comparative performance with respect to other ruby libraries have not yet been performed. However it is
+notable that TruffleRuby and ruby-2.6.0-preview2 with the --jit flag performs considerably better with respect
+to previous versions of ruby(< 2.6)
+
+Benchmarks running samples/linear_regression.rb on an Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz
+
+ruby 2.4
+
+```
+$ ruby -v
+ruby 2.4.0p0 (2016-12-24 revision 57164) [x86_64-linux]
+$ ruby samples/linear_regression.rb
+495 seconds 1000 epochs
+```
+
+ruby 2.6.0-preview2
+
+```
+$ ruby -v
+ruby 2.6.0preview2 (2018-05-31 trunk 63539) [x86_64-linux]
+$ ruby --jit samples/linear_regression.rb
+394 seconds 10000 epochs
+```
+
+truffleruby
+```
+$ ruby -v
+truffleruby 1.0.0-rc5, like ruby 2.4.4, GraalVM CE Native [x86_64-linux]
+219 seconds 10000 epochs
+```
+
+For training large networks that works on images, the opencl evaluator is the only way to go.
+
 ## Roadmap
 
 - Docs
data/lib/tensor_stream.rb
CHANGED
@@ -20,8 +20,6 @@ require 'tensor_stream/operation'
 require 'tensor_stream/placeholder'
 require 'tensor_stream/control_flow'
 require 'tensor_stream/dynamic_stitch'
-require 'tensor_stream/train/utils'
-require 'tensor_stream/trainer'
 require 'tensor_stream/nn/nn_ops'
 require 'tensor_stream/evaluator/evaluator'
 require 'tensor_stream/graph_serializers/serializer'
@@ -31,6 +29,8 @@ require 'tensor_stream/graph_serializers/graphml'
 require 'tensor_stream/math_gradients'
 require "tensor_stream/debugging/debugging"
 require 'tensor_stream/utils'
+require 'tensor_stream/train/utils'
+require 'tensor_stream/trainer'
 
 # require 'tensor_stream/libraries/layers'
 require 'tensor_stream/monkey_patches/integer'
data/lib/tensor_stream/debugging/debugging.rb
CHANGED
@@ -9,8 +9,9 @@ module TensorStream
       nodes_to_process.each do |node|
         node.inputs = node.inputs.collect do |input|
           next if input.nil?
+          next input if input.is_a?(Variable)
 
-          if TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
+          if input.is_a?(Tensor) && TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
             TensorStream.check_numerics(input, "#{node.name}/#{input.name}", name: "check/#{node.name}/#{input.name}" )
           else
             input
data/lib/tensor_stream/dynamic_stitch.rb
CHANGED
@@ -1,28 +1,27 @@
 module TensorStream
-
-
-
-
-  def initialize(flow_type, inputs, ops = nil, options = {})
-    setup_initial_state(options)
-
-    @operation = :"flow_#{flow_type}"
-    @inputs = inputs
+  # Defines a TensorStream controlflow op
+  class DynamicStitch < Operation
+    attr_accessor :ops
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def initialize(flow_type, inputs, ops = nil, options = {})
+      setup_initial_state(options)
+
+      @operation = :"flow_#{flow_type}"
+      @inputs = inputs
+
+      @data_type = Tensor.detect_type(inputs[1])
+      @name = [@graph.get_name_scope, options[:name] || set_name].compact.join('/')
+      @ops = ops
+      @shape = TensorShape.new([inputs.size])
+      @graph.add_node(self)
+    end
+
+    def set_data_type(_passed_data_type)
+      :unknown
+    end
+
+    def run
+      eval
     end
   end
-
+end
data/lib/tensor_stream/evaluator/base_evaluator.rb
CHANGED
@@ -2,13 +2,14 @@ module TensorStream
   # Evaluator base module
   module Evaluator
     class OutputGroup
-      attr_accessor :outputs
-      def initialize(outputs = [])
+      attr_accessor :outputs, :data_types
+      def initialize(outputs = [], data_types = [])
         @outputs = outputs
+        @data_types = data_types
       end
     end
 
-    class UnsupportedOp <
+    class UnsupportedOp < RuntimeError
       def initialize(tensor)
         @tensor = tensor
       end
@@ -111,22 +112,13 @@ module TensorStream
 
       resolved_inputs = tensor.inputs.map do |i|
         next if i.nil?
+        next i if op_options[:noop]
 
         if i.is_a?(Array)
-          next i.collect { |sub_item| sub_item.is_a?(Tensor) ?
+          next i.collect { |sub_item| sub_item.is_a?(Tensor) ? global_eval(tensor, sub_item, execution_context) : sub_item }
         end
 
-
-        cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
-        next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
-
-        result = @session.delegate_to_evaluator(i, @context, execution_context)
-        convert_from_buffer(i, result).tap do |buffer|
-          @context[:_cache][cache_key] = buffer if i.is_const
-        end
-        else
-          prepare_input(i, execution_context, op_options)
-        end
+        global_eval(tensor, i, execution_context, op_options)
       end
 
       instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
@@ -134,6 +126,23 @@ module TensorStream
 
     protected
 
+    def global_eval(tensor, input, execution_context, op_options = {})
+      return nil unless input
+      return input unless input.is_a?(Tensor)
+
+      if object_id != @context[:_cache][:placement][input.name][1].object_id # tensor is on another device or evaluator
+        cache_key = "#{tensor.graph.object_id}_#{input.name}:#{object_id}"
+        return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
+
+        result = @session.delegate_to_evaluator(input, @context, execution_context)
+        convert_from_buffer(input, result).tap do |buffer|
+          @context[:_cache][cache_key] = buffer if input.is_const
+        end
+      else
+        prepare_input(input, execution_context, op_options)
+      end
+    end
+
     def get_broadcast_gradient_args(input_a, input_b)
       return [[], []] if input_a == input_b
 
@@ -153,16 +162,16 @@ module TensorStream
         end
       end
 
-
+      [input_a_args.reverse, input_b_args.reverse]
     end
 
     ##
     # converts from a ruby Buffer object to the evaluator's native buffer format
-    def convert_from_buffer(
+    def convert_from_buffer(_tensor, _result)
       raise "need implementation"
     end
 
-    def prepare_input(
+    def prepare_input(_tensor, _context, _options = {})
       raise "need implementation"
     end
   end
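An aside on the hunk above (semantics assumed from TensorFlow's BroadcastGradientArgs, which this helper appears to mirror; not spelled out in the diff itself): given the shapes of two broadcast operands, `get_broadcast_gradient_args` returns the axes along which each operand was implicitly replicated, so a gradient flowing into a broadcast binary op can be summed back down to each operand's own shape.

```ruby
# Hypothetical illustration of the assumed semantics:
# for c = a + b with a.shape = [2, 3] and b.shape = [3],
# b is replicated along axis 0 during broadcasting.
rx, ry = get_broadcast_gradient_args([2, 3], [3])
rx # => []   -- a was not broadcast, no reduction needed
ry # => [0]  -- sum the incoming gradient over axis 0 to recover b's shape
```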
data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl
ADDED
@@ -0,0 +1,16 @@
+% c_dtype = dtype_to_c_type(dtype)
+// same dimension add floating point op
+__kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
+                                          __global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
+  // Get the index of the current element to be processed
+  const int globalRow = get_global_id(0); // Row ID of C (0..M)
+  const int globalCol = get_global_id(1); // Col ID of C (0..N)
+  const int index = globalRow * N + globalCol;
+  <%= c_dtype %> acc_m = acc[index];
+  acc[index] = acc_m * momentum[0] + grad[index];
+  <% if nesterov %>
+    output[index] -= grad[index] * learning_rate[0] + acc_m * momentum[0] * learning_rate[0];
+  <% else %>
+    output[index] -= acc_m * learning_rate[0];
+  <% end %>
+}
data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl
ADDED
@@ -0,0 +1,24 @@
+% ctype = dtype_to_c_type(data_type)
+
+__kernel void pack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
+  // Get the index of the current element to be processed
+  const int globalCol = get_global_id(0); // Col ID of C (0..N)
+
+  int start = index * <%= divisors[0] %>;
+  int ptr = start + globalCol;
+  int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
+
+  // compute effective coordinates
+  <% divisors.each_with_index do |div, index| %>
+  index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
+
+  // Apply axis translation if needed
+  <% if axis > 0 %>
+  int first = index_map[0];
+  <% axis.times do |i| %>
+  index_map[<%= i %>] = index_map[<%= (i + 1) %>];<% end %>
+  index_map[<%= axis %>] = first;
+  <% end%>
+
+  C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
+}
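To make the ERB parameters concrete: `divisors` are the row-major strides of the stacked shape before the axis rotation (used to decode a flat element index into coordinates), while `multipliers` are the strides of the final output shape (used to re-encode those coordinates into an output offset). A worked example, reproducing the stride arithmetic that the `stack` op in opencl_evaluator.rb below feeds into this kernel, for four `[2, 3]` tensors stacked along axis 1:

```ruby
new_shape = [4, 2, 3] # [number of inputs, *input shape], before rotation
divisors = new_shape.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
divisors # => [6, 3, 1]

rotated_shape = [2, 4, 3] # the stack dimension moved from axis 0 to axis 1
multipliers = rotated_shape.drop(1).reverse.inject([1]) { |a, s| a << s * a.last }.reverse
multipliers # => [12, 3, 1]
```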
data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl
CHANGED
@@ -1,9 +1,10 @@
+
 // First naive implementation
 % c_dtype = dtype_to_c_type(dtype)
 __kernel void softmax_cross_<%= dtype %>(const int N,
                                          const __global <%= c_dtype %>* A,
                                          const __global <%= c_dtype %>* L,
-                                         __global <%= c_dtype %>* C) {
+                                         __global <%= c_dtype %>* C, __global <%= c_dtype %>* P) {
 
   // Get the index of the current element to be processed
   const int globalRow = get_global_id(0); // Row ID of C (0..M)
@@ -24,4 +25,8 @@ __kernel void softmax_cross_<%= dtype %>(const int N,
   for (int k=0; k < N; k++) {
     C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
   }
+
+  for (int k=0; k < N; k++) {
+    P[globalRow*N + k] = (exp(A[globalRow*N + k] - max) / acc) - L[globalRow*N + k];
+  }
 }
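Spelled out (read directly off the kernel, with $a$ the logits row, $l$ the labels row, $m = \max_k a_k$ and $\mathrm{acc} = \sum_k e^{a_k - m}$): the existing output $C$ holds the per-element cross-entropy terms, and the new output $P$ holds the gradient of that loss with respect to the logits, computed in the same pass:

```latex
C_i = \bigl(\log \mathrm{acc} - (a_i - m)\bigr)\, l_i = -\,l_i \log \mathrm{softmax}(a)_i
P_i = \frac{e^{a_i - m}}{\mathrm{acc}} - l_i = \mathrm{softmax}(a)_i - l_i
```

This is what backs the changelog's softmax_cross_entropy_with_logits_v2 gradient fix: the evaluator below returns $P$ as the second member of an OutputGroup alongside the reduced loss.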
data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb
CHANGED
@@ -16,10 +16,6 @@ module TensorStream
 
     def to_ruby
       return [] if buffer.empty?
-      if shape.empty?
-        return buffer[0] != 0 if data_type == :boolean
-        return buffer[0]
-      end
 
       if dirty
         op.command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
@@ -27,9 +23,13 @@ module TensorStream
         self.dirty = false
       end
 
+      if shape.empty?
+        return buffer[0] != 0 if data_type == :boolean
+        return buffer[0]
+      end
+
       result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
-
-      result
+      data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
     end
   end
 end
data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb
CHANGED
@@ -30,6 +30,7 @@ module TensorStream
   ## PURE ruby evaluator used for testing and development
   class OpenclEvaluator < BaseEvaluator
     attr_accessor :retain
+    attr_reader :opencl_device
 
     include TensorStream::OpHelper
     include TensorStream::ArrayOpsHelper
@@ -51,20 +52,20 @@ module TensorStream
 
     def self.fetch_device(query = [])
       devices = query_devices_with_score
-      platform_devices = devices.select { |d| d[0].platform.to_s.
+      platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
       opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
     end
 
     def self.opencl_to_device(d)
       device = d[0]
       index = d[3]
-      platform_name = device.platform.name.
+      platform_name = device.platform.name.tr(' ', '_').downcase
       uri = [platform_name, index].join(':')
 
       device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
 
-      OpenclDevice.new(uri, device_type, self).tap do |
-
+      OpenclDevice.new(uri, device_type, self).tap do |devide|
+        devide.native_device = device
       end
     end
 
@@ -96,8 +97,14 @@ module TensorStream
       end
     end
 
+    # buffer comes from non-opencl evaluator
     def convert_from_buffer(tensor, result)
-
+      if result.buffer.is_a?(TensorStream::Evaluator::OutputGroup)
+        converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map { |output, data_type| convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name) }
+        TensorStream::Evaluator::OutputGroup.new(converted_outputs, result.buffer.data_types)
+      else
+        convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
+      end
     end
 
     def complete_eval(tensor, context)
@@ -106,7 +113,7 @@ module TensorStream
       if buffer.is_a?(Array)
         buffer = buffer.collect do |b|
           next b if b.buffer.size.zero?
-          _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: [b
+          _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
           b
         end
       else
@@ -114,14 +121,30 @@ module TensorStream
         return buffer if buffer.nil?
         return [] if buffer.buffer.nil?
         return buffer if buffer.buffer.size.zero?
-        _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer
+        _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
       end
       _opencl_queue.finish
       buffer
     end
 
-    def
-
+    def self.query_devices_with_score
+      OpenCL.platforms.flat_map do |p|
+        p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
+          score = 0
+          if d.type.to_s == 'CPU'
+            score += 1
+          elsif d.type.to_s == 'GPU'
+            score += 4
+          end
+
+          score += 1000 if d.platform.name == 'NVIDIA CUDA'
+
+          score += d.max_compute_units
+          score += d.max_clock_frequency
+
+          [d, score, p.name, index]
+        end
+      end
    end
 
    protected
@@ -152,31 +175,9 @@ module TensorStream
      @opencl_context = OpenCL.create_context(opencl_device)
    end
 
-    def self.query_devices_with_score
-      OpenCL.platforms.flat_map do |p|
-
-        p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
-          score = 0
-          if d.type.to_s == 'CPU'
-            score += 1
-          elsif d.type.to_s == 'GPU'
-            score += 4
-          end
-
-          if d.platform.name == 'NVIDIA CUDA'
-            score += 1000
-          end
-
-          score += d.max_compute_units
-          score += d.max_clock_frequency
-
-          [d, score, p.name, index]
-        end
-      end
-    end
-
    def create_command_queue
      supported_proprties = opencl_device.queue_properties.names
+
      properties = []
      properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
      properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
@@ -196,7 +197,7 @@ module TensorStream
    end
 
    def _cl_program(kernel, args = {})
-      suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
+      suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
      @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
        filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
        raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
@@ -211,6 +212,13 @@ module TensorStream
      end
    end
 
+    def escape_arg_content(value)
+      return value.tr(' ','_') if value.is_a?(String)
+      return value.join('-') if value.is_a?(Array)
+
+      value
+    end
+
    def _run(tensor, execution_context)
      return tensor if tensor.is_a?(OpenCLBuffer)
      return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array) && !tensor.size.empty? && tensor[0].is_a?(Tensor)
@@ -236,7 +244,7 @@ module TensorStream
      res
    end
 
-    def eval_variable(tensor,
+    def eval_variable(tensor, _child_context)
      raise "variable #{tensor.name} not initalized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
      tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
      tensor.buffer
@@ -259,7 +267,10 @@ module TensorStream
      end
    end
 
-    register_op :identity do |
+    register_op :identity do |context, tensor, inputs|
+      if tensor.inputs.size > 1
+        tensor.inputs[1..inputs.size].each { |input| complete_eval(input, context) }
+      end
      inputs[0]
    end
 
@@ -277,18 +288,19 @@ module TensorStream
      assign_var(tensor, value, context)
    end
 
+    register_op :variable, noop: true do |context, tensor, inputs|
+      variable = tensor.inputs[0]
+      raise "variable #{tensor.name} not initalized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
+      variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
+      variable.buffer
+    end
+
    # Fast in place multiply subtract assign
    register_op :apply_gradient_descent do |_context, tensor, inputs|
      _target_var, learning_rate, delta = inputs
 
      assign = tensor.inputs[0] || tensor
 
-      unless assign.buffer
-        value = read_final_result(buffer)
-        assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
-        assign.value = value
-      end
-
      assign.buffer.dirty = true # force buffer copy when variable is read externally
      output_buffer = assign.buffer
 
@@ -297,13 +309,39 @@ module TensorStream
      cl_m = OpenCL::Int1.new(m || 1)
      cl_n = OpenCL::Int1.new(n || 1)
 
-      event_wait_list = [assign.buffer
+      event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
      method_call = :"apply_gradient_#{output_buffer.data_type}"
      event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
      output_buffer.op = event
      output_buffer
    end
 
+    # Fast in place multiply subtract assign
+    register_op :apply_momentum do |_context, tensor, inputs|
+      target_var, momentum_var, learning_rate, grad, momentum = inputs
+
+      assign = tensor.inputs[0] || tensor
+      assign_acc = tensor.inputs[1]
+      assign.buffer.dirty = true # force buffer copy when variable is read externally
+      assign_acc.buffer.dirty = true # force buffer copy when variable is read externally
+
+      output_buffer = assign.buffer
+
+      m, n = output_buffer.shape
+      work_group = [m || 1, n || 1]
+      cl_m = OpenCL::Int1.new(m || 1)
+      cl_n = OpenCL::Int1.new(n || 1)
+
+      event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
+      method_call = :"apply_momentum_#{output_buffer.data_type}"
+      event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
+              send(method_call, _opencl_queue, work_group, cl_m, cl_n, grad.cl_buffer,
+                   learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
+                   assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
+      output_buffer.op = event
+      output_buffer
+    end
+
    %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
      register_op op, noop: true do |context, tensor, inputs|
        execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
@@ -332,7 +370,7 @@ module TensorStream
      a = inputs_queue.pop
      until inputs_queue.empty?
        b = inputs_queue.pop
-        event_wait_list = [a
+        event_wait_list = build_event_wait_list([a, b])
        method_call = :"add_#{a.data_type}_#{b.data_type}"
        event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
        a = output_buffer
@@ -353,6 +391,23 @@ module TensorStream
      convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
    end
 
+    register_op :fill, buffer: true do |_context, tensor, inputs|
+      shape = inputs[0]
+      value = inputs[1]
+
+      narray_size = shape.buffer.to_a.reduce(:*) || 1
+      cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
+
+      buffer = if cl_buffer
+                 cl_buffer.buffer
+               else
+                 allocate_narray_for_type(tensor.data_type, narray_size)
+               end
+
+      buffer.fill!(value.buffer[0])
+      convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
+    end
+
    register_op :floor_div, noop: true do |context, tensor, inputs|
      if fp_type?(tensor.data_type)
        execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
@@ -374,8 +429,15 @@ module TensorStream
      v = b.shape[0]
      k = a.shape[1]
 
-
-
+      if tensor.options[:transpose_a]
+        m = a.shape[1]
+        k = a.shape[0]
+      end
+
+      if tensor.options[:transpose_b]
+        n = b.shape[0]
+        v = b.shape[1]
+      end
 
      result_shape = [m, n]
 
@@ -393,8 +455,8 @@ module TensorStream
 
      transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
      transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
-
-      output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
+      event_wait_list = build_event_wait_list(inputs)
+      output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
      output_buffer
    end
 
@@ -406,14 +468,47 @@ module TensorStream
        cl_m = OpenCL::Int1.new(m || 1)
        cl_n = OpenCL::Int1.new(n || 1)
        work_group = [m || 1, n || 1]
-
-        buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
+        event_wait_list = build_event_wait_list(inputs)
+        buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
        buffer
      else
        a
      end
    end
 
+    register_op :stack do |_context, tensor, inputs|
+      axis = tensor.options[:axis] || 0
+      shape = inputs[0].shape
+      rank = shape.size + 1
+      elem_size = shape.empty? ? 1 : shape.reduce(:*)
+
+      new_shape = [inputs.size]
+      shape.inject(new_shape) { |ns, s| ns << s }
+
+      divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
+        a << s * a.last
+      end.reverse
+
+      axis = rank + axis if axis < 0
+      rotated_shape = Array.new(axis + 1) { new_shape.shift }
+      new_shape = rotated_shape.rotate! + new_shape
+
+      output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
+      multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
+        a << s * a.last
+      end.reverse
+
+      cl_n = OpenCL::Int1.new(elem_size)
+      work_group = [elem_size]
+      event_wait_list = build_event_wait_list(inputs)
+      ops = inputs.each_with_index.map do |input, index|
+        cl_index = OpenCL::Int1.new(index)
+        _cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+      end
+      output_buffer.op = ops
+      output_buffer
+    end
+
    %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil].each do |op|
      register_op op, noop: true do |context, tensor, inputs|
        execute_func(op.to_s, tensor, inputs[0], context)
@@ -422,7 +517,7 @@ module TensorStream
 
    register_op :softmax do |_context, tensor, inputs|
      a = inputs[0]
-      event_wait_list =
+      event_wait_list = build_event_wait_list(inputs)
      dtype = tensor.data_type
      output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -438,7 +533,7 @@ module TensorStream
 
    register_op :log_softmax do |_context, tensor, inputs|
      a = inputs[0] # logits
-      event_wait_list =
+      event_wait_list = build_event_wait_list(inputs)
      dtype = tensor.data_type
      output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -452,28 +547,33 @@ module TensorStream
      output_buffer
    end
 
-    register_op :softmax_cross_entropy_with_logits_v2 do |
+    register_op :softmax_cross_entropy_with_logits_v2 do |context, tensor, inputs|
      a = inputs[0] # logits
      b = inputs[1] # labels
-      event_wait_list =
+      event_wait_list = build_event_wait_list(inputs)
      dtype = tensor.data_type
      output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
-
+      output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
+      rank = a.shape.size - 1
      m, n = a.shape
      work_group = [m]
      n = m if n.nil?
      cl_n = OpenCL::Int1.new(n || 1)
 
-      event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer,
+      event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer,
+                          output_buffer.cl_buffer, output_buffer_backprop.cl_buffer, event_wait_list: event_wait_list)
      output_buffer.op = event
-
+      output_buffer_backprop.op = event
+
+      loss = reduction(context, tensor, output_buffer, rank, :sum)
+      OutputGroup.new([loss, output_buffer_backprop], [tensor.inputs[0].data_type, tensor.inputs[0].data_type])
    end
 
    register_op :softmax_cross_entropy_with_logits_v2_grad do |_context, tensor, inputs|
      a = inputs[0] # logits
      b = inputs[1] # labels
      c = inputs[2] # grads
-      event_wait_list =
+      event_wait_list = build_event_wait_list(inputs)
      dtype = tensor.data_type
      output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -490,7 +590,7 @@ module TensorStream
    register_op :softmax_grad do |_context, tensor, inputs|
      a, grad = inputs
 
-      event_wait_list =
+      event_wait_list = build_event_wait_list(inputs)
      dtype = tensor.data_type
      output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -508,7 +608,7 @@ module TensorStream
      name = tensor.options[:name]
 
      a.buffer.each do |input|
-        raise "#{name} Invalid Argument" if input.nan? || input.infinite?
+        raise TensorStream::InvalidArgumentError, "#{name} Invalid Argument" if input.nan? || input.infinite?
      end
      a
    end
@@ -522,8 +622,8 @@ module TensorStream
      input_a = read_final_result(complete_eval(a, context))
      input_b = read_final_result(complete_eval(b, context))
      b_a, b_b = broadcast(input_a, input_b)
-      [
-
+      [wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
+       wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
    end
  end
 
@@ -557,8 +657,22 @@ module TensorStream
 
    register_op :transpose, buffer: true do |_context, tensor, inputs|
      t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
-
-
+
+      if inputs[0].shape.size == 2 && inputs[1].nil?
+        transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
+        res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
+        res
+      else
+        rank = inputs[0].shape.size
+        perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
+        new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
+        output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
+        transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
+
+        write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
+        output_buffer.op = write_op
+        output_buffer
+      end
    end
 
    register_op :index, noop: true do |context, tensor, inputs|
@@ -567,39 +681,36 @@ module TensorStream
 
      if a.is_a?(OutputGroup)
        a.outputs[index]
+      elsif a.is_a?(Array)
+        a[index]
      else
-
-
-
-
-        new_shape.shift
-        input_a = read_final_result(a)
-        convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
-      end
+        new_shape = a.shape.dup
+        new_shape.shift
+        input_a = read_final_result(a)
+        convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
      end
    end
 
    register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
      rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
-      OutputGroup.new([wrap_opencl(rx, data_type: :int32, name:
+      OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: tensor.name), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")], tensor.inputs.map(&:data_type))
    end
 
    register_op :shape do |_context, tensor, inputs|
      wrap_opencl(inputs[0].shape, name: tensor.name, data_type: tensor.data_type)
    end
 
-    register_op :reshape, buffer: true do |_context,
+    register_op :reshape, buffer: true do |_context, tensor, inputs|
      arr = inputs[0]
      new_shape = read_final_result(inputs[1])
 
-      if new_shape.size.zero? && arr.buffer.size == 1
-
-
-
-
-
-
-      end
+      shape = if new_shape.size.zero? && arr.buffer.size == 1
+                new_shape
+              else
+                TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
+              end
+
+      convert_to_opencl(arr.buffer, shape, data_type: arr.data_type, name: tensor.name)
    end
 
    register_op :flow_group do |_context, _tensor, inputs|
@@ -618,6 +729,7 @@ module TensorStream
 
    register_op :prod, noop: true do |context, tensor, inputs|
      input_a = complete_eval(inputs[0], context)
+
      if input_a.buffer.empty?
        convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
      else
@@ -646,13 +758,11 @@ module TensorStream
    end
 
    def eval_operation(tensor, child_context)
-
      cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
      return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
      return @context[cache_key] if @context.key?(cache_key)
-
+      # puts "opencl: #{tensor.name}"
      invoke(tensor, child_context).tap do |result|
-        # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
        if tensor.breakpoint
          a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
          b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
@@ -676,9 +786,11 @@ module TensorStream
        @context[:_cache][cache_key] = result if tensor.is_const
      end
    rescue EvaluatorExcecutionException => e
-
+      _opencl_queue.finish # dump queue
+      raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
    rescue TensorStreamError => e
-
+      _opencl_queue.finish # dump queue
+      raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
    rescue StandardError => e
      _opencl_queue.finish # dump queue
      puts e.message
@@ -698,7 +810,7 @@ module TensorStream
      # File.write('/home/jedld/workspace/tensor_stream/samples/error.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
 
      # File.write('/Users/josephemmanueldayo/workspace/gradients.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
-      raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true,1)} defined at #{tensor.source}"
+      raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
    end
 
    def eval_tensor(tensor, child_context)
@@ -724,8 +836,9 @@ module TensorStream
 
      if assign.buffer
        # buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
+        event_wait_list = build_event_wait_list([buffer, assign.buffer])
        assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
-                             _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list:
+                             _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: event_wait_list)
                           else
                             buffer.op
                           end
@@ -745,7 +858,6 @@ module TensorStream
      dtype = tensor.data_type
      result_shape = TensorShape.infer_shape(a.shape, b.shape)
      return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]
-
      output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
      a, b, prog, switch_operands = select_program(a, b, op_name)
      m, n = result_shape
@@ -754,21 +866,26 @@ module TensorStream
      cl_n = OpenCL::Int1.new(n || 1)
      cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition
 
-      event_wait_list = [a
+      event_wait_list = build_event_wait_list([a, b]) # add dependency wait list
 
      method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
+      prog_name ||= op_name
      event = if prog == "#{op_name}_b"
-
-
-
-
-
-
-
-
-
-
-
+                cl_m_b, cl_n_b = if b.shape.size == 2
+                                   [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
+                                 elsif b.shape.size == 1
+                                   [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
+                                 else
+                                   raise "rank > 2 not supported!"
+                                 end
+                _cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype).
+                  send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b,
+                       cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+              else
+                _cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype).
+                  send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch,
+                       a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+              end
 
      output_buffer.op = event
      output_buffer
@@ -789,14 +906,14 @@ module TensorStream
      cl_m = OpenCL::Int1.new(m || 1)
      cl_n = OpenCL::Int1.new(n || 1)
 
-      event_wait_list = [a
+      event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
      output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
      output_buffer
    end
 
    def execute_func(op_name, tensor, a, child_context)
      a = _run(a, child_context)
-      event_wait_list = [a
+      event_wait_list = build_event_wait_list([a])
      dtype = tensor.data_type
      output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
@@ -814,7 +931,7 @@ module TensorStream
      return [a, b] if a.data_type == b.data_type
      m, n = b.shape
      work_group = [m || 1, n || 1]
-      event_wait_list = [b
+      event_wait_list = build_event_wait_list([b])
      buffer = _create_result_buffer(b.data_type, b.shape, name)
 
      cl_m = OpenCL::Int1.new(m || 1)
@@ -848,6 +965,11 @@ module TensorStream
      convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
    end
 
+    def get_cached_buffer(name, shape)
+      cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
+      @context[:_cache][cache_key]
+    end
+
    def convert_to_opencl(value, shape, data_type: nil, name: nil)
      value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)
 
@@ -863,6 +985,8 @@ module TensorStream
        allocate_narray_for_type(data_type, narray_size)
      end
 
+      return nil if buffer.nil?
+
      cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
 
      cl_buffer = unless value.flatten.empty?
@@ -908,18 +1032,20 @@ module TensorStream
        NArray.sint(narray_size)
      when :boolean
        NArray.sint(narray_size)
+      when :unknown
+        nil
      else
        raise "unsupported type #{data_type}"
      end
    end
 
    def _create_result_buffer(data_type, shape, name)
-      return OpenCLBuffer.new(data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
+      return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
      @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}:#{object_id}"] ||= begin
        size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
        buffer = allocate_narray_for_type(data_type, size)
        cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
-        OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+        OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
      end
    end
 
@@ -969,7 +1095,7 @@ module TensorStream
 
    def reduction(child_context, tensor, a, b, func)
      input = complete_eval(a, child_context)
-      axis = read_final_result(complete_eval(b, child_context))
+      axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b
      if axis.nil?
        red = input.buffer.send(func)
        convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
@@ -1021,6 +1147,10 @@ module TensorStream
      shape.is_a?(Array) ? shape.size : 0
    end
 
+    def build_event_wait_list(inputs)
+      inputs.compact.map(&:op).flatten
+    end
+
    def resolve_placeholder(placeholder, _execution_context = {})
      return nil if placeholder.nil?
 